Annotation of src/usr.bin/awk/b.c, Revision 1.25
1.25 ! millert 1: /* $OpenBSD: b.c,v 1.24 2020/06/10 21:02:19 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.5 kstailey 36: #include "ytab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.10 millert 65: static uschar *rlxstr;
66: static uschar *prestr; /* current position in current re */
67: static uschar *lastre; /* origin of last re */
1.23 millert 68: static uschar *lastatom; /* origin of last Atom */
69: static uschar *starttok;
70: static uschar *basestr; /* starts with original, replaced during
71: repetition processing */
72: static uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
77: char *patbeg;
78: int patlen;
79:
80: #define NFA 20 /* cache this many dynamic fa's */
81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.11 millert 84: fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
1.1 tholo 85: {
86: int i, use, nuse;
87: fa *pfa;
1.6 millert 88: static int now = 1;
1.1 tholo 89:
1.25 ! millert 90: if (setvec == NULL) { /* first time through any RE */
1.1 tholo 91: maxsetvec = MAXLIN;
1.14 deraadt 92: setvec = (int *) calloc(maxsetvec, sizeof(int));
93: tmpset = (int *) calloc(maxsetvec, sizeof(int));
1.25 ! millert 94: if (setvec == NULL || tmpset == NULL)
1.1 tholo 95: overflo("out of space initializing makedfa");
96: }
97:
98: if (compile_time) /* a constant for sure */
99: return mkdfa(s, anchor);
100: for (i = 0; i < nfatab; i++) /* is it there already? */
101: if (fatab[i]->anchor == anchor
1.11 millert 102: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 103: fatab[i]->use = now++;
1.1 tholo 104: return fatab[i];
1.10 millert 105: }
1.1 tholo 106: pfa = mkdfa(s, anchor);
107: if (nfatab < NFA) { /* room for another */
108: fatab[nfatab] = pfa;
1.6 millert 109: fatab[nfatab]->use = now++;
1.1 tholo 110: nfatab++;
111: return pfa;
112: }
113: use = fatab[0]->use; /* replace least-recently used */
114: nuse = 0;
115: for (i = 1; i < nfatab; i++)
116: if (fatab[i]->use < use) {
117: use = fatab[i]->use;
118: nuse = i;
119: }
120: freefa(fatab[nuse]);
121: fatab[nuse] = pfa;
1.6 millert 122: pfa->use = now++;
1.1 tholo 123: return pfa;
124: }
125:
1.11 millert 126: fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
1.1 tholo 127: /* anchor = 1 for anchored matches, else 0 */
128: {
129: Node *p, *p1;
130: fa *f;
131:
1.23 millert 132: firstbasestr = (uschar *) s;
133: basestr = firstbasestr;
1.1 tholo 134: p = reparse(s);
135: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
136: /* put ALL STAR in front of reg. exp. */
137: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
138: /* put FINAL after reg. exp. */
139:
140: poscnt = 0;
141: penter(p1); /* enter parent pointers and leaf indices */
142: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
143: overflo("out of space for fa");
144: f->accept = poscnt-1; /* penter has computed number of positions in re */
145: cfoll(f, p1); /* set up follow sets */
146: freetr(p1);
1.13 pvalchev 147: if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
1.1 tholo 148: overflo("out of space in makedfa");
1.4 millert 149: if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
1.1 tholo 150: overflo("out of space in makedfa");
151: *f->posns[1] = 0;
152: f->initstat = makeinit(f, anchor);
153: f->anchor = anchor;
1.10 millert 154: f->restr = (uschar *) tostring(s);
1.23 millert 155: if (firstbasestr != basestr) {
156: if (basestr)
157: xfree(basestr);
158: }
1.1 tholo 159: return f;
160: }
161:
162: int makeinit(fa *f, int anchor)
163: {
164: int i, k;
165:
166: f->curstat = 2;
167: f->out[2] = 0;
168: f->reset = 0;
169: k = *(f->re[0].lfollow);
1.25 ! millert 170: xfree(f->posns[2]);
1.13 pvalchev 171: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 172: overflo("out of space in makeinit");
173: for (i=0; i <= k; i++) {
174: (f->posns[2])[i] = (f->re[0].lfollow)[i];
175: }
176: if ((f->posns[2])[1] == f->accept)
177: f->out[2] = 1;
178: for (i=0; i < NCHARS; i++)
179: f->gototab[2][i] = 0;
180: f->curstat = cgoto(f, 2, HAT);
181: if (anchor) {
182: *f->posns[2] = k-1; /* leave out position 0 */
183: for (i=0; i < k; i++) {
184: (f->posns[0])[i] = (f->posns[2])[i];
185: }
186:
187: f->out[0] = f->out[2];
188: if (f->curstat != 2)
189: --(*f->posns[f->curstat]);
190: }
191: return f->curstat;
192: }
193:
194: void penter(Node *p) /* set up parent pointers and leaf indices */
195: {
196: switch (type(p)) {
1.15 millert 197: ELEAF
1.1 tholo 198: LEAF
1.7 millert 199: info(p) = poscnt;
1.1 tholo 200: poscnt++;
201: break;
202: UNARY
203: penter(left(p));
204: parent(left(p)) = p;
205: break;
206: case CAT:
207: case OR:
208: penter(left(p));
209: penter(right(p));
210: parent(left(p)) = p;
211: parent(right(p)) = p;
212: break;
213: default: /* can't happen */
1.8 millert 214: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 215: break;
216: }
217: }
218:
219: void freetr(Node *p) /* free parse tree */
220: {
221: switch (type(p)) {
1.15 millert 222: ELEAF
1.1 tholo 223: LEAF
224: xfree(p);
225: break;
226: UNARY
227: freetr(left(p));
228: xfree(p);
229: break;
230: case CAT:
231: case OR:
232: freetr(left(p));
233: freetr(right(p));
234: xfree(p);
235: break;
236: default: /* can't happen */
1.8 millert 237: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 238: break;
239: }
240: }
241:
242: /* in the parsing of regular expressions, metacharacters like . have */
243: /* to be seen literally; \056 is not a metacharacter. */
244:
1.17 millert 245: int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */
1.2 millert 246: { /* only pick up one 8-bit byte (2 chars) */
1.10 millert 247: uschar *p;
1.1 tholo 248: int n = 0;
1.2 millert 249: int i;
1.1 tholo 250:
1.10 millert 251: for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1 tholo 252: if (isdigit(*p))
253: n = 16 * n + *p - '0';
254: else if (*p >= 'a' && *p <= 'f')
255: n = 16 * n + *p - 'a' + 10;
256: else if (*p >= 'A' && *p <= 'F')
257: n = 16 * n + *p - 'A' + 10;
258: }
1.17 millert 259: *pp = (uschar *) p;
1.1 tholo 260: return n;
261: }
262:
1.2 millert 263: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 264:
1.17 millert 265: int quoted(uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 266: /* and increment *pp */
267: {
1.17 millert 268: uschar *p = *pp;
1.1 tholo 269: int c;
270:
271: if ((c = *p++) == 't')
272: c = '\t';
273: else if (c == 'n')
274: c = '\n';
275: else if (c == 'f')
276: c = '\f';
277: else if (c == 'r')
278: c = '\r';
279: else if (c == 'b')
280: c = '\b';
1.25 ! millert 281: else if (c == 'v')
! 282: c = '\v';
1.20 millert 283: else if (c == 'a')
1.25 ! millert 284: c = '\a';
1.1 tholo 285: else if (c == '\\')
286: c = '\\';
287: else if (c == 'x') { /* hexadecimal goo follows */
288: c = hexstr(&p); /* this adds a null if number is invalid */
289: } else if (isoctdigit(c)) { /* \d \dd \ddd */
290: int n = c - '0';
291: if (isoctdigit(*p)) {
292: n = 8 * n + *p++ - '0';
293: if (isoctdigit(*p))
294: n = 8 * n + *p++ - '0';
295: }
296: c = n;
297: } /* else */
298: /* c = c; */
299: *pp = p;
300: return c;
301: }
302:
1.11 millert 303: char *cclenter(const char *argp) /* add a character class */
1.1 tholo 304: {
305: int i, c, c2;
1.10 millert 306: uschar *p = (uschar *) argp;
307: uschar *op, *bp;
1.25 ! millert 308: static uschar *buf = NULL;
1.5 kstailey 309: static int bufsz = 100;
1.1 tholo 310:
311: op = p;
1.25 ! millert 312: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 313: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 314: bp = buf;
315: for (i = 0; (c = *p++) != 0; ) {
1.1 tholo 316: if (c == '\\') {
1.17 millert 317: c = quoted(&p);
1.5 kstailey 318: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 319: if (*p != 0) {
1.5 kstailey 320: c = bp[-1];
1.1 tholo 321: c2 = *p++;
322: if (c2 == '\\')
1.17 millert 323: c2 = quoted(&p);
1.1 tholo 324: if (c > c2) { /* empty; ignore */
1.5 kstailey 325: bp--;
1.1 tholo 326: i--;
327: continue;
328: }
329: while (c < c2) {
1.15 millert 330: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
1.8 millert 331: FATAL("out of space for character class [%.10s...] 2", p);
1.5 kstailey 332: *bp++ = ++c;
1.1 tholo 333: i++;
334: }
335: continue;
336: }
337: }
1.15 millert 338: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
1.8 millert 339: FATAL("out of space for character class [%.10s...] 3", p);
1.5 kstailey 340: *bp++ = c;
1.1 tholo 341: i++;
342: }
1.5 kstailey 343: *bp = 0;
1.19 deraadt 344: DPRINTF( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
1.1 tholo 345: xfree(op);
1.10 millert 346: return (char *) tostring((char *) buf);
1.1 tholo 347: }
348:
1.11 millert 349: void overflo(const char *s)
1.1 tholo 350: {
1.8 millert 351: FATAL("regular expression too big: %.30s...", s);
1.1 tholo 352: }
353:
354: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
355: {
356: int i;
1.4 millert 357: int *p;
1.1 tholo 358:
359: switch (type(v)) {
1.15 millert 360: ELEAF
1.1 tholo 361: LEAF
1.7 millert 362: f->re[info(v)].ltype = type(v);
363: f->re[info(v)].lval.np = right(v);
1.1 tholo 364: while (f->accept >= maxsetvec) { /* guessing here! */
1.18 deraadt 365: setvec = reallocarray(setvec, maxsetvec,
366: 4 * sizeof(int));
367: tmpset = reallocarray(tmpset, maxsetvec,
368: 4 * sizeof(int));
1.25 ! millert 369: if (setvec == NULL || tmpset == NULL)
1.1 tholo 370: overflo("out of space in cfoll()");
1.18 deraadt 371: maxsetvec *= 4;
1.1 tholo 372: }
373: for (i = 0; i <= f->accept; i++)
374: setvec[i] = 0;
375: setcnt = 0;
376: follow(v); /* computes setvec and setcnt */
1.13 pvalchev 377: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 378: overflo("out of space building follow set");
1.7 millert 379: f->re[info(v)].lfollow = p;
1.1 tholo 380: *p = setcnt;
381: for (i = f->accept; i >= 0; i--)
382: if (setvec[i] == 1)
383: *++p = i;
384: break;
385: UNARY
386: cfoll(f,left(v));
387: break;
388: case CAT:
389: case OR:
390: cfoll(f,left(v));
391: cfoll(f,right(v));
392: break;
393: default: /* can't happen */
1.8 millert 394: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 395: }
396: }
397:
398: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 399: /* returns 0 if p matches empty string */
1.1 tholo 400: {
1.4 millert 401: int b, lp;
1.1 tholo 402:
403: switch (type(p)) {
1.15 millert 404: ELEAF
1.1 tholo 405: LEAF
1.7 millert 406: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 407: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.18 deraadt 408: setvec = reallocarray(setvec, maxsetvec,
409: 4 * sizeof(int));
410: tmpset = reallocarray(tmpset, maxsetvec,
411: 4 * sizeof(int));
1.25 ! millert 412: if (setvec == NULL || tmpset == NULL)
1.1 tholo 413: overflo("out of space in first()");
1.18 deraadt 414: maxsetvec *= 4;
1.1 tholo 415: }
1.15 millert 416: if (type(p) == EMPTYRE) {
417: setvec[lp] = 0;
418: return(0);
419: }
1.1 tholo 420: if (setvec[lp] != 1) {
421: setvec[lp] = 1;
422: setcnt++;
423: }
424: if (type(p) == CCL && (*(char *) right(p)) == '\0')
425: return(0); /* empty CCL */
426: else return(1);
427: case PLUS:
428: if (first(left(p)) == 0) return(0);
429: return(1);
430: case STAR:
431: case QUEST:
432: first(left(p));
433: return(0);
434: case CAT:
435: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
436: return(1);
437: case OR:
438: b = first(right(p));
439: if (first(left(p)) == 0 || b == 0) return(0);
440: return(1);
441: }
1.8 millert 442: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 443: return(-1);
444: }
445:
446: void follow(Node *v) /* collects leaves that can follow v into setvec */
447: {
448: Node *p;
449:
450: if (type(v) == FINAL)
451: return;
452: p = parent(v);
453: switch (type(p)) {
454: case STAR:
455: case PLUS:
456: first(v);
457: follow(p);
458: return;
459:
460: case OR:
461: case QUEST:
462: follow(p);
463: return;
464:
465: case CAT:
466: if (v == left(p)) { /* v is left child of p */
467: if (first(right(p)) == 0) {
468: follow(p);
469: return;
470: }
471: } else /* v is right child */
472: follow(p);
473: return;
474: }
475: }
476:
1.11 millert 477: int member(int c, const char *sarg) /* is c in s? */
1.1 tholo 478: {
1.10 millert 479: uschar *s = (uschar *) sarg;
480:
1.1 tholo 481: while (*s)
482: if (c == *s++)
483: return(1);
484: return(0);
485: }
486:
1.11 millert 487: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 488: {
489: int s, ns;
490: uschar *p = (uschar *) p0;
491:
492: s = f->reset ? makeinit(f,0) : f->initstat;
493: if (f->out[s])
494: return(1);
495: do {
1.15 millert 496: /* assert(*p < NCHARS); */
1.1 tholo 497: if ((ns = f->gototab[s][*p]) != 0)
498: s = ns;
499: else
500: s = cgoto(f, s, *p);
501: if (f->out[s])
502: return(1);
503: } while (*p++ != 0);
504: return(0);
505: }
506:
1.11 millert 507: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 508: {
509: int s, ns;
510: uschar *p = (uschar *) p0;
511: uschar *q;
512: int i, k;
513:
1.12 millert 514: /* s = f->reset ? makeinit(f,1) : f->initstat; */
515: if (f->reset) {
516: f->initstat = s = makeinit(f,1);
517: } else {
518: s = f->initstat;
519: }
1.1 tholo 520: patbeg = (char *) p;
521: patlen = -1;
522: do {
523: q = p;
524: do {
525: if (f->out[s]) /* final state */
526: patlen = q-p;
1.15 millert 527: /* assert(*q < NCHARS); */
1.1 tholo 528: if ((ns = f->gototab[s][*q]) != 0)
529: s = ns;
530: else
531: s = cgoto(f, s, *q);
1.9 deraadt 532: if (s == 1) { /* no transition */
1.1 tholo 533: if (patlen >= 0) {
534: patbeg = (char *) p;
535: return(1);
536: }
537: else
538: goto nextin; /* no match */
1.9 deraadt 539: }
1.1 tholo 540: } while (*q++ != 0);
541: if (f->out[s])
542: patlen = q-p-1; /* don't count $ */
543: if (patlen >= 0) {
544: patbeg = (char *) p;
545: return(1);
546: }
547: nextin:
548: s = 2;
549: if (f->reset) {
550: for (i = 2; i <= f->curstat; i++)
551: xfree(f->posns[i]);
1.25 ! millert 552: k = *f->posns[0];
1.13 pvalchev 553: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 554: overflo("out of space in pmatch");
555: for (i = 0; i <= k; i++)
556: (f->posns[2])[i] = (f->posns[0])[i];
557: f->initstat = f->curstat = 2;
558: f->out[2] = f->out[0];
559: for (i = 0; i < NCHARS; i++)
560: f->gototab[2][i] = 0;
561: }
562: } while (*p++ != 0);
563: return (0);
564: }
565:
1.11 millert 566: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 567: {
568: int s, ns;
569: uschar *p = (uschar *) p0;
570: uschar *q;
571: int i, k;
572:
1.12 millert 573: /* s = f->reset ? makeinit(f,1) : f->initstat; */
574: if (f->reset) {
575: f->initstat = s = makeinit(f,1);
576: } else {
577: s = f->initstat;
578: }
1.1 tholo 579: patlen = -1;
580: while (*p) {
581: q = p;
582: do {
583: if (f->out[s]) /* final state */
584: patlen = q-p;
1.15 millert 585: /* assert(*q < NCHARS); */
1.1 tholo 586: if ((ns = f->gototab[s][*q]) != 0)
587: s = ns;
588: else
589: s = cgoto(f, s, *q);
1.9 deraadt 590: if (s == 1) { /* no transition */
1.1 tholo 591: if (patlen > 0) {
592: patbeg = (char *) p;
593: return(1);
594: } else
595: goto nnextin; /* no nonempty match */
1.9 deraadt 596: }
1.1 tholo 597: } while (*q++ != 0);
598: if (f->out[s])
599: patlen = q-p-1; /* don't count $ */
600: if (patlen > 0 ) {
601: patbeg = (char *) p;
602: return(1);
603: }
604: nnextin:
605: s = 2;
606: if (f->reset) {
607: for (i = 2; i <= f->curstat; i++)
608: xfree(f->posns[i]);
1.25 ! millert 609: k = *f->posns[0];
1.13 pvalchev 610: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 611: overflo("out of state space");
612: for (i = 0; i <= k; i++)
613: (f->posns[2])[i] = (f->posns[0])[i];
614: f->initstat = f->curstat = 2;
615: f->out[2] = f->out[0];
616: for (i = 0; i < NCHARS; i++)
617: f->gototab[2][i] = 0;
618: }
619: p++;
620: }
621: return (0);
622: }
623:
1.11 millert 624: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 625: { /* uses relex() to scan regular expression */
626: Node *np;
627:
1.19 deraadt 628: DPRINTF( ("reparse <%s>\n", p) );
1.10 millert 629: lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 630: rtok = relex();
1.11 millert 631: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 632: if (rtok == '\0') {
1.11 millert 633: /* FATAL("empty regular expression"); previous */
1.15 millert 634: return(op2(EMPTYRE, NIL, NIL));
635: }
1.1 tholo 636: np = regexp();
637: if (rtok != '\0')
1.8 millert 638: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 639: return(np);
640: }
641:
642: Node *regexp(void) /* top-level parse of reg expr */
643: {
644: return (alt(concat(primary())));
645: }
646:
647: Node *primary(void)
648: {
649: Node *np;
1.23 millert 650: int savelastatom;
1.1 tholo 651:
652: switch (rtok) {
653: case CHAR:
1.23 millert 654: lastatom = starttok;
1.7 millert 655: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 656: rtok = relex();
657: return (unary(np));
658: case ALL:
659: rtok = relex();
660: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 661: case EMPTYRE:
662: rtok = relex();
1.23 millert 663: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 664: case DOT:
1.23 millert 665: lastatom = starttok;
1.1 tholo 666: rtok = relex();
667: return (unary(op2(DOT, NIL, NIL)));
668: case CCL:
1.10 millert 669: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
1.23 millert 670: lastatom = starttok;
1.1 tholo 671: rtok = relex();
672: return (unary(np));
673: case NCCL:
1.10 millert 674: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
1.23 millert 675: lastatom = starttok;
1.1 tholo 676: rtok = relex();
677: return (unary(np));
678: case '^':
679: rtok = relex();
1.7 millert 680: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 681: case '$':
682: rtok = relex();
683: return (unary(op2(CHAR, NIL, NIL)));
684: case '(':
1.23 millert 685: lastatom = starttok;
686: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 687: rtok = relex();
688: if (rtok == ')') { /* special pleading for () */
689: rtok = relex();
690: return unary(op2(CCL, NIL, (Node *) tostring("")));
691: }
692: np = regexp();
693: if (rtok == ')') {
1.23 millert 694: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 695: rtok = relex();
696: return (unary(np));
697: }
698: else
1.8 millert 699: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 700: default:
1.8 millert 701: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 702: }
703: return 0; /*NOTREACHED*/
704: }
705:
706: Node *concat(Node *np)
707: {
708: switch (rtok) {
1.23 millert 709: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 710: return (concat(op2(CAT, np, primary())));
1.23 millert 711: case EMPTYRE:
712: rtok = relex();
713: return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
714: primary())));
1.1 tholo 715: }
716: return (np);
717: }
718:
719: Node *alt(Node *np)
720: {
721: if (rtok == OR) {
722: rtok = relex();
723: return (alt(op2(OR, np, concat(primary()))));
724: }
725: return (np);
726: }
727:
728: Node *unary(Node *np)
729: {
730: switch (rtok) {
731: case STAR:
732: rtok = relex();
733: return (unary(op2(STAR, np, NIL)));
734: case PLUS:
735: rtok = relex();
736: return (unary(op2(PLUS, np, NIL)));
737: case QUEST:
738: rtok = relex();
739: return (unary(op2(QUEST, np, NIL)));
740: default:
741: return (np);
742: }
743: }
744:
1.11 millert 745: /*
746: * Character class definitions conformant to the POSIX locale as
747: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
748: * and operating character sets are both ASCII (ISO646) or supersets
749: * thereof.
750: *
751: * Note that to avoid overflowing the temporary buffer used in
752: * relex(), the expanded character class (prior to range expansion)
753: * must be less than twice the size of their full name.
754: */
1.12 millert 755:
756: /* Because isblank doesn't show up in any of the header files on any
757: * system i use, it's defined here. if some other locale has a richer
758: * definition of "blank", define HAS_ISBLANK and provide your own
759: * version.
760: * the parentheses here are an attempt to find a path through the maze
761: * of macro definition and/or function and/or version provided. thanks
762: * to nelson beebe for the suggestion; let's see if it works everywhere.
763: */
764:
765: #ifndef HAS_ISBLANK
766:
1.17 millert 767: int (xisblank)(int c)
1.12 millert 768: {
769: return c==' ' || c=='\t';
770: }
771:
772: #endif
773:
1.11 millert 774: struct charclass {
775: const char *cc_name;
776: int cc_namelen;
1.12 millert 777: int (*cc_func)(int);
1.11 millert 778: } charclasses[] = {
1.12 millert 779: { "alnum", 5, isalnum },
780: { "alpha", 5, isalpha },
1.17 millert 781: #ifndef HAS_ISBLANK
1.21 millert 782: { "blank", 5, xisblank },
1.17 millert 783: #else
1.12 millert 784: { "blank", 5, isblank },
1.17 millert 785: #endif
1.12 millert 786: { "cntrl", 5, iscntrl },
787: { "digit", 5, isdigit },
788: { "graph", 5, isgraph },
789: { "lower", 5, islower },
790: { "print", 5, isprint },
791: { "punct", 5, ispunct },
792: { "space", 5, isspace },
793: { "upper", 5, isupper },
794: { "xdigit", 6, isxdigit },
1.11 millert 795: { NULL, 0, NULL },
796: };
797:
1.23 millert 798: #define REPEAT_SIMPLE 0
799: #define REPEAT_PLUS_APPENDED 1
800: #define REPEAT_WITH_Q 2
801: #define REPEAT_ZERO 3
802:
803: static int
804: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
805: int atomlen, int firstnum, int secondnum, int special_case)
806: {
807: int i, j;
808: uschar *buf = 0;
809: int ret = 1;
810: int init_q = (firstnum==0); /* first added char will be ? */
811: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
812: int prefix_length = reptok - basestr; /* prefix includes first rep */
813: int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */
814: int size = prefix_length + suffix_length;
815:
816: if (firstnum > 1) { /* add room for reps 2 through firstnum */
817: size += atomlen*(firstnum-1);
818: }
819:
820: /* Adjust size of buffer for special cases */
821: if (special_case == REPEAT_PLUS_APPENDED) {
822: size++; /* for the final + */
823: } else if (special_case == REPEAT_WITH_Q) {
824: size += init_q + (atomlen+1)* n_q_reps;
825: } else if (special_case == REPEAT_ZERO) {
826: size += 2; /* just a null ERE: () */
827: }
828: if ((buf = (uschar *) malloc(size+1)) == NULL)
829: FATAL("out of space in reg expr %.10s..", lastre);
830: memcpy(buf, basestr, prefix_length); /* copy prefix */
831: j = prefix_length;
832: if (special_case == REPEAT_ZERO) {
833: j -= atomlen;
834: buf[j++] = '(';
835: buf[j++] = ')';
836: }
837: for (i=1; i < firstnum; i++) { /* copy x reps */
838: memcpy(&buf[j], atom, atomlen);
839: j += atomlen;
840: }
841: if (special_case == REPEAT_PLUS_APPENDED) {
842: buf[j++] = '+';
843: } else if (special_case == REPEAT_WITH_Q) {
844: if (init_q) buf[j++] = '?';
845: for (i=0; i < n_q_reps; i++) { /* copy x? reps */
846: memcpy(&buf[j], atom, atomlen);
847: j += atomlen;
848: buf[j++] = '?';
849: }
850: }
851: memcpy(&buf[j], reptok+reptoklen, suffix_length);
852: if (special_case == REPEAT_ZERO) {
853: buf[j+suffix_length] = '\0';
854: } else {
855: buf[size] = '\0';
856: }
857: /* free old basestr */
858: if (firstbasestr != basestr) {
859: if (basestr)
860: xfree(basestr);
861: }
862: basestr = buf;
863: prestr = buf + prefix_length;
864: if (special_case == REPEAT_ZERO) {
865: prestr -= atomlen;
866: ret++;
867: }
868: return ret;
869: }
870:
871: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
872: int atomlen, int firstnum, int secondnum)
873: {
874: /*
875: In general, the repetition specifier or "bound" is replaced here
876: by an equivalent ERE string, repeating the immediately previous atom
877: and appending ? and + as needed. Note that the first copy of the
878: atom is left in place, except in the special_case of a zero-repeat
879: (i.e., {0}).
880: */
881: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
882: if (firstnum < 2) {
883: /* 0 or 1: should be handled before you get here */
884: FATAL("internal error");
885: } else {
886: return replace_repeat(reptok, reptoklen, atom, atomlen,
887: firstnum, secondnum, REPEAT_PLUS_APPENDED);
888: }
889: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
890: if (firstnum == 0) { /* {0} or {0,0} */
891: /* This case is unusual because the resulting
892: replacement string might actually be SMALLER than
893: the original ERE */
894: return replace_repeat(reptok, reptoklen, atom, atomlen,
895: firstnum, secondnum, REPEAT_ZERO);
896: } else { /* (firstnum >= 1) */
897: return replace_repeat(reptok, reptoklen, atom, atomlen,
898: firstnum, secondnum, REPEAT_SIMPLE);
899: }
900: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
901: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
902: return replace_repeat(reptok, reptoklen, atom, atomlen,
903: firstnum, secondnum, REPEAT_WITH_Q);
904: } else { /* Error - shouldn't be here (n>m) */
905: FATAL("internal error");
906: }
907: return 0;
908: }
1.11 millert 909:
1.1 tholo 910: int relex(void) /* lexical analyzer for reparse */
911: {
1.5 kstailey 912: int c, n;
1.1 tholo 913: int cflag;
1.25 ! millert 914: static uschar *buf = NULL;
1.5 kstailey 915: static int bufsz = 100;
1.10 millert 916: uschar *bp;
1.11 millert 917: struct charclass *cc;
1.12 millert 918: int i;
1.23 millert 919: int num, m, commafound, digitfound;
920: const uschar *startreptok;
1.24 millert 921: static int parens = 0;
1.23 millert 922:
923: rescan:
924: starttok = prestr;
1.1 tholo 925:
926: switch (c = *prestr++) {
927: case '|': return OR;
928: case '*': return STAR;
929: case '+': return PLUS;
930: case '?': return QUEST;
931: case '.': return DOT;
932: case '\0': prestr--; return '\0';
933: case '^':
934: case '$':
1.24 millert 935: return c;
1.1 tholo 936: case '(':
1.24 millert 937: parens++;
938: return c;
1.1 tholo 939: case ')':
1.24 millert 940: if (parens) {
941: parens--;
942: return c;
943: }
944: /* unmatched close parenthesis; per POSIX, treat as literal */
945: rlxval = c;
946: return CHAR;
1.1 tholo 947: case '\\':
1.17 millert 948: rlxval = quoted(&prestr);
1.1 tholo 949: return CHAR;
950: default:
951: rlxval = c;
952: return CHAR;
1.25 ! millert 953: case '[':
! 954: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 955: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 956: bp = buf;
1.1 tholo 957: if (*prestr == '^') {
958: cflag = 1;
959: prestr++;
960: }
961: else
962: cflag = 0;
1.11 millert 963: n = 2 * strlen((const char *) prestr)+1;
1.15 millert 964: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 965: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 966: for (; ; ) {
967: if ((c = *prestr++) == '\\') {
1.5 kstailey 968: *bp++ = '\\';
1.1 tholo 969: if ((c = *prestr++) == '\0')
1.8 millert 970: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 971: *bp++ = c;
1.10 millert 972: /* } else if (c == '\n') { */
973: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 974: } else if (c == '[' && *prestr == ':') {
975: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
976: for (cc = charclasses; cc->cc_name; cc++)
977: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
978: break;
979: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
980: prestr[2 + cc->cc_namelen] == ']') {
981: prestr += cc->cc_namelen + 3;
1.22 millert 982: /*
983: * BUG: We begin at 1, instead of 0, since we
984: * would otherwise prematurely terminate the
985: * string for classes like [[:cntrl:]]. This
986: * means that we can't match the NUL character,
987: * not without first adapting the entire
988: * program to track each string's length.
989: */
1.23 millert 990: for (i = 1; i <= UCHAR_MAX; i++) {
1.15 millert 991: if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1.12 millert 992: FATAL("out of space for reg expr %.10s...", lastre);
993: if (cc->cc_func(i)) {
994: *bp++ = i;
995: n++;
996: }
997: }
1.11 millert 998: } else
999: *bp++ = c;
1.23 millert 1000: } else if (c == '[' && *prestr == '.') {
1001: char collate_char;
1002: prestr++;
1003: collate_char = *prestr++;
1004: if (*prestr == '.' && prestr[1] == ']') {
1005: prestr += 2;
1006: /* Found it: map via locale TBD: for
1007: now, simply return this char. This
1008: is sufficient to pass conformance
1009: test awk.ex 156
1010: */
1011: if (*prestr == ']') {
1012: prestr++;
1013: rlxval = collate_char;
1014: return CHAR;
1015: }
1016: }
1017: } else if (c == '[' && *prestr == '=') {
1018: char equiv_char;
1019: prestr++;
1020: equiv_char = *prestr++;
1021: if (*prestr == '=' && prestr[1] == ']') {
1022: prestr += 2;
1023: /* Found it: map via locale TBD: for now
1024: simply return this char. This is
1025: sufficient to pass conformance test
1026: awk.ex 156
1027: */
1028: if (*prestr == ']') {
1029: prestr++;
1030: rlxval = equiv_char;
1031: return CHAR;
1032: }
1033: }
1.1 tholo 1034: } else if (c == '\0') {
1.8 millert 1035: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1036: } else if (bp == buf) { /* 1st char is special */
1037: *bp++ = c;
1.1 tholo 1038: } else if (c == ']') {
1.5 kstailey 1039: *bp++ = 0;
1.10 millert 1040: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1041: if (cflag == 0)
1042: return CCL;
1043: else
1044: return NCCL;
1045: } else
1.5 kstailey 1046: *bp++ = c;
1.1 tholo 1047: }
1.23 millert 1048: break;
1049: case '{':
1050: if (isdigit(*(prestr))) {
1051: num = 0; /* Process as a repetition */
1052: n = -1; m = -1;
1053: commafound = 0;
1054: digitfound = 0;
1055: startreptok = prestr-1;
1056: /* Remember start of previous atom here ? */
1057: } else { /* just a { char, not a repetition */
1058: rlxval = c;
1059: return CHAR;
1060: }
1061: for (; ; ) {
1062: if ((c = *prestr++) == '}') {
1063: if (commafound) {
1064: if (digitfound) { /* {n,m} */
1065: m = num;
1066: if (m<n)
1067: FATAL("illegal repetition expression: class %.20s",
1068: lastre);
1069: if ((n==0) && (m==1)) {
1070: return QUEST;
1071: }
1072: } else { /* {n,} */
1073: if (n==0) return STAR;
1074: if (n==1) return PLUS;
1075: }
1076: } else {
1077: if (digitfound) { /* {n} same as {n,n} */
1078: n = num;
1079: m = num;
1080: } else { /* {} */
1081: FATAL("illegal repetition expression: class %.20s",
1082: lastre);
1083: }
1084: }
1085: if (repeat(starttok, prestr-starttok, lastatom,
1086: startreptok - lastatom, n, m) > 0) {
1087: if ((n==0) && (m==0)) {
1088: return EMPTYRE;
1089: }
1090: /* must rescan input for next token */
1091: goto rescan;
1092: }
1093: /* Failed to replace: eat up {...} characters
1094: and treat like just PLUS */
1095: return PLUS;
1096: } else if (c == '\0') {
1097: FATAL("nonterminated character class %.20s",
1098: lastre);
1099: } else if (isdigit(c)) {
1100: num = 10 * num + c - '0';
1101: digitfound = 1;
1102: } else if (c == ',') {
1103: if (commafound)
1104: FATAL("illegal repetition expression: class %.20s",
1105: lastre);
1106: /* looking for {n,} or {n,m} */
1107: commafound = 1;
1108: n = num;
1109: digitfound = 0; /* reset */
1110: num = 0;
1111: } else {
1112: FATAL("illegal repetition expression: class %.20s",
1113: lastre);
1114: }
1115: }
1116: break;
1.1 tholo 1117: }
1118: }
1119:
1120: int cgoto(fa *f, int s, int c)
1121: {
1122: int i, j, k;
1.4 millert 1123: int *p, *q;
1.1 tholo 1124:
1.12 millert 1125: assert(c == HAT || c < NCHARS);
1.1 tholo 1126: while (f->accept >= maxsetvec) { /* guessing here! */
1.18 deraadt 1127: setvec = reallocarray(setvec, maxsetvec, 4 * sizeof(int));
1128: tmpset = reallocarray(tmpset, maxsetvec, 4 * sizeof(int));
1.25 ! millert 1129: if (setvec == NULL || tmpset == NULL)
1.1 tholo 1130: overflo("out of space in cgoto()");
1.18 deraadt 1131: maxsetvec *= 4;
1.1 tholo 1132: }
1133: for (i = 0; i <= f->accept; i++)
1134: setvec[i] = 0;
1135: setcnt = 0;
1136: /* compute positions of gototab[s,c] into setvec */
1137: p = f->posns[s];
1138: for (i = 1; i <= *p; i++) {
1139: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1140: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1141: || (k == DOT && c != 0 && c != HAT)
1142: || (k == ALL && c != 0)
1.15 millert 1143: || (k == EMPTYRE && c != 0)
1.10 millert 1144: || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1145: || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1 tholo 1146: q = f->re[p[i]].lfollow;
1147: for (j = 1; j <= *q; j++) {
1148: if (q[j] >= maxsetvec) {
1.18 deraadt 1149: setvec = reallocarray(setvec,
1150: maxsetvec, 4 * sizeof(int));
1151: tmpset = reallocarray(tmpset,
1152: maxsetvec, 4 * sizeof(int));
1.1 tholo 1153: if (setvec == 0 || tmpset == 0)
1154: overflo("cgoto overflow");
1.18 deraadt 1155: maxsetvec *= 4;
1.1 tholo 1156: }
1157: if (setvec[q[j]] == 0) {
1158: setcnt++;
1159: setvec[q[j]] = 1;
1160: }
1161: }
1162: }
1163: }
1164: }
1165: /* determine if setvec is a previous state */
1166: tmpset[0] = setcnt;
1167: j = 1;
1168: for (i = f->accept; i >= 0; i--)
1169: if (setvec[i]) {
1170: tmpset[j++] = i;
1171: }
1172: /* tmpset == previous state? */
1173: for (i = 1; i <= f->curstat; i++) {
1174: p = f->posns[i];
1175: if ((k = tmpset[0]) != p[0])
1176: goto different;
1177: for (j = 1; j <= k; j++)
1178: if (tmpset[j] != p[j])
1179: goto different;
1180: /* setvec is state i */
1181: f->gototab[s][c] = i;
1182: return i;
1183: different:;
1184: }
1185:
1186: /* add tmpset to current set of states */
1187: if (f->curstat >= NSTATES-1) {
1188: f->curstat = 2;
1189: f->reset = 1;
1190: for (i = 2; i < NSTATES; i++)
1191: xfree(f->posns[i]);
1192: } else
1193: ++(f->curstat);
1194: for (i = 0; i < NCHARS; i++)
1195: f->gototab[f->curstat][i] = 0;
1196: xfree(f->posns[f->curstat]);
1.13 pvalchev 1197: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 1198: overflo("out of space in cgoto");
1199:
1200: f->posns[f->curstat] = p;
1201: f->gototab[s][c] = f->curstat;
1202: for (i = 0; i <= setcnt; i++)
1203: p[i] = tmpset[i];
1204: if (setvec[f->accept])
1205: f->out[f->curstat] = 1;
1206: else
1207: f->out[f->curstat] = 0;
1208: return f->curstat;
1209: }
1210:
1211:
1212: void freefa(fa *f) /* free a finite automaton */
1213: {
1214: int i;
1215:
1216: if (f == NULL)
1217: return;
1218: for (i = 0; i <= f->curstat; i++)
1219: xfree(f->posns[i]);
1220: for (i = 0; i <= f->accept; i++) {
1221: xfree(f->re[i].lfollow);
1222: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1223: xfree((f->re[i].lval.np));
1224: }
1225: xfree(f->restr);
1226: xfree(f);
1227: }