Annotation of src/usr.bin/awk/b.c, Revision 1.24
1.24 ! millert 1: /* $OpenBSD: b.c,v 1.23 2020/06/10 21:01:50 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.5 kstailey 36: #include "ytab.h"
1.1 tholo 37:
1.12 millert 38: #define HAT (NCHARS+2) /* matches ^ in regular expr */
1.1 tholo 39: /* NCHARS is 2**n */
40: #define MAXLIN 22
41:
1.7 millert 42: #define type(v) (v)->nobj /* badly overloaded here */
43: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 44: #define left(v) (v)->narg[0]
45: #define right(v) (v)->narg[1]
46: #define parent(v) (v)->nnext
47:
48: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 49: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 50: #define UNARY case STAR: case PLUS: case QUEST:
51:
52: /* encoding in tree Nodes:
1.15 millert 53: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 54: left is index, right contains value or pointer to value
55: unary (STAR, PLUS, QUEST): left is child, right is null
56: binary (CAT, OR): left and right are children
57: parent contains pointer to parent
58: */
59:
60:
61: int *setvec;
62: int *tmpset;
63: int maxsetvec = 0;
64:
65: int rtok; /* next token in current re */
1.4 millert 66: int rlxval;
1.10 millert 67: static uschar *rlxstr;
68: static uschar *prestr; /* current position in current re */
69: static uschar *lastre; /* origin of last re */
1.23 millert 70: static uschar *lastatom; /* origin of last Atom */
71: static uschar *starttok;
72: static uschar *basestr; /* starts with original, replaced during
73: repetition processing */
74: static uschar *firstbasestr;
1.1 tholo 75:
1.4 millert 76: static int setcnt;
77: static int poscnt;
1.1 tholo 78:
79: char *patbeg;
80: int patlen;
81:
82: #define NFA 20 /* cache this many dynamic fa's */
83: fa *fatab[NFA];
84: int nfatab = 0; /* entries in fatab */
85:
1.11 millert 86: fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
1.1 tholo 87: {
88: int i, use, nuse;
89: fa *pfa;
1.6 millert 90: static int now = 1;
1.1 tholo 91:
92: if (setvec == 0) { /* first time through any RE */
93: maxsetvec = MAXLIN;
1.14 deraadt 94: setvec = (int *) calloc(maxsetvec, sizeof(int));
95: tmpset = (int *) calloc(maxsetvec, sizeof(int));
1.1 tholo 96: if (setvec == 0 || tmpset == 0)
97: overflo("out of space initializing makedfa");
98: }
99:
100: if (compile_time) /* a constant for sure */
101: return mkdfa(s, anchor);
102: for (i = 0; i < nfatab; i++) /* is it there already? */
103: if (fatab[i]->anchor == anchor
1.11 millert 104: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 105: fatab[i]->use = now++;
1.1 tholo 106: return fatab[i];
1.10 millert 107: }
1.1 tholo 108: pfa = mkdfa(s, anchor);
109: if (nfatab < NFA) { /* room for another */
110: fatab[nfatab] = pfa;
1.6 millert 111: fatab[nfatab]->use = now++;
1.1 tholo 112: nfatab++;
113: return pfa;
114: }
115: use = fatab[0]->use; /* replace least-recently used */
116: nuse = 0;
117: for (i = 1; i < nfatab; i++)
118: if (fatab[i]->use < use) {
119: use = fatab[i]->use;
120: nuse = i;
121: }
122: freefa(fatab[nuse]);
123: fatab[nuse] = pfa;
1.6 millert 124: pfa->use = now++;
1.1 tholo 125: return pfa;
126: }
127:
1.11 millert 128: fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
1.1 tholo 129: /* anchor = 1 for anchored matches, else 0 */
130: {
131: Node *p, *p1;
132: fa *f;
133:
1.23 millert 134: firstbasestr = (uschar *) s;
135: basestr = firstbasestr;
1.1 tholo 136: p = reparse(s);
137: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
138: /* put ALL STAR in front of reg. exp. */
139: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
140: /* put FINAL after reg. exp. */
141:
142: poscnt = 0;
143: penter(p1); /* enter parent pointers and leaf indices */
144: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
145: overflo("out of space for fa");
146: f->accept = poscnt-1; /* penter has computed number of positions in re */
147: cfoll(f, p1); /* set up follow sets */
148: freetr(p1);
1.13 pvalchev 149: if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
1.1 tholo 150: overflo("out of space in makedfa");
1.4 millert 151: if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
1.1 tholo 152: overflo("out of space in makedfa");
153: *f->posns[1] = 0;
154: f->initstat = makeinit(f, anchor);
155: f->anchor = anchor;
1.10 millert 156: f->restr = (uschar *) tostring(s);
1.23 millert 157: if (firstbasestr != basestr) {
158: if (basestr)
159: xfree(basestr);
160: }
1.1 tholo 161: return f;
162: }
163:
164: int makeinit(fa *f, int anchor)
165: {
166: int i, k;
167:
168: f->curstat = 2;
169: f->out[2] = 0;
170: f->reset = 0;
171: k = *(f->re[0].lfollow);
172: xfree(f->posns[2]);
1.13 pvalchev 173: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 174: overflo("out of space in makeinit");
175: for (i=0; i <= k; i++) {
176: (f->posns[2])[i] = (f->re[0].lfollow)[i];
177: }
178: if ((f->posns[2])[1] == f->accept)
179: f->out[2] = 1;
180: for (i=0; i < NCHARS; i++)
181: f->gototab[2][i] = 0;
182: f->curstat = cgoto(f, 2, HAT);
183: if (anchor) {
184: *f->posns[2] = k-1; /* leave out position 0 */
185: for (i=0; i < k; i++) {
186: (f->posns[0])[i] = (f->posns[2])[i];
187: }
188:
189: f->out[0] = f->out[2];
190: if (f->curstat != 2)
191: --(*f->posns[f->curstat]);
192: }
193: return f->curstat;
194: }
195:
196: void penter(Node *p) /* set up parent pointers and leaf indices */
197: {
198: switch (type(p)) {
1.15 millert 199: ELEAF
1.1 tholo 200: LEAF
1.7 millert 201: info(p) = poscnt;
1.1 tholo 202: poscnt++;
203: break;
204: UNARY
205: penter(left(p));
206: parent(left(p)) = p;
207: break;
208: case CAT:
209: case OR:
210: penter(left(p));
211: penter(right(p));
212: parent(left(p)) = p;
213: parent(right(p)) = p;
214: break;
215: default: /* can't happen */
1.8 millert 216: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 217: break;
218: }
219: }
220:
221: void freetr(Node *p) /* free parse tree */
222: {
223: switch (type(p)) {
1.15 millert 224: ELEAF
1.1 tholo 225: LEAF
226: xfree(p);
227: break;
228: UNARY
229: freetr(left(p));
230: xfree(p);
231: break;
232: case CAT:
233: case OR:
234: freetr(left(p));
235: freetr(right(p));
236: xfree(p);
237: break;
238: default: /* can't happen */
1.8 millert 239: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 240: break;
241: }
242: }
243:
244: /* in the parsing of regular expressions, metacharacters like . have */
245: /* to be seen literally; \056 is not a metacharacter. */
246:
1.17 millert 247: int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */
1.2 millert 248: { /* only pick up one 8-bit byte (2 chars) */
1.10 millert 249: uschar *p;
1.1 tholo 250: int n = 0;
1.2 millert 251: int i;
1.1 tholo 252:
1.10 millert 253: for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1 tholo 254: if (isdigit(*p))
255: n = 16 * n + *p - '0';
256: else if (*p >= 'a' && *p <= 'f')
257: n = 16 * n + *p - 'a' + 10;
258: else if (*p >= 'A' && *p <= 'F')
259: n = 16 * n + *p - 'A' + 10;
260: }
1.17 millert 261: *pp = (uschar *) p;
1.1 tholo 262: return n;
263: }
264:
1.2 millert 265: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 266:
1.17 millert 267: int quoted(uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 268: /* and increment *pp */
269: {
1.17 millert 270: uschar *p = *pp;
1.1 tholo 271: int c;
272:
273: if ((c = *p++) == 't')
274: c = '\t';
1.20 millert 275: else if (c == 'v')
276: c = '\v';
1.1 tholo 277: else if (c == 'n')
278: c = '\n';
279: else if (c == 'f')
280: c = '\f';
281: else if (c == 'r')
282: c = '\r';
283: else if (c == 'b')
284: c = '\b';
1.20 millert 285: else if (c == 'a')
286: c = '\007';
1.1 tholo 287: else if (c == '\\')
288: c = '\\';
289: else if (c == 'x') { /* hexadecimal goo follows */
290: c = hexstr(&p); /* this adds a null if number is invalid */
291: } else if (isoctdigit(c)) { /* \d \dd \ddd */
292: int n = c - '0';
293: if (isoctdigit(*p)) {
294: n = 8 * n + *p++ - '0';
295: if (isoctdigit(*p))
296: n = 8 * n + *p++ - '0';
297: }
298: c = n;
299: } /* else */
300: /* c = c; */
301: *pp = p;
302: return c;
303: }
304:
1.11 millert 305: char *cclenter(const char *argp) /* add a character class */
1.1 tholo 306: {
307: int i, c, c2;
1.10 millert 308: uschar *p = (uschar *) argp;
309: uschar *op, *bp;
310: static uschar *buf = 0;
1.5 kstailey 311: static int bufsz = 100;
1.1 tholo 312:
313: op = p;
1.10 millert 314: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 315: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 316: bp = buf;
317: for (i = 0; (c = *p++) != 0; ) {
1.1 tholo 318: if (c == '\\') {
1.17 millert 319: c = quoted(&p);
1.5 kstailey 320: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 321: if (*p != 0) {
1.5 kstailey 322: c = bp[-1];
1.1 tholo 323: c2 = *p++;
324: if (c2 == '\\')
1.17 millert 325: c2 = quoted(&p);
1.1 tholo 326: if (c > c2) { /* empty; ignore */
1.5 kstailey 327: bp--;
1.1 tholo 328: i--;
329: continue;
330: }
331: while (c < c2) {
1.15 millert 332: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
1.8 millert 333: FATAL("out of space for character class [%.10s...] 2", p);
1.5 kstailey 334: *bp++ = ++c;
1.1 tholo 335: i++;
336: }
337: continue;
338: }
339: }
1.15 millert 340: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
1.8 millert 341: FATAL("out of space for character class [%.10s...] 3", p);
1.5 kstailey 342: *bp++ = c;
1.1 tholo 343: i++;
344: }
1.5 kstailey 345: *bp = 0;
1.19 deraadt 346: DPRINTF( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
1.1 tholo 347: xfree(op);
1.10 millert 348: return (char *) tostring((char *) buf);
1.1 tholo 349: }
350:
1.11 millert 351: void overflo(const char *s)
1.1 tholo 352: {
1.8 millert 353: FATAL("regular expression too big: %.30s...", s);
1.1 tholo 354: }
355:
356: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
357: {
358: int i;
1.4 millert 359: int *p;
1.1 tholo 360:
361: switch (type(v)) {
1.15 millert 362: ELEAF
1.1 tholo 363: LEAF
1.7 millert 364: f->re[info(v)].ltype = type(v);
365: f->re[info(v)].lval.np = right(v);
1.1 tholo 366: while (f->accept >= maxsetvec) { /* guessing here! */
1.18 deraadt 367: setvec = reallocarray(setvec, maxsetvec,
368: 4 * sizeof(int));
369: tmpset = reallocarray(tmpset, maxsetvec,
370: 4 * sizeof(int));
1.5 kstailey 371: if (setvec == 0 || tmpset == 0)
1.1 tholo 372: overflo("out of space in cfoll()");
1.18 deraadt 373: maxsetvec *= 4;
1.1 tholo 374: }
375: for (i = 0; i <= f->accept; i++)
376: setvec[i] = 0;
377: setcnt = 0;
378: follow(v); /* computes setvec and setcnt */
1.13 pvalchev 379: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 380: overflo("out of space building follow set");
1.7 millert 381: f->re[info(v)].lfollow = p;
1.1 tholo 382: *p = setcnt;
383: for (i = f->accept; i >= 0; i--)
384: if (setvec[i] == 1)
385: *++p = i;
386: break;
387: UNARY
388: cfoll(f,left(v));
389: break;
390: case CAT:
391: case OR:
392: cfoll(f,left(v));
393: cfoll(f,right(v));
394: break;
395: default: /* can't happen */
1.8 millert 396: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 397: }
398: }
399:
400: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 401: /* returns 0 if p matches empty string */
1.1 tholo 402: {
1.4 millert 403: int b, lp;
1.1 tholo 404:
405: switch (type(p)) {
1.15 millert 406: ELEAF
1.1 tholo 407: LEAF
1.7 millert 408: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 409: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.18 deraadt 410: setvec = reallocarray(setvec, maxsetvec,
411: 4 * sizeof(int));
412: tmpset = reallocarray(tmpset, maxsetvec,
413: 4 * sizeof(int));
1.7 millert 414: if (setvec == 0 || tmpset == 0)
1.1 tholo 415: overflo("out of space in first()");
1.18 deraadt 416: maxsetvec *= 4;
1.1 tholo 417: }
1.15 millert 418: if (type(p) == EMPTYRE) {
419: setvec[lp] = 0;
420: return(0);
421: }
1.1 tholo 422: if (setvec[lp] != 1) {
423: setvec[lp] = 1;
424: setcnt++;
425: }
426: if (type(p) == CCL && (*(char *) right(p)) == '\0')
427: return(0); /* empty CCL */
428: else return(1);
429: case PLUS:
430: if (first(left(p)) == 0) return(0);
431: return(1);
432: case STAR:
433: case QUEST:
434: first(left(p));
435: return(0);
436: case CAT:
437: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
438: return(1);
439: case OR:
440: b = first(right(p));
441: if (first(left(p)) == 0 || b == 0) return(0);
442: return(1);
443: }
1.8 millert 444: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 445: return(-1);
446: }
447:
448: void follow(Node *v) /* collects leaves that can follow v into setvec */
449: {
450: Node *p;
451:
452: if (type(v) == FINAL)
453: return;
454: p = parent(v);
455: switch (type(p)) {
456: case STAR:
457: case PLUS:
458: first(v);
459: follow(p);
460: return;
461:
462: case OR:
463: case QUEST:
464: follow(p);
465: return;
466:
467: case CAT:
468: if (v == left(p)) { /* v is left child of p */
469: if (first(right(p)) == 0) {
470: follow(p);
471: return;
472: }
473: } else /* v is right child */
474: follow(p);
475: return;
476: }
477: }
478:
1.11 millert 479: int member(int c, const char *sarg) /* is c in s? */
1.1 tholo 480: {
1.10 millert 481: uschar *s = (uschar *) sarg;
482:
1.1 tholo 483: while (*s)
484: if (c == *s++)
485: return(1);
486: return(0);
487: }
488:
1.11 millert 489: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 490: {
491: int s, ns;
492: uschar *p = (uschar *) p0;
493:
494: s = f->reset ? makeinit(f,0) : f->initstat;
495: if (f->out[s])
496: return(1);
497: do {
1.15 millert 498: /* assert(*p < NCHARS); */
1.1 tholo 499: if ((ns = f->gototab[s][*p]) != 0)
500: s = ns;
501: else
502: s = cgoto(f, s, *p);
503: if (f->out[s])
504: return(1);
505: } while (*p++ != 0);
506: return(0);
507: }
508:
1.11 millert 509: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 510: {
511: int s, ns;
512: uschar *p = (uschar *) p0;
513: uschar *q;
514: int i, k;
515:
1.12 millert 516: /* s = f->reset ? makeinit(f,1) : f->initstat; */
517: if (f->reset) {
518: f->initstat = s = makeinit(f,1);
519: } else {
520: s = f->initstat;
521: }
1.1 tholo 522: patbeg = (char *) p;
523: patlen = -1;
524: do {
525: q = p;
526: do {
527: if (f->out[s]) /* final state */
528: patlen = q-p;
1.15 millert 529: /* assert(*q < NCHARS); */
1.1 tholo 530: if ((ns = f->gototab[s][*q]) != 0)
531: s = ns;
532: else
533: s = cgoto(f, s, *q);
1.9 deraadt 534: if (s == 1) { /* no transition */
1.1 tholo 535: if (patlen >= 0) {
536: patbeg = (char *) p;
537: return(1);
538: }
539: else
540: goto nextin; /* no match */
1.9 deraadt 541: }
1.1 tholo 542: } while (*q++ != 0);
543: if (f->out[s])
544: patlen = q-p-1; /* don't count $ */
545: if (patlen >= 0) {
546: patbeg = (char *) p;
547: return(1);
548: }
549: nextin:
550: s = 2;
551: if (f->reset) {
552: for (i = 2; i <= f->curstat; i++)
553: xfree(f->posns[i]);
554: k = *f->posns[0];
1.13 pvalchev 555: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 556: overflo("out of space in pmatch");
557: for (i = 0; i <= k; i++)
558: (f->posns[2])[i] = (f->posns[0])[i];
559: f->initstat = f->curstat = 2;
560: f->out[2] = f->out[0];
561: for (i = 0; i < NCHARS; i++)
562: f->gototab[2][i] = 0;
563: }
564: } while (*p++ != 0);
565: return (0);
566: }
567:
1.11 millert 568: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 569: {
570: int s, ns;
571: uschar *p = (uschar *) p0;
572: uschar *q;
573: int i, k;
574:
1.12 millert 575: /* s = f->reset ? makeinit(f,1) : f->initstat; */
576: if (f->reset) {
577: f->initstat = s = makeinit(f,1);
578: } else {
579: s = f->initstat;
580: }
1.1 tholo 581: patlen = -1;
582: while (*p) {
583: q = p;
584: do {
585: if (f->out[s]) /* final state */
586: patlen = q-p;
1.15 millert 587: /* assert(*q < NCHARS); */
1.1 tholo 588: if ((ns = f->gototab[s][*q]) != 0)
589: s = ns;
590: else
591: s = cgoto(f, s, *q);
1.9 deraadt 592: if (s == 1) { /* no transition */
1.1 tholo 593: if (patlen > 0) {
594: patbeg = (char *) p;
595: return(1);
596: } else
597: goto nnextin; /* no nonempty match */
1.9 deraadt 598: }
1.1 tholo 599: } while (*q++ != 0);
600: if (f->out[s])
601: patlen = q-p-1; /* don't count $ */
602: if (patlen > 0 ) {
603: patbeg = (char *) p;
604: return(1);
605: }
606: nnextin:
607: s = 2;
608: if (f->reset) {
609: for (i = 2; i <= f->curstat; i++)
610: xfree(f->posns[i]);
611: k = *f->posns[0];
1.13 pvalchev 612: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 613: overflo("out of state space");
614: for (i = 0; i <= k; i++)
615: (f->posns[2])[i] = (f->posns[0])[i];
616: f->initstat = f->curstat = 2;
617: f->out[2] = f->out[0];
618: for (i = 0; i < NCHARS; i++)
619: f->gototab[2][i] = 0;
620: }
621: p++;
622: }
623: return (0);
624: }
625:
1.11 millert 626: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 627: { /* uses relex() to scan regular expression */
628: Node *np;
629:
1.19 deraadt 630: DPRINTF( ("reparse <%s>\n", p) );
1.10 millert 631: lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 632: rtok = relex();
1.11 millert 633: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 634: if (rtok == '\0') {
1.11 millert 635: /* FATAL("empty regular expression"); previous */
1.15 millert 636: return(op2(EMPTYRE, NIL, NIL));
637: }
1.1 tholo 638: np = regexp();
639: if (rtok != '\0')
1.8 millert 640: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 641: return(np);
642: }
643:
644: Node *regexp(void) /* top-level parse of reg expr */
645: {
646: return (alt(concat(primary())));
647: }
648:
649: Node *primary(void)
650: {
651: Node *np;
1.23 millert 652: int savelastatom;
1.1 tholo 653:
654: switch (rtok) {
655: case CHAR:
1.23 millert 656: lastatom = starttok;
1.7 millert 657: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 658: rtok = relex();
659: return (unary(np));
660: case ALL:
661: rtok = relex();
662: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 663: case EMPTYRE:
664: rtok = relex();
1.23 millert 665: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 666: case DOT:
1.23 millert 667: lastatom = starttok;
1.1 tholo 668: rtok = relex();
669: return (unary(op2(DOT, NIL, NIL)));
670: case CCL:
1.10 millert 671: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
1.23 millert 672: lastatom = starttok;
1.1 tholo 673: rtok = relex();
674: return (unary(np));
675: case NCCL:
1.10 millert 676: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
1.23 millert 677: lastatom = starttok;
1.1 tholo 678: rtok = relex();
679: return (unary(np));
680: case '^':
681: rtok = relex();
1.7 millert 682: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 683: case '$':
684: rtok = relex();
685: return (unary(op2(CHAR, NIL, NIL)));
686: case '(':
1.23 millert 687: lastatom = starttok;
688: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 689: rtok = relex();
690: if (rtok == ')') { /* special pleading for () */
691: rtok = relex();
692: return unary(op2(CCL, NIL, (Node *) tostring("")));
693: }
694: np = regexp();
695: if (rtok == ')') {
1.23 millert 696: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 697: rtok = relex();
698: return (unary(np));
699: }
700: else
1.8 millert 701: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 702: default:
1.8 millert 703: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 704: }
705: return 0; /*NOTREACHED*/
706: }
707:
708: Node *concat(Node *np)
709: {
710: switch (rtok) {
1.23 millert 711: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 712: return (concat(op2(CAT, np, primary())));
1.23 millert 713: case EMPTYRE:
714: rtok = relex();
715: return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
716: primary())));
1.1 tholo 717: }
718: return (np);
719: }
720:
721: Node *alt(Node *np)
722: {
723: if (rtok == OR) {
724: rtok = relex();
725: return (alt(op2(OR, np, concat(primary()))));
726: }
727: return (np);
728: }
729:
730: Node *unary(Node *np)
731: {
732: switch (rtok) {
733: case STAR:
734: rtok = relex();
735: return (unary(op2(STAR, np, NIL)));
736: case PLUS:
737: rtok = relex();
738: return (unary(op2(PLUS, np, NIL)));
739: case QUEST:
740: rtok = relex();
741: return (unary(op2(QUEST, np, NIL)));
742: default:
743: return (np);
744: }
745: }
746:
1.11 millert 747: /*
748: * Character class definitions conformant to the POSIX locale as
749: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
750: * and operating character sets are both ASCII (ISO646) or supersets
751: * thereof.
752: *
753: * Note that to avoid overflowing the temporary buffer used in
754: * relex(), the expanded character class (prior to range expansion)
755: * must be less than twice the size of their full name.
756: */
1.12 millert 757:
758: /* Because isblank doesn't show up in any of the header files on any
759: * system i use, it's defined here. if some other locale has a richer
760: * definition of "blank", define HAS_ISBLANK and provide your own
761: * version.
762: * the parentheses here are an attempt to find a path through the maze
763: * of macro definition and/or function and/or version provided. thanks
764: * to nelson beebe for the suggestion; let's see if it works everywhere.
765: */
766:
767: #ifndef HAS_ISBLANK
768:
1.17 millert 769: int (xisblank)(int c)
1.12 millert 770: {
771: return c==' ' || c=='\t';
772: }
773:
774: #endif
775:
1.11 millert 776: struct charclass {
777: const char *cc_name;
778: int cc_namelen;
1.12 millert 779: int (*cc_func)(int);
1.11 millert 780: } charclasses[] = {
1.12 millert 781: { "alnum", 5, isalnum },
782: { "alpha", 5, isalpha },
1.17 millert 783: #ifndef HAS_ISBLANK
1.21 millert 784: { "blank", 5, xisblank },
1.17 millert 785: #else
1.12 millert 786: { "blank", 5, isblank },
1.17 millert 787: #endif
1.12 millert 788: { "cntrl", 5, iscntrl },
789: { "digit", 5, isdigit },
790: { "graph", 5, isgraph },
791: { "lower", 5, islower },
792: { "print", 5, isprint },
793: { "punct", 5, ispunct },
794: { "space", 5, isspace },
795: { "upper", 5, isupper },
796: { "xdigit", 6, isxdigit },
1.11 millert 797: { NULL, 0, NULL },
798: };
799:
1.23 millert 800: #define REPEAT_SIMPLE 0
801: #define REPEAT_PLUS_APPENDED 1
802: #define REPEAT_WITH_Q 2
803: #define REPEAT_ZERO 3
804:
805: static int
806: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
807: int atomlen, int firstnum, int secondnum, int special_case)
808: {
809: int i, j;
810: uschar *buf = 0;
811: int ret = 1;
812: int init_q = (firstnum==0); /* first added char will be ? */
813: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
814: int prefix_length = reptok - basestr; /* prefix includes first rep */
815: int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */
816: int size = prefix_length + suffix_length;
817:
818: if (firstnum > 1) { /* add room for reps 2 through firstnum */
819: size += atomlen*(firstnum-1);
820: }
821:
822: /* Adjust size of buffer for special cases */
823: if (special_case == REPEAT_PLUS_APPENDED) {
824: size++; /* for the final + */
825: } else if (special_case == REPEAT_WITH_Q) {
826: size += init_q + (atomlen+1)* n_q_reps;
827: } else if (special_case == REPEAT_ZERO) {
828: size += 2; /* just a null ERE: () */
829: }
830: if ((buf = (uschar *) malloc(size+1)) == NULL)
831: FATAL("out of space in reg expr %.10s..", lastre);
832: memcpy(buf, basestr, prefix_length); /* copy prefix */
833: j = prefix_length;
834: if (special_case == REPEAT_ZERO) {
835: j -= atomlen;
836: buf[j++] = '(';
837: buf[j++] = ')';
838: }
839: for (i=1; i < firstnum; i++) { /* copy x reps */
840: memcpy(&buf[j], atom, atomlen);
841: j += atomlen;
842: }
843: if (special_case == REPEAT_PLUS_APPENDED) {
844: buf[j++] = '+';
845: } else if (special_case == REPEAT_WITH_Q) {
846: if (init_q) buf[j++] = '?';
847: for (i=0; i < n_q_reps; i++) { /* copy x? reps */
848: memcpy(&buf[j], atom, atomlen);
849: j += atomlen;
850: buf[j++] = '?';
851: }
852: }
853: memcpy(&buf[j], reptok+reptoklen, suffix_length);
854: if (special_case == REPEAT_ZERO) {
855: buf[j+suffix_length] = '\0';
856: } else {
857: buf[size] = '\0';
858: }
859: /* free old basestr */
860: if (firstbasestr != basestr) {
861: if (basestr)
862: xfree(basestr);
863: }
864: basestr = buf;
865: prestr = buf + prefix_length;
866: if (special_case == REPEAT_ZERO) {
867: prestr -= atomlen;
868: ret++;
869: }
870: return ret;
871: }
872:
873: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
874: int atomlen, int firstnum, int secondnum)
875: {
876: /*
877: In general, the repetition specifier or "bound" is replaced here
878: by an equivalent ERE string, repeating the immediately previous atom
879: and appending ? and + as needed. Note that the first copy of the
880: atom is left in place, except in the special_case of a zero-repeat
881: (i.e., {0}).
882: */
883: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
884: if (firstnum < 2) {
885: /* 0 or 1: should be handled before you get here */
886: FATAL("internal error");
887: } else {
888: return replace_repeat(reptok, reptoklen, atom, atomlen,
889: firstnum, secondnum, REPEAT_PLUS_APPENDED);
890: }
891: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
892: if (firstnum == 0) { /* {0} or {0,0} */
893: /* This case is unusual because the resulting
894: replacement string might actually be SMALLER than
895: the original ERE */
896: return replace_repeat(reptok, reptoklen, atom, atomlen,
897: firstnum, secondnum, REPEAT_ZERO);
898: } else { /* (firstnum >= 1) */
899: return replace_repeat(reptok, reptoklen, atom, atomlen,
900: firstnum, secondnum, REPEAT_SIMPLE);
901: }
902: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
903: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
904: return replace_repeat(reptok, reptoklen, atom, atomlen,
905: firstnum, secondnum, REPEAT_WITH_Q);
906: } else { /* Error - shouldn't be here (n>m) */
907: FATAL("internal error");
908: }
909: return 0;
910: }
1.11 millert 911:
1.1 tholo 912: int relex(void) /* lexical analyzer for reparse */
913: {
1.5 kstailey 914: int c, n;
1.1 tholo 915: int cflag;
1.10 millert 916: static uschar *buf = 0;
1.5 kstailey 917: static int bufsz = 100;
1.10 millert 918: uschar *bp;
1.11 millert 919: struct charclass *cc;
1.12 millert 920: int i;
1.23 millert 921: int num, m, commafound, digitfound;
922: const uschar *startreptok;
1.24 ! millert 923: static int parens = 0;
1.23 millert 924:
925: rescan:
926: starttok = prestr;
1.1 tholo 927:
928: switch (c = *prestr++) {
929: case '|': return OR;
930: case '*': return STAR;
931: case '+': return PLUS;
932: case '?': return QUEST;
933: case '.': return DOT;
934: case '\0': prestr--; return '\0';
935: case '^':
936: case '$':
1.24 ! millert 937: return c;
1.1 tholo 938: case '(':
1.24 ! millert 939: parens++;
! 940: return c;
1.1 tholo 941: case ')':
1.24 ! millert 942: if (parens) {
! 943: parens--;
! 944: return c;
! 945: }
! 946: /* unmatched close parenthesis; per POSIX, treat as literal */
! 947: rlxval = c;
! 948: return CHAR;
1.1 tholo 949: case '\\':
1.17 millert 950: rlxval = quoted(&prestr);
1.1 tholo 951: return CHAR;
952: default:
953: rlxval = c;
954: return CHAR;
955: case '[':
1.10 millert 956: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 957: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 958: bp = buf;
1.1 tholo 959: if (*prestr == '^') {
960: cflag = 1;
961: prestr++;
962: }
963: else
964: cflag = 0;
1.11 millert 965: n = 2 * strlen((const char *) prestr)+1;
1.15 millert 966: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 967: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 968: for (; ; ) {
969: if ((c = *prestr++) == '\\') {
1.5 kstailey 970: *bp++ = '\\';
1.1 tholo 971: if ((c = *prestr++) == '\0')
1.8 millert 972: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 973: *bp++ = c;
1.10 millert 974: /* } else if (c == '\n') { */
975: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 976: } else if (c == '[' && *prestr == ':') {
977: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
978: for (cc = charclasses; cc->cc_name; cc++)
979: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
980: break;
981: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
982: prestr[2 + cc->cc_namelen] == ']') {
983: prestr += cc->cc_namelen + 3;
1.22 millert 984: /*
985: * BUG: We begin at 1, instead of 0, since we
986: * would otherwise prematurely terminate the
987: * string for classes like [[:cntrl:]]. This
988: * means that we can't match the NUL character,
989: * not without first adapting the entire
990: * program to track each string's length.
991: */
1.23 millert 992: for (i = 1; i <= UCHAR_MAX; i++) {
1.15 millert 993: if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1.12 millert 994: FATAL("out of space for reg expr %.10s...", lastre);
995: if (cc->cc_func(i)) {
996: *bp++ = i;
997: n++;
998: }
999: }
1.11 millert 1000: } else
1001: *bp++ = c;
1.23 millert 1002: } else if (c == '[' && *prestr == '.') {
1003: char collate_char;
1004: prestr++;
1005: collate_char = *prestr++;
1006: if (*prestr == '.' && prestr[1] == ']') {
1007: prestr += 2;
1008: /* Found it: map via locale TBD: for
1009: now, simply return this char. This
1010: is sufficient to pass conformance
1011: test awk.ex 156
1012: */
1013: if (*prestr == ']') {
1014: prestr++;
1015: rlxval = collate_char;
1016: return CHAR;
1017: }
1018: }
1019: } else if (c == '[' && *prestr == '=') {
1020: char equiv_char;
1021: prestr++;
1022: equiv_char = *prestr++;
1023: if (*prestr == '=' && prestr[1] == ']') {
1024: prestr += 2;
1025: /* Found it: map via locale TBD: for now
1026: simply return this char. This is
1027: sufficient to pass conformance test
1028: awk.ex 156
1029: */
1030: if (*prestr == ']') {
1031: prestr++;
1032: rlxval = equiv_char;
1033: return CHAR;
1034: }
1035: }
1.1 tholo 1036: } else if (c == '\0') {
1.8 millert 1037: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1038: } else if (bp == buf) { /* 1st char is special */
1039: *bp++ = c;
1.1 tholo 1040: } else if (c == ']') {
1.5 kstailey 1041: *bp++ = 0;
1.10 millert 1042: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1043: if (cflag == 0)
1044: return CCL;
1045: else
1046: return NCCL;
1047: } else
1.5 kstailey 1048: *bp++ = c;
1.1 tholo 1049: }
1.23 millert 1050: break;
1051: case '{':
1052: if (isdigit(*(prestr))) {
1053: num = 0; /* Process as a repetition */
1054: n = -1; m = -1;
1055: commafound = 0;
1056: digitfound = 0;
1057: startreptok = prestr-1;
1058: /* Remember start of previous atom here ? */
1059: } else { /* just a { char, not a repetition */
1060: rlxval = c;
1061: return CHAR;
1062: }
1063: for (; ; ) {
1064: if ((c = *prestr++) == '}') {
1065: if (commafound) {
1066: if (digitfound) { /* {n,m} */
1067: m = num;
1068: if (m<n)
1069: FATAL("illegal repetition expression: class %.20s",
1070: lastre);
1071: if ((n==0) && (m==1)) {
1072: return QUEST;
1073: }
1074: } else { /* {n,} */
1075: if (n==0) return STAR;
1076: if (n==1) return PLUS;
1077: }
1078: } else {
1079: if (digitfound) { /* {n} same as {n,n} */
1080: n = num;
1081: m = num;
1082: } else { /* {} */
1083: FATAL("illegal repetition expression: class %.20s",
1084: lastre);
1085: }
1086: }
1087: if (repeat(starttok, prestr-starttok, lastatom,
1088: startreptok - lastatom, n, m) > 0) {
1089: if ((n==0) && (m==0)) {
1090: return EMPTYRE;
1091: }
1092: /* must rescan input for next token */
1093: goto rescan;
1094: }
1095: /* Failed to replace: eat up {...} characters
1096: and treat like just PLUS */
1097: return PLUS;
1098: } else if (c == '\0') {
1099: FATAL("nonterminated character class %.20s",
1100: lastre);
1101: } else if (isdigit(c)) {
1102: num = 10 * num + c - '0';
1103: digitfound = 1;
1104: } else if (c == ',') {
1105: if (commafound)
1106: FATAL("illegal repetition expression: class %.20s",
1107: lastre);
1108: /* looking for {n,} or {n,m} */
1109: commafound = 1;
1110: n = num;
1111: digitfound = 0; /* reset */
1112: num = 0;
1113: } else {
1114: FATAL("illegal repetition expression: class %.20s",
1115: lastre);
1116: }
1117: }
1118: break;
1.1 tholo 1119: }
1120: }
1121:
1122: int cgoto(fa *f, int s, int c)
1123: {
1124: int i, j, k;
1.4 millert 1125: int *p, *q;
1.1 tholo 1126:
1.12 millert 1127: assert(c == HAT || c < NCHARS);
1.1 tholo 1128: while (f->accept >= maxsetvec) { /* guessing here! */
1.18 deraadt 1129: setvec = reallocarray(setvec, maxsetvec, 4 * sizeof(int));
1130: tmpset = reallocarray(tmpset, maxsetvec, 4 * sizeof(int));
1.7 millert 1131: if (setvec == 0 || tmpset == 0)
1.1 tholo 1132: overflo("out of space in cgoto()");
1.18 deraadt 1133: maxsetvec *= 4;
1.1 tholo 1134: }
1135: for (i = 0; i <= f->accept; i++)
1136: setvec[i] = 0;
1137: setcnt = 0;
1138: /* compute positions of gototab[s,c] into setvec */
1139: p = f->posns[s];
1140: for (i = 1; i <= *p; i++) {
1141: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1142: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1143: || (k == DOT && c != 0 && c != HAT)
1144: || (k == ALL && c != 0)
1.15 millert 1145: || (k == EMPTYRE && c != 0)
1.10 millert 1146: || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1147: || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1 tholo 1148: q = f->re[p[i]].lfollow;
1149: for (j = 1; j <= *q; j++) {
1150: if (q[j] >= maxsetvec) {
1.18 deraadt 1151: setvec = reallocarray(setvec,
1152: maxsetvec, 4 * sizeof(int));
1153: tmpset = reallocarray(tmpset,
1154: maxsetvec, 4 * sizeof(int));
1.1 tholo 1155: if (setvec == 0 || tmpset == 0)
1156: overflo("cgoto overflow");
1.18 deraadt 1157: maxsetvec *= 4;
1.1 tholo 1158: }
1159: if (setvec[q[j]] == 0) {
1160: setcnt++;
1161: setvec[q[j]] = 1;
1162: }
1163: }
1164: }
1165: }
1166: }
1167: /* determine if setvec is a previous state */
1168: tmpset[0] = setcnt;
1169: j = 1;
1170: for (i = f->accept; i >= 0; i--)
1171: if (setvec[i]) {
1172: tmpset[j++] = i;
1173: }
1174: /* tmpset == previous state? */
1175: for (i = 1; i <= f->curstat; i++) {
1176: p = f->posns[i];
1177: if ((k = tmpset[0]) != p[0])
1178: goto different;
1179: for (j = 1; j <= k; j++)
1180: if (tmpset[j] != p[j])
1181: goto different;
1182: /* setvec is state i */
1183: f->gototab[s][c] = i;
1184: return i;
1185: different:;
1186: }
1187:
1188: /* add tmpset to current set of states */
1189: if (f->curstat >= NSTATES-1) {
1190: f->curstat = 2;
1191: f->reset = 1;
1192: for (i = 2; i < NSTATES; i++)
1193: xfree(f->posns[i]);
1194: } else
1195: ++(f->curstat);
1196: for (i = 0; i < NCHARS; i++)
1197: f->gototab[f->curstat][i] = 0;
1198: xfree(f->posns[f->curstat]);
1.13 pvalchev 1199: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 1200: overflo("out of space in cgoto");
1201:
1202: f->posns[f->curstat] = p;
1203: f->gototab[s][c] = f->curstat;
1204: for (i = 0; i <= setcnt; i++)
1205: p[i] = tmpset[i];
1206: if (setvec[f->accept])
1207: f->out[f->curstat] = 1;
1208: else
1209: f->out[f->curstat] = 0;
1210: return f->curstat;
1211: }
1212:
1213:
1214: void freefa(fa *f) /* free a finite automaton */
1215: {
1216: int i;
1217:
1218: if (f == NULL)
1219: return;
1220: for (i = 0; i <= f->curstat; i++)
1221: xfree(f->posns[i]);
1222: for (i = 0; i <= f->accept; i++) {
1223: xfree(f->re[i].lfollow);
1224: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1225: xfree((f->re[i].lval.np));
1226: }
1227: xfree(f->restr);
1228: xfree(f);
1229: }