Annotation of src/usr.bin/awk/b.c, Revision 1.48
1.48 ! millert 1: /* $OpenBSD: b.c,v 1.47 2023/11/15 18:56:53 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.43 millert 84: extern int u8_nextlen(const char *s);
85:
1.38 millert 86:
87: /* utf-8 mechanism:
88:
89: For most of Awk, utf-8 strings just "work", since they look like
90: null-terminated sequences of 8-bit bytes.
91:
92: Functions like length(), index(), and substr() have to operate
93: in units of utf-8 characters. The u8_* functions in run.c
94: handle this.
95:
96: Regular expressions are more complicated, since the basic
97: mechanism of the goto table used 8-bit byte indices into the
98: gototab entries to compute the next state. Unicode is a lot
99: bigger, so the gototab entries are now structs with a character
100: and a next state, and there is a linear search of the characters
101: to find the state. (Yes, this is slower, by a significant
102: amount. Tough.)
103:
104: Throughout the RE mechanism in b.c, utf-8 characters are
105: converted to their utf-32 value. This mostly shows up in
106: cclenter, which expands character class ranges like a-z and now
107: alpha-omega. The size of a gototab array is still about 256.
108: This should be dynamic, but for now things work ok for a single
109: code page of Unicode, which is the most likely case.
110:
111: The code changes are localized in run.c and b.c. I have added a
112: handful of functions to somewhat better hide the implementation,
113: but a lot more could be done.
114:
115: */
116:
117: static int get_gototab(fa*, int, int);
118: static int set_gototab(fa*, int, int, int);
1.44 millert 119: static void reset_gototab(fa*, int);
1.38 millert 120: extern int u8_rune(int *, const uschar *);
121:
1.27 millert 122: static int *
123: intalloc(size_t n, const char *f)
124: {
1.35 millert 125: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 126: if (p == NULL)
127: overflo(f);
128: return p;
129: }
130:
131: static void
132: allocsetvec(const char *f)
133: {
134: maxsetvec = MAXLIN;
1.35 millert 135: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
136: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 137: if (setvec == NULL || tmpset == NULL)
138: overflo(f);
139: }
140:
141: static void
142: resizesetvec(const char *f)
143: {
1.35 millert 144: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
145: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 146: if (setvec == NULL || tmpset == NULL)
147: overflo(f);
148: maxsetvec *= 4;
149: }
150:
151: static void
152: resize_state(fa *f, int state)
153: {
1.38 millert 154: gtt **p;
1.35 millert 155: uschar *p2;
156: int **p3;
1.27 millert 157: int i, new_count;
158:
159: if (++state < f->state_count)
160: return;
161:
162: new_count = state + 10; /* needs to be tuned */
163:
1.38 millert 164: p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 165: if (p == NULL)
166: goto out;
167: f->gototab = p;
168:
1.35 millert 169: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
170: if (p2 == NULL)
1.27 millert 171: goto out;
1.35 millert 172: f->out = p2;
1.27 millert 173:
1.35 millert 174: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
175: if (p3 == NULL)
1.27 millert 176: goto out;
1.35 millert 177: f->posns = p3;
1.27 millert 178:
179: for (i = f->state_count; i < new_count; ++i) {
1.38 millert 180: f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 181: if (f->gototab[i] == NULL)
182: goto out;
183: f->out[i] = 0;
184: f->posns[i] = NULL;
185: }
1.38 millert 186: f->gototab_len = NCHARS; /* should be variable, growable */
1.27 millert 187: f->state_count = new_count;
188: return;
189: out:
190: overflo(__func__);
191: }
192:
1.29 millert 193: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 194: {
195: int i, use, nuse;
196: fa *pfa;
1.6 millert 197: static int now = 1;
1.1 tholo 198:
1.25 millert 199: if (setvec == NULL) { /* first time through any RE */
1.27 millert 200: allocsetvec(__func__);
1.1 tholo 201: }
202:
1.29 millert 203: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 204: return mkdfa(s, anchor);
205: for (i = 0; i < nfatab; i++) /* is it there already? */
206: if (fatab[i]->anchor == anchor
1.11 millert 207: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 208: fatab[i]->use = now++;
1.1 tholo 209: return fatab[i];
1.10 millert 210: }
1.1 tholo 211: pfa = mkdfa(s, anchor);
212: if (nfatab < NFA) { /* room for another */
213: fatab[nfatab] = pfa;
1.6 millert 214: fatab[nfatab]->use = now++;
1.1 tholo 215: nfatab++;
216: return pfa;
217: }
218: use = fatab[0]->use; /* replace least-recently used */
219: nuse = 0;
220: for (i = 1; i < nfatab; i++)
221: if (fatab[i]->use < use) {
222: use = fatab[i]->use;
223: nuse = i;
224: }
225: freefa(fatab[nuse]);
226: fatab[nuse] = pfa;
1.6 millert 227: pfa->use = now++;
1.1 tholo 228: return pfa;
229: }
230:
1.29 millert 231: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 232: /* anchor = true for anchored matches, else false */
1.1 tholo 233: {
234: Node *p, *p1;
235: fa *f;
236:
1.28 millert 237: firstbasestr = (const uschar *) s;
1.23 millert 238: basestr = firstbasestr;
1.1 tholo 239: p = reparse(s);
240: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
241: /* put ALL STAR in front of reg. exp. */
242: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
243: /* put FINAL after reg. exp. */
244:
245: poscnt = 0;
246: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 247: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 248: overflo(__func__);
1.1 tholo 249: f->accept = poscnt-1; /* penter has computed number of positions in re */
250: cfoll(f, p1); /* set up follow sets */
251: freetr(p1);
1.27 millert 252: resize_state(f, 1);
253: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
254: f->posns[1] = intalloc(1, __func__);
1.1 tholo 255: *f->posns[1] = 0;
256: f->initstat = makeinit(f, anchor);
257: f->anchor = anchor;
1.10 millert 258: f->restr = (uschar *) tostring(s);
1.23 millert 259: if (firstbasestr != basestr) {
260: if (basestr)
261: xfree(basestr);
262: }
1.1 tholo 263: return f;
264: }
265:
1.29 millert 266: int makeinit(fa *f, bool anchor)
1.1 tholo 267: {
268: int i, k;
269:
270: f->curstat = 2;
271: f->out[2] = 0;
272: k = *(f->re[0].lfollow);
1.25 millert 273: xfree(f->posns[2]);
1.27 millert 274: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 275: for (i = 0; i <= k; i++) {
1.1 tholo 276: (f->posns[2])[i] = (f->re[0].lfollow)[i];
277: }
278: if ((f->posns[2])[1] == f->accept)
279: f->out[2] = 1;
1.44 millert 280: reset_gototab(f, 2);
1.1 tholo 281: f->curstat = cgoto(f, 2, HAT);
282: if (anchor) {
283: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 284: for (i = 0; i < k; i++) {
1.1 tholo 285: (f->posns[0])[i] = (f->posns[2])[i];
286: }
287:
288: f->out[0] = f->out[2];
289: if (f->curstat != 2)
290: --(*f->posns[f->curstat]);
291: }
292: return f->curstat;
293: }
294:
295: void penter(Node *p) /* set up parent pointers and leaf indices */
296: {
297: switch (type(p)) {
1.15 millert 298: ELEAF
1.1 tholo 299: LEAF
1.7 millert 300: info(p) = poscnt;
1.1 tholo 301: poscnt++;
302: break;
303: UNARY
304: penter(left(p));
305: parent(left(p)) = p;
306: break;
307: case CAT:
308: case OR:
309: penter(left(p));
310: penter(right(p));
311: parent(left(p)) = p;
312: parent(right(p)) = p;
313: break;
1.31 millert 314: case ZERO:
315: break;
1.1 tholo 316: default: /* can't happen */
1.8 millert 317: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 318: break;
319: }
320: }
321:
322: void freetr(Node *p) /* free parse tree */
323: {
324: switch (type(p)) {
1.15 millert 325: ELEAF
1.1 tholo 326: LEAF
327: xfree(p);
328: break;
329: UNARY
1.31 millert 330: case ZERO:
1.1 tholo 331: freetr(left(p));
332: xfree(p);
333: break;
334: case CAT:
335: case OR:
336: freetr(left(p));
337: freetr(right(p));
338: xfree(p);
339: break;
340: default: /* can't happen */
1.8 millert 341: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 342: break;
343: }
344: }
345:
346: /* in the parsing of regular expressions, metacharacters like . have */
347: /* to be seen literally; \056 is not a metacharacter. */
348:
1.38 millert 349: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 350: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 351: const uschar *p;
1.1 tholo 352: int n = 0;
1.2 millert 353: int i;
1.1 tholo 354:
1.38 millert 355: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 356: if (isdigit(*p))
357: n = 16 * n + *p - '0';
358: else if (*p >= 'a' && *p <= 'f')
359: n = 16 * n + *p - 'a' + 10;
360: else if (*p >= 'A' && *p <= 'F')
361: n = 16 * n + *p - 'A' + 10;
362: }
1.28 millert 363: *pp = p;
1.1 tholo 364: return n;
365: }
366:
1.2 millert 367: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 368:
1.28 millert 369: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 370: /* and increment *pp */
371: {
1.28 millert 372: const uschar *p = *pp;
1.1 tholo 373: int c;
374:
1.38 millert 375: /* BUG: should advance by utf-8 char even if makes no sense */
376:
377: if ((c = *p++) == 't') {
1.1 tholo 378: c = '\t';
1.38 millert 379: } else if (c == 'n') {
1.1 tholo 380: c = '\n';
1.38 millert 381: } else if (c == 'f') {
1.1 tholo 382: c = '\f';
1.38 millert 383: } else if (c == 'r') {
1.1 tholo 384: c = '\r';
1.38 millert 385: } else if (c == 'b') {
1.1 tholo 386: c = '\b';
1.38 millert 387: } else if (c == 'v') {
1.25 millert 388: c = '\v';
1.38 millert 389: } else if (c == 'a') {
1.25 millert 390: c = '\a';
1.38 millert 391: } else if (c == '\\') {
1.1 tholo 392: c = '\\';
1.38 millert 393: } else if (c == 'x') { /* 2 hex digits follow */
394: c = hexstr(&p, 2); /* this adds a null if number is invalid */
395: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
396: c = hexstr(&p, 8);
1.1 tholo 397: } else if (isoctdigit(c)) { /* \d \dd \ddd */
398: int n = c - '0';
399: if (isoctdigit(*p)) {
400: n = 8 * n + *p++ - '0';
401: if (isoctdigit(*p))
402: n = 8 * n + *p++ - '0';
403: }
404: c = n;
405: } /* else */
406: /* c = c; */
407: *pp = p;
408: return c;
409: }
410:
1.38 millert 411: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 412: {
413: int i, c, c2;
1.38 millert 414: int n;
415: const uschar *p = (const uschar *) argp;
416: int *bp, *retp;
417: static int *buf = NULL;
1.5 kstailey 418: static int bufsz = 100;
1.1 tholo 419:
1.38 millert 420: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 421: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 422: bp = buf;
1.38 millert 423: for (i = 0; *p != 0; ) {
424: n = u8_rune(&c, p);
425: p += n;
1.1 tholo 426: if (c == '\\') {
1.17 millert 427: c = quoted(&p);
1.5 kstailey 428: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 429: if (*p != 0) {
1.5 kstailey 430: c = bp[-1];
1.38 millert 431: /* c2 = *p++; */
432: n = u8_rune(&c2, p);
433: p += n;
1.1 tholo 434: if (c2 == '\\')
1.38 millert 435: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 436: if (c > c2) { /* empty; ignore */
1.5 kstailey 437: bp--;
1.1 tholo 438: i--;
439: continue;
440: }
441: while (c < c2) {
1.38 millert 442: if (i >= bufsz) {
1.45 millert 443: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 444: if (buf == NULL)
445: FATAL("out of space for character class [%.10s...] 2", p);
446: bufsz *= 2;
447: bp = buf + i;
448: }
1.5 kstailey 449: *bp++ = ++c;
1.1 tholo 450: i++;
451: }
452: continue;
453: }
454: }
1.38 millert 455: if (i >= bufsz) {
1.45 millert 456: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 457: if (buf == NULL)
458: FATAL("out of space for character class [%.10s...] 2", p);
459: bufsz *= 2;
460: bp = buf + i;
461: }
1.5 kstailey 462: *bp++ = c;
1.1 tholo 463: i++;
464: }
1.5 kstailey 465: *bp = 0;
1.38 millert 466: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
467: /* xfree(op); BUG: what are we freeing here? */
468: retp = (int *) calloc(bp-buf+1, sizeof(int));
469: for (i = 0; i < bp-buf+1; i++)
470: retp[i] = buf[i];
471: return retp;
1.1 tholo 472: }
473:
1.11 millert 474: void overflo(const char *s)
1.1 tholo 475: {
1.27 millert 476: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 477: }
478:
479: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
480: {
481: int i;
1.4 millert 482: int *p;
1.1 tholo 483:
484: switch (type(v)) {
1.15 millert 485: ELEAF
1.1 tholo 486: LEAF
1.7 millert 487: f->re[info(v)].ltype = type(v);
488: f->re[info(v)].lval.np = right(v);
1.1 tholo 489: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 490: resizesetvec(__func__);
1.1 tholo 491: }
492: for (i = 0; i <= f->accept; i++)
493: setvec[i] = 0;
494: setcnt = 0;
495: follow(v); /* computes setvec and setcnt */
1.27 millert 496: p = intalloc(setcnt + 1, __func__);
1.7 millert 497: f->re[info(v)].lfollow = p;
1.1 tholo 498: *p = setcnt;
499: for (i = f->accept; i >= 0; i--)
500: if (setvec[i] == 1)
501: *++p = i;
502: break;
503: UNARY
504: cfoll(f,left(v));
505: break;
506: case CAT:
507: case OR:
508: cfoll(f,left(v));
509: cfoll(f,right(v));
510: break;
1.31 millert 511: case ZERO:
512: break;
1.1 tholo 513: default: /* can't happen */
1.8 millert 514: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 515: }
516: }
517:
518: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 519: /* returns 0 if p matches empty string */
1.1 tholo 520: {
1.4 millert 521: int b, lp;
1.1 tholo 522:
523: switch (type(p)) {
1.15 millert 524: ELEAF
1.1 tholo 525: LEAF
1.7 millert 526: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 527: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 528: resizesetvec(__func__);
1.1 tholo 529: }
1.15 millert 530: if (type(p) == EMPTYRE) {
531: setvec[lp] = 0;
532: return(0);
533: }
1.1 tholo 534: if (setvec[lp] != 1) {
535: setvec[lp] = 1;
536: setcnt++;
537: }
1.41 millert 538: if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1 tholo 539: return(0); /* empty CCL */
1.30 millert 540: return(1);
1.1 tholo 541: case PLUS:
1.30 millert 542: if (first(left(p)) == 0)
543: return(0);
1.1 tholo 544: return(1);
545: case STAR:
546: case QUEST:
547: first(left(p));
548: return(0);
549: case CAT:
550: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
551: return(1);
552: case OR:
553: b = first(right(p));
554: if (first(left(p)) == 0 || b == 0) return(0);
555: return(1);
1.31 millert 556: case ZERO:
557: return 0;
1.1 tholo 558: }
1.8 millert 559: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 560: return(-1);
561: }
562:
563: void follow(Node *v) /* collects leaves that can follow v into setvec */
564: {
565: Node *p;
566:
567: if (type(v) == FINAL)
568: return;
569: p = parent(v);
570: switch (type(p)) {
571: case STAR:
572: case PLUS:
573: first(v);
574: follow(p);
575: return;
576:
577: case OR:
578: case QUEST:
579: follow(p);
580: return;
581:
582: case CAT:
583: if (v == left(p)) { /* v is left child of p */
584: if (first(right(p)) == 0) {
585: follow(p);
586: return;
587: }
588: } else /* v is right child */
589: follow(p);
590: return;
591: }
592: }
593:
1.38 millert 594: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 595: {
1.38 millert 596: int *s = (int *) sarg;
1.10 millert 597:
1.1 tholo 598: while (*s)
599: if (c == *s++)
600: return(1);
601: return(0);
602: }
603:
1.38 millert 604: static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
605: {
606: int i;
607: for (i = 0; i < f->gototab_len; i++) {
608: if (f->gototab[state][i].ch == 0)
609: break;
610: if (f->gototab[state][i].ch == ch)
611: return f->gototab[state][i].state;
612: }
613: return 0;
614: }
615:
1.44 millert 616: static void reset_gototab(fa *f, int state) /* hide gototab inplementation */
617: {
618: memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab));
619: }
620:
1.38 millert 621: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
622: {
623: int i;
624: for (i = 0; i < f->gototab_len; i++) {
625: if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
626: f->gototab[state][i].ch = ch;
627: f->gototab[state][i].state = val;
628: return val;
629: }
630: }
631: overflo(__func__);
632: return val; /* not used anywhere at the moment */
633: }
634:
1.11 millert 635: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 636: {
637: int s, ns;
1.38 millert 638: int n;
639: int rune;
1.28 millert 640: const uschar *p = (const uschar *) p0;
1.1 tholo 641:
1.38 millert 642: /* return pmatch(f, p0); does it matter whether longest or shortest? */
643:
1.27 millert 644: s = f->initstat;
645: assert (s < f->state_count);
646:
1.1 tholo 647: if (f->out[s])
648: return(1);
649: do {
1.15 millert 650: /* assert(*p < NCHARS); */
1.38 millert 651: n = u8_rune(&rune, p);
652: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 653: s = ns;
654: else
1.38 millert 655: s = cgoto(f, s, rune);
1.1 tholo 656: if (f->out[s])
657: return(1);
1.38 millert 658: if (*p == 0)
659: break;
660: p += n;
661: } while (1); /* was *p++ != 0 */
1.1 tholo 662: return(0);
663: }
664:
1.11 millert 665: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 666: {
667: int s, ns;
1.38 millert 668: int n;
669: int rune;
1.28 millert 670: const uschar *p = (const uschar *) p0;
671: const uschar *q;
1.1 tholo 672:
1.27 millert 673: s = f->initstat;
674: assert(s < f->state_count);
675:
1.28 millert 676: patbeg = (const char *)p;
1.1 tholo 677: patlen = -1;
678: do {
679: q = p;
680: do {
681: if (f->out[s]) /* final state */
682: patlen = q-p;
1.15 millert 683: /* assert(*q < NCHARS); */
1.38 millert 684: n = u8_rune(&rune, q);
685: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 686: s = ns;
687: else
1.38 millert 688: s = cgoto(f, s, rune);
1.27 millert 689:
690: assert(s < f->state_count);
691:
1.9 deraadt 692: if (s == 1) { /* no transition */
1.1 tholo 693: if (patlen >= 0) {
1.28 millert 694: patbeg = (const char *) p;
1.1 tholo 695: return(1);
696: }
697: else
698: goto nextin; /* no match */
1.9 deraadt 699: }
1.38 millert 700: if (*q == 0)
701: break;
702: q += n;
703: } while (1);
704: q++; /* was *q++ */
1.1 tholo 705: if (f->out[s])
706: patlen = q-p-1; /* don't count $ */
707: if (patlen >= 0) {
1.28 millert 708: patbeg = (const char *) p;
1.1 tholo 709: return(1);
710: }
711: nextin:
712: s = 2;
1.38 millert 713: if (*p == 0)
714: break;
715: n = u8_rune(&rune, p);
716: p += n;
717: } while (1); /* was *p++ */
1.1 tholo 718: return (0);
719: }
720:
1.11 millert 721: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 722: {
723: int s, ns;
1.38 millert 724: int n;
725: int rune;
1.28 millert 726: const uschar *p = (const uschar *) p0;
727: const uschar *q;
1.1 tholo 728:
1.27 millert 729: s = f->initstat;
730: assert(s < f->state_count);
731:
1.28 millert 732: patbeg = (const char *)p;
1.1 tholo 733: patlen = -1;
734: while (*p) {
735: q = p;
736: do {
737: if (f->out[s]) /* final state */
738: patlen = q-p;
1.15 millert 739: /* assert(*q < NCHARS); */
1.38 millert 740: n = u8_rune(&rune, q);
741: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 742: s = ns;
743: else
1.38 millert 744: s = cgoto(f, s, rune);
1.9 deraadt 745: if (s == 1) { /* no transition */
1.1 tholo 746: if (patlen > 0) {
1.28 millert 747: patbeg = (const char *) p;
1.1 tholo 748: return(1);
749: } else
750: goto nnextin; /* no nonempty match */
1.9 deraadt 751: }
1.38 millert 752: if (*q == 0)
753: break;
754: q += n;
755: } while (1);
756: q++;
1.1 tholo 757: if (f->out[s])
758: patlen = q-p-1; /* don't count $ */
759: if (patlen > 0 ) {
1.28 millert 760: patbeg = (const char *) p;
1.1 tholo 761: return(1);
762: }
763: nnextin:
764: s = 2;
765: p++;
766: }
767: return (0);
768: }
769:
1.43 millert 770:
771: #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
772:
1.26 millert 773: /*
774: * NAME
775: * fnematch
776: *
777: * DESCRIPTION
778: * A stream-fed version of nematch which transfers characters to a
779: * null-terminated buffer. All characters up to and including the last
780: * character of the matching text or EOF are placed in the buffer. If
781: * a match is found, patbeg and patlen are set appropriately.
782: *
783: * RETURN VALUES
1.29 millert 784: * false No match found.
785: * true Match found.
1.26 millert 786: */
787:
1.29 millert 788: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 789: {
1.48 ! millert 790: char *i, *j, *k, *buf = *pbuf;
1.26 millert 791: int bufsize = *pbufsize;
1.48 ! millert 792: int c, n, ns, s;
1.26 millert 793:
794: s = pfa->initstat;
795: patlen = 0;
796:
797: /*
1.48 ! millert 798: * buf <= i <= j <= k <= buf+bufsize
1.26 millert 799: *
1.48 ! millert 800: * i: origin of active substring
! 801: * j: current character
! 802: * k: destination of the next getc
1.26 millert 803: */
1.48 ! millert 804:
! 805: i = j = k = buf;
! 806:
! 807: do {
! 808: /*
! 809: * Call u8_rune with at least MAX_UTF_BYTES ahead in
! 810: * the buffer until EOF interferes.
! 811: */
! 812: if (k - j < MAX_UTF_BYTES) {
! 813: if (k + MAX_UTF_BYTES > buf + bufsize) {
! 814: adjbuf(&buf, &bufsize,
! 815: bufsize + MAX_UTF_BYTES,
! 816: quantum, 0, "fnematch");
1.46 millert 817: }
1.48 ! millert 818: for (n = MAX_UTF_BYTES ; n > 0; n--) {
! 819: *k++ = (c = getc(f)) != EOF ? c : 0;
! 820: if (c == EOF) {
! 821: if (ferror(f))
! 822: FATAL("fnematch: getc error");
! 823: break;
! 824: }
1.38 millert 825: }
1.48 ! millert 826: }
1.26 millert 827:
1.48 ! millert 828: j += u8_rune(&c, (uschar *)j);
! 829:
! 830: if ((ns = get_gototab(pfa, s, c)) != 0)
! 831: s = ns;
! 832: else
! 833: s = cgoto(pfa, s, c);
1.26 millert 834:
1.48 ! millert 835: if (pfa->out[s]) { /* final state */
! 836: patbeg = i;
! 837: patlen = j - i;
! 838: if (c == 0) /* don't count $ */
! 839: patlen--;
! 840: }
! 841:
! 842: if (c && s != 1)
! 843: continue; /* origin i still viable, next j */
! 844: if (patlen)
! 845: break; /* best match found */
! 846:
! 847: /* no match at origin i, next i and start over */
! 848: i += u8_rune(&c, (uschar *)i);
! 849: if (c == 0)
! 850: break; /* no match */
! 851: j = i;
1.26 millert 852: s = 2;
1.48 ! millert 853: } while (1);
1.26 millert 854:
855: /* adjbuf() may have relocated a resized buffer. Inform the world. */
856: *pbuf = buf;
857: *pbufsize = bufsize;
858:
859: if (patlen) {
860: /*
861: * Under no circumstances is the last character fed to
862: * the automaton part of the match. It is EOF's nullbyte,
863: * or it sent the automaton into a state with no further
864: * transitions available (s==1), or both. Room for a
865: * terminating nullbyte is guaranteed.
866: *
867: * ungetc any chars after the end of matching text
868: * (except for EOF's nullbyte, if present) and null
869: * terminate the buffer.
870: */
1.48 ! millert 871: do
! 872: if (*--k && ungetc(*k, f) == EOF)
! 873: FATAL("unable to ungetc '%c'", *k);
! 874: while (k > patbeg + patlen);
! 875: *k = '\0';
1.29 millert 876: return true;
1.26 millert 877: }
878: else
1.29 millert 879: return false;
1.26 millert 880: }
881:
1.11 millert 882: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 883: { /* uses relex() to scan regular expression */
884: Node *np;
885:
1.32 millert 886: DPRINTF("reparse <%s>\n", p);
1.28 millert 887: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 888: rtok = relex();
1.11 millert 889: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 890: if (rtok == '\0') {
1.11 millert 891: /* FATAL("empty regular expression"); previous */
1.15 millert 892: return(op2(EMPTYRE, NIL, NIL));
893: }
1.1 tholo 894: np = regexp();
895: if (rtok != '\0')
1.8 millert 896: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 897: return(np);
898: }
899:
900: Node *regexp(void) /* top-level parse of reg expr */
901: {
902: return (alt(concat(primary())));
903: }
904:
905: Node *primary(void)
906: {
907: Node *np;
1.23 millert 908: int savelastatom;
1.1 tholo 909:
910: switch (rtok) {
911: case CHAR:
1.23 millert 912: lastatom = starttok;
1.7 millert 913: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 914: rtok = relex();
915: return (unary(np));
916: case ALL:
917: rtok = relex();
918: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 919: case EMPTYRE:
920: rtok = relex();
1.23 millert 921: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 922: case DOT:
1.23 millert 923: lastatom = starttok;
1.1 tholo 924: rtok = relex();
925: return (unary(op2(DOT, NIL, NIL)));
926: case CCL:
1.28 millert 927: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 928: lastatom = starttok;
1.1 tholo 929: rtok = relex();
930: return (unary(np));
931: case NCCL:
1.28 millert 932: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 933: lastatom = starttok;
1.1 tholo 934: rtok = relex();
935: return (unary(np));
936: case '^':
937: rtok = relex();
1.7 millert 938: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 939: case '$':
940: rtok = relex();
941: return (unary(op2(CHAR, NIL, NIL)));
942: case '(':
1.23 millert 943: lastatom = starttok;
944: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 945: rtok = relex();
946: if (rtok == ')') { /* special pleading for () */
947: rtok = relex();
1.42 millert 948: return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1 tholo 949: }
950: np = regexp();
951: if (rtok == ')') {
1.23 millert 952: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 953: rtok = relex();
954: return (unary(np));
955: }
956: else
1.8 millert 957: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 958: default:
1.8 millert 959: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 960: }
961: return 0; /*NOTREACHED*/
962: }
963:
964: Node *concat(Node *np)
965: {
966: switch (rtok) {
1.23 millert 967: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 968: return (concat(op2(CAT, np, primary())));
1.23 millert 969: case EMPTYRE:
970: rtok = relex();
1.42 millert 971: return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23 millert 972: primary())));
1.1 tholo 973: }
974: return (np);
975: }
976:
977: Node *alt(Node *np)
978: {
979: if (rtok == OR) {
980: rtok = relex();
981: return (alt(op2(OR, np, concat(primary()))));
982: }
983: return (np);
984: }
985:
986: Node *unary(Node *np)
987: {
988: switch (rtok) {
989: case STAR:
990: rtok = relex();
991: return (unary(op2(STAR, np, NIL)));
992: case PLUS:
993: rtok = relex();
994: return (unary(op2(PLUS, np, NIL)));
995: case QUEST:
996: rtok = relex();
997: return (unary(op2(QUEST, np, NIL)));
1.31 millert 998: case ZERO:
999: rtok = relex();
1000: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1001: default:
1002: return (np);
1003: }
1004: }
1005:
1.11 millert 1006: /*
1007: * Character class definitions conformant to the POSIX locale as
1008: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1009: * and operating character sets are both ASCII (ISO646) or supersets
1010: * thereof.
1011: *
1012: * Note that to avoid overflowing the temporary buffer used in
1013: * relex(), the expanded character class (prior to range expansion)
1014: * must be less than twice the size of their full name.
1015: */
1.12 millert 1016:
1017: /* Because isblank doesn't show up in any of the header files on any
1018: * system i use, it's defined here. if some other locale has a richer
1019: * definition of "blank", define HAS_ISBLANK and provide your own
1020: * version.
1021: * the parentheses here are an attempt to find a path through the maze
1022: * of macro definition and/or function and/or version provided. thanks
1023: * to nelson beebe for the suggestion; let's see if it works everywhere.
1024: */
1025:
1026: #ifndef HAS_ISBLANK
1027:
1.17 millert 1028: int (xisblank)(int c)
1.12 millert 1029: {
1030: return c==' ' || c=='\t';
1031: }
1032:
1033: #endif
1034:
1.31 millert 1035: static const struct charclass {
1.11 millert 1036: const char *cc_name;
1037: int cc_namelen;
1.12 millert 1038: int (*cc_func)(int);
1.11 millert 1039: } charclasses[] = {
1.12 millert 1040: { "alnum", 5, isalnum },
1041: { "alpha", 5, isalpha },
1.17 millert 1042: #ifndef HAS_ISBLANK
1.21 millert 1043: { "blank", 5, xisblank },
1.17 millert 1044: #else
1.12 millert 1045: { "blank", 5, isblank },
1.17 millert 1046: #endif
1.12 millert 1047: { "cntrl", 5, iscntrl },
1048: { "digit", 5, isdigit },
1049: { "graph", 5, isgraph },
1050: { "lower", 5, islower },
1051: { "print", 5, isprint },
1052: { "punct", 5, ispunct },
1053: { "space", 5, isspace },
1054: { "upper", 5, isupper },
1055: { "xdigit", 6, isxdigit },
1.11 millert 1056: { NULL, 0, NULL },
1057: };
1058:
1.23 millert 1059: #define REPEAT_SIMPLE 0
1060: #define REPEAT_PLUS_APPENDED 1
1061: #define REPEAT_WITH_Q 2
1062: #define REPEAT_ZERO 3
1063:
1064: static int
1065: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1066: int atomlen, int firstnum, int secondnum, int special_case)
1067: {
1068: int i, j;
1.26 millert 1069: uschar *buf = NULL;
1.23 millert 1070: int ret = 1;
1.31 millert 1071: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1072: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1073: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1074: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1075: int size = prefix_length + suffix_length;
1076:
1077: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1078: size += atomlen*(firstnum-1);
1079: }
1080:
1081: /* Adjust size of buffer for special cases */
1082: if (special_case == REPEAT_PLUS_APPENDED) {
1083: size++; /* for the final + */
1084: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1085: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1086: } else if (special_case == REPEAT_ZERO) {
1087: size += 2; /* just a null ERE: () */
1088: }
1.35 millert 1089: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1090: FATAL("out of space in reg expr %.10s..", lastre);
1091: memcpy(buf, basestr, prefix_length); /* copy prefix */
1092: j = prefix_length;
1093: if (special_case == REPEAT_ZERO) {
1094: j -= atomlen;
1095: buf[j++] = '(';
1096: buf[j++] = ')';
1097: }
1.30 millert 1098: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1099: memcpy(&buf[j], atom, atomlen);
1100: j += atomlen;
1101: }
1102: if (special_case == REPEAT_PLUS_APPENDED) {
1103: buf[j++] = '+';
1104: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1105: if (init_q)
1106: buf[j++] = '?';
1.30 millert 1107: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1108: memcpy(&buf[j], atom, atomlen);
1109: j += atomlen;
1110: buf[j++] = '?';
1111: }
1112: }
1113: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1114: j += suffix_length;
1115: buf[j] = '\0';
1.23 millert 1116: /* free old basestr */
1117: if (firstbasestr != basestr) {
1118: if (basestr)
1119: xfree(basestr);
1120: }
1121: basestr = buf;
1122: prestr = buf + prefix_length;
1123: if (special_case == REPEAT_ZERO) {
1124: prestr -= atomlen;
1125: ret++;
1126: }
1127: return ret;
1128: }
1129:
1130: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1131: int atomlen, int firstnum, int secondnum)
1132: {
1133: /*
1134: In general, the repetition specifier or "bound" is replaced here
1135: by an equivalent ERE string, repeating the immediately previous atom
1136: and appending ? and + as needed. Note that the first copy of the
1137: atom is left in place, except in the special_case of a zero-repeat
1138: (i.e., {0}).
1139: */
1140: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1141: if (firstnum < 2) {
1142: /* 0 or 1: should be handled before you get here */
1143: FATAL("internal error");
1144: } else {
1145: return replace_repeat(reptok, reptoklen, atom, atomlen,
1146: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1147: }
1148: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1149: if (firstnum == 0) { /* {0} or {0,0} */
1150: /* This case is unusual because the resulting
1151: replacement string might actually be SMALLER than
1152: the original ERE */
1153: return replace_repeat(reptok, reptoklen, atom, atomlen,
1154: firstnum, secondnum, REPEAT_ZERO);
1155: } else { /* (firstnum >= 1) */
1156: return replace_repeat(reptok, reptoklen, atom, atomlen,
1157: firstnum, secondnum, REPEAT_SIMPLE);
1158: }
1159: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1160: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1161: return replace_repeat(reptok, reptoklen, atom, atomlen,
1162: firstnum, secondnum, REPEAT_WITH_Q);
1163: } else { /* Error - shouldn't be here (n>m) */
1164: FATAL("internal error");
1165: }
1166: return 0;
1167: }
1.11 millert 1168:
1.38 millert 1169: extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1170:
1.1 tholo 1171: int relex(void) /* lexical analyzer for reparse */
1172: {
1.5 kstailey 1173: int c, n;
1.1 tholo 1174: int cflag;
1.25 millert 1175: static uschar *buf = NULL;
1.5 kstailey 1176: static int bufsz = 100;
1.10 millert 1177: uschar *bp;
1.31 millert 1178: const struct charclass *cc;
1.12 millert 1179: int i;
1.29 millert 1180: int num, m;
1181: bool commafound, digitfound;
1.23 millert 1182: const uschar *startreptok;
1.24 millert 1183: static int parens = 0;
1.23 millert 1184:
1185: rescan:
1186: starttok = prestr;
1.1 tholo 1187:
1.38 millert 1188: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1189: prestr += n;
1190: starttok = prestr;
1191: return CHAR;
1192: }
1193:
1.1 tholo 1194: switch (c = *prestr++) {
1195: case '|': return OR;
1196: case '*': return STAR;
1197: case '+': return PLUS;
1198: case '?': return QUEST;
1199: case '.': return DOT;
1200: case '\0': prestr--; return '\0';
1201: case '^':
1202: case '$':
1.24 millert 1203: return c;
1.1 tholo 1204: case '(':
1.24 millert 1205: parens++;
1206: return c;
1.1 tholo 1207: case ')':
1.24 millert 1208: if (parens) {
1209: parens--;
1210: return c;
1211: }
1212: /* unmatched close parenthesis; per POSIX, treat as literal */
1213: rlxval = c;
1214: return CHAR;
1.1 tholo 1215: case '\\':
1.17 millert 1216: rlxval = quoted(&prestr);
1.1 tholo 1217: return CHAR;
1218: default:
1219: rlxval = c;
1220: return CHAR;
1.25 millert 1221: case '[':
1.35 millert 1222: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1223: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1224: bp = buf;
1.1 tholo 1225: if (*prestr == '^') {
1226: cflag = 1;
1227: prestr++;
1228: }
1229: else
1230: cflag = 0;
1.38 millert 1231: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1232: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1233: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1234: for (; ; ) {
1.38 millert 1235: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1236: for (i = 0; i < n; i++)
1237: *bp++ = *prestr++;
1238: continue;
1239: }
1.1 tholo 1240: if ((c = *prestr++) == '\\') {
1.5 kstailey 1241: *bp++ = '\\';
1.1 tholo 1242: if ((c = *prestr++) == '\0')
1.8 millert 1243: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1244: *bp++ = c;
1.10 millert 1245: /* } else if (c == '\n') { */
1246: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1247: } else if (c == '[' && *prestr == ':') {
1248: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1249: for (cc = charclasses; cc->cc_name; cc++)
1250: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1251: break;
1252: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1253: prestr[2 + cc->cc_namelen] == ']') {
1254: prestr += cc->cc_namelen + 3;
1.22 millert 1255: /*
1256: * BUG: We begin at 1, instead of 0, since we
1257: * would otherwise prematurely terminate the
1258: * string for classes like [[:cntrl:]]. This
1259: * means that we can't match the NUL character,
1260: * not without first adapting the entire
1261: * program to track each string's length.
1262: */
1.23 millert 1263: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1264: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1265: FATAL("out of space for reg expr %.10s...", lastre);
1266: if (cc->cc_func(i)) {
1.32 millert 1267: /* escape backslash */
1268: if (i == '\\') {
1269: *bp++ = '\\';
1270: n++;
1271: }
1272:
1.12 millert 1273: *bp++ = i;
1274: n++;
1275: }
1276: }
1.11 millert 1277: } else
1278: *bp++ = c;
1.23 millert 1279: } else if (c == '[' && *prestr == '.') {
1280: char collate_char;
1281: prestr++;
1282: collate_char = *prestr++;
1283: if (*prestr == '.' && prestr[1] == ']') {
1284: prestr += 2;
1285: /* Found it: map via locale TBD: for
1286: now, simply return this char. This
1287: is sufficient to pass conformance
1288: test awk.ex 156
1289: */
1290: if (*prestr == ']') {
1291: prestr++;
1292: rlxval = collate_char;
1293: return CHAR;
1294: }
1295: }
1296: } else if (c == '[' && *prestr == '=') {
1297: char equiv_char;
1298: prestr++;
1299: equiv_char = *prestr++;
1300: if (*prestr == '=' && prestr[1] == ']') {
1301: prestr += 2;
1302: /* Found it: map via locale TBD: for now
1303: simply return this char. This is
1304: sufficient to pass conformance test
1305: awk.ex 156
1306: */
1307: if (*prestr == ']') {
1308: prestr++;
1309: rlxval = equiv_char;
1310: return CHAR;
1311: }
1312: }
1.1 tholo 1313: } else if (c == '\0') {
1.8 millert 1314: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1315: } else if (bp == buf) { /* 1st char is special */
1316: *bp++ = c;
1.1 tholo 1317: } else if (c == ']') {
1.5 kstailey 1318: *bp++ = 0;
1.10 millert 1319: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1320: if (cflag == 0)
1321: return CCL;
1322: else
1323: return NCCL;
1324: } else
1.5 kstailey 1325: *bp++ = c;
1.1 tholo 1326: }
1.23 millert 1327: break;
1328: case '{':
1329: if (isdigit(*(prestr))) {
1330: num = 0; /* Process as a repetition */
1331: n = -1; m = -1;
1.29 millert 1332: commafound = false;
1333: digitfound = false;
1.23 millert 1334: startreptok = prestr-1;
1335: /* Remember start of previous atom here ? */
1336: } else { /* just a { char, not a repetition */
1337: rlxval = c;
1338: return CHAR;
1339: }
1340: for (; ; ) {
1341: if ((c = *prestr++) == '}') {
1342: if (commafound) {
1343: if (digitfound) { /* {n,m} */
1344: m = num;
1.30 millert 1345: if (m < n)
1.23 millert 1346: FATAL("illegal repetition expression: class %.20s",
1347: lastre);
1.30 millert 1348: if (n == 0 && m == 1) {
1.23 millert 1349: return QUEST;
1350: }
1351: } else { /* {n,} */
1.30 millert 1352: if (n == 0)
1353: return STAR;
1354: else if (n == 1)
1355: return PLUS;
1.23 millert 1356: }
1357: } else {
1358: if (digitfound) { /* {n} same as {n,n} */
1359: n = num;
1360: m = num;
1361: } else { /* {} */
1362: FATAL("illegal repetition expression: class %.20s",
1363: lastre);
1364: }
1365: }
1366: if (repeat(starttok, prestr-starttok, lastatom,
1367: startreptok - lastatom, n, m) > 0) {
1.30 millert 1368: if (n == 0 && m == 0) {
1.31 millert 1369: return ZERO;
1.23 millert 1370: }
1371: /* must rescan input for next token */
1372: goto rescan;
1373: }
1374: /* Failed to replace: eat up {...} characters
1375: and treat like just PLUS */
1376: return PLUS;
1377: } else if (c == '\0') {
1378: FATAL("nonterminated character class %.20s",
1379: lastre);
1380: } else if (isdigit(c)) {
1381: num = 10 * num + c - '0';
1.29 millert 1382: digitfound = true;
1.23 millert 1383: } else if (c == ',') {
1384: if (commafound)
1385: FATAL("illegal repetition expression: class %.20s",
1386: lastre);
1387: /* looking for {n,} or {n,m} */
1.29 millert 1388: commafound = true;
1.23 millert 1389: n = num;
1.29 millert 1390: digitfound = false; /* reset */
1.23 millert 1391: num = 0;
1392: } else {
1393: FATAL("illegal repetition expression: class %.20s",
1394: lastre);
1395: }
1396: }
1397: break;
1.1 tholo 1398: }
1399: }
1400:
1401: int cgoto(fa *f, int s, int c)
1402: {
1.27 millert 1403: int *p, *q;
1.1 tholo 1404: int i, j, k;
1405:
1.38 millert 1406: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1407: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1408: resizesetvec(__func__);
1.1 tholo 1409: }
1410: for (i = 0; i <= f->accept; i++)
1411: setvec[i] = 0;
1412: setcnt = 0;
1.27 millert 1413: resize_state(f, s);
1.1 tholo 1414: /* compute positions of gototab[s,c] into setvec */
1415: p = f->posns[s];
1416: for (i = 1; i <= *p; i++) {
1417: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1418: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1419: || (k == DOT && c != 0 && c != HAT)
1420: || (k == ALL && c != 0)
1.15 millert 1421: || (k == EMPTYRE && c != 0)
1.38 millert 1422: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1423: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1424: q = f->re[p[i]].lfollow;
1425: for (j = 1; j <= *q; j++) {
1426: if (q[j] >= maxsetvec) {
1.27 millert 1427: resizesetvec(__func__);
1.1 tholo 1428: }
1429: if (setvec[q[j]] == 0) {
1430: setcnt++;
1431: setvec[q[j]] = 1;
1432: }
1433: }
1434: }
1435: }
1436: }
1437: /* determine if setvec is a previous state */
1438: tmpset[0] = setcnt;
1439: j = 1;
1440: for (i = f->accept; i >= 0; i--)
1441: if (setvec[i]) {
1442: tmpset[j++] = i;
1443: }
1.27 millert 1444: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1445: /* tmpset == previous state? */
1446: for (i = 1; i <= f->curstat; i++) {
1447: p = f->posns[i];
1448: if ((k = tmpset[0]) != p[0])
1449: goto different;
1450: for (j = 1; j <= k; j++)
1451: if (tmpset[j] != p[j])
1452: goto different;
1453: /* setvec is state i */
1.30 millert 1454: if (c != HAT)
1.38 millert 1455: set_gototab(f, s, c, i);
1.1 tholo 1456: return i;
1457: different:;
1458: }
1459:
1460: /* add tmpset to current set of states */
1.27 millert 1461: ++(f->curstat);
1462: resize_state(f, f->curstat);
1.1 tholo 1463: xfree(f->posns[f->curstat]);
1.27 millert 1464: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1465:
1466: f->posns[f->curstat] = p;
1.30 millert 1467: if (c != HAT)
1.38 millert 1468: set_gototab(f, s, c, f->curstat);
1.1 tholo 1469: for (i = 0; i <= setcnt; i++)
1470: p[i] = tmpset[i];
1471: if (setvec[f->accept])
1472: f->out[f->curstat] = 1;
1473: else
1474: f->out[f->curstat] = 0;
1475: return f->curstat;
1476: }
1477:
1478:
1479: void freefa(fa *f) /* free a finite automaton */
1480: {
1481: int i;
1482:
1483: if (f == NULL)
1484: return;
1.27 millert 1485: for (i = 0; i < f->state_count; i++)
1486: xfree(f->gototab[i])
1.1 tholo 1487: for (i = 0; i <= f->curstat; i++)
1488: xfree(f->posns[i]);
1489: for (i = 0; i <= f->accept; i++) {
1490: xfree(f->re[i].lfollow);
1491: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1492: xfree(f->re[i].lval.np);
1.1 tholo 1493: }
1494: xfree(f->restr);
1.27 millert 1495: xfree(f->out);
1496: xfree(f->posns);
1497: xfree(f->gototab);
1.1 tholo 1498: xfree(f);
1499: }