Annotation of src/usr.bin/awk/b.c, Revision 1.49
1.49 ! millert 1: /* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.43 millert 84: extern int u8_nextlen(const char *s);
85:
1.38 millert 86:
87: /* utf-8 mechanism:
88:
89: For most of Awk, utf-8 strings just "work", since they look like
90: null-terminated sequences of 8-bit bytes.
91:
92: Functions like length(), index(), and substr() have to operate
93: in units of utf-8 characters. The u8_* functions in run.c
94: handle this.
95:
96: Regular expressions are more complicated, since the basic
97: mechanism of the goto table used 8-bit byte indices into the
98: gototab entries to compute the next state. Unicode is a lot
99: bigger, so the gototab entries are now structs with a character
1.49 ! millert 100: and a next state. These are sorted by code point and binary
! 101: searched.
1.38 millert 102:
103: Throughout the RE mechanism in b.c, utf-8 characters are
104: converted to their utf-32 value. This mostly shows up in
105: cclenter, which expands character class ranges like a-z and now
106: alpha-omega. The size of a gototab array is still about 256.
107: This should be dynamic, but for now things work ok for a single
108: code page of Unicode, which is the most likely case.
109:
110: The code changes are localized in run.c and b.c. I have added a
111: handful of functions to somewhat better hide the implementation,
112: but a lot more could be done.
113:
114: */
115:
1.49 ! millert 116: static int entry_cmp(const void *l, const void *r);
1.38 millert 117: static int get_gototab(fa*, int, int);
118: static int set_gototab(fa*, int, int, int);
1.49 ! millert 119: static void clear_gototab(fa*, int);
1.38 millert 120: extern int u8_rune(int *, const uschar *);
121:
1.27 millert 122: static int *
123: intalloc(size_t n, const char *f)
124: {
1.35 millert 125: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 126: if (p == NULL)
127: overflo(f);
128: return p;
129: }
130:
131: static void
132: allocsetvec(const char *f)
133: {
134: maxsetvec = MAXLIN;
1.35 millert 135: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
136: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 137: if (setvec == NULL || tmpset == NULL)
138: overflo(f);
139: }
140:
141: static void
142: resizesetvec(const char *f)
143: {
1.35 millert 144: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
145: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 146: if (setvec == NULL || tmpset == NULL)
147: overflo(f);
148: maxsetvec *= 4;
149: }
150:
151: static void
152: resize_state(fa *f, int state)
153: {
1.49 ! millert 154: gtt *p;
1.35 millert 155: uschar *p2;
156: int **p3;
1.27 millert 157: int i, new_count;
158:
159: if (++state < f->state_count)
160: return;
161:
162: new_count = state + 10; /* needs to be tuned */
163:
1.49 ! millert 164: p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
1.27 millert 165: if (p == NULL)
166: goto out;
167: f->gototab = p;
168:
1.35 millert 169: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
170: if (p2 == NULL)
1.27 millert 171: goto out;
1.35 millert 172: f->out = p2;
1.27 millert 173:
1.35 millert 174: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
175: if (p3 == NULL)
1.27 millert 176: goto out;
1.35 millert 177: f->posns = p3;
1.27 millert 178:
179: for (i = f->state_count; i < new_count; ++i) {
1.49 ! millert 180: f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
! 181: if (f->gototab[i].entries == NULL)
1.27 millert 182: goto out;
1.49 ! millert 183: f->gototab[i].allocated = NCHARS;
! 184: f->gototab[i].inuse = 0;
! 185: f->out[i] = 0;
1.27 millert 186: f->posns[i] = NULL;
187: }
188: f->state_count = new_count;
189: return;
190: out:
191: overflo(__func__);
192: }
193:
1.29 millert 194: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 195: {
196: int i, use, nuse;
197: fa *pfa;
1.6 millert 198: static int now = 1;
1.1 tholo 199:
1.25 millert 200: if (setvec == NULL) { /* first time through any RE */
1.27 millert 201: allocsetvec(__func__);
1.1 tholo 202: }
203:
1.29 millert 204: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 205: return mkdfa(s, anchor);
206: for (i = 0; i < nfatab; i++) /* is it there already? */
207: if (fatab[i]->anchor == anchor
1.11 millert 208: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 209: fatab[i]->use = now++;
1.1 tholo 210: return fatab[i];
1.10 millert 211: }
1.1 tholo 212: pfa = mkdfa(s, anchor);
213: if (nfatab < NFA) { /* room for another */
214: fatab[nfatab] = pfa;
1.6 millert 215: fatab[nfatab]->use = now++;
1.1 tholo 216: nfatab++;
217: return pfa;
218: }
219: use = fatab[0]->use; /* replace least-recently used */
220: nuse = 0;
221: for (i = 1; i < nfatab; i++)
222: if (fatab[i]->use < use) {
223: use = fatab[i]->use;
224: nuse = i;
225: }
226: freefa(fatab[nuse]);
227: fatab[nuse] = pfa;
1.6 millert 228: pfa->use = now++;
1.1 tholo 229: return pfa;
230: }
231:
1.29 millert 232: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 233: /* anchor = true for anchored matches, else false */
1.1 tholo 234: {
235: Node *p, *p1;
236: fa *f;
237:
1.28 millert 238: firstbasestr = (const uschar *) s;
1.23 millert 239: basestr = firstbasestr;
1.1 tholo 240: p = reparse(s);
241: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
242: /* put ALL STAR in front of reg. exp. */
243: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
244: /* put FINAL after reg. exp. */
245:
246: poscnt = 0;
247: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 248: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 249: overflo(__func__);
1.1 tholo 250: f->accept = poscnt-1; /* penter has computed number of positions in re */
251: cfoll(f, p1); /* set up follow sets */
252: freetr(p1);
1.27 millert 253: resize_state(f, 1);
254: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
255: f->posns[1] = intalloc(1, __func__);
1.1 tholo 256: *f->posns[1] = 0;
257: f->initstat = makeinit(f, anchor);
258: f->anchor = anchor;
1.10 millert 259: f->restr = (uschar *) tostring(s);
1.23 millert 260: if (firstbasestr != basestr) {
261: if (basestr)
262: xfree(basestr);
263: }
1.1 tholo 264: return f;
265: }
266:
1.29 millert 267: int makeinit(fa *f, bool anchor)
1.1 tholo 268: {
269: int i, k;
270:
271: f->curstat = 2;
272: f->out[2] = 0;
273: k = *(f->re[0].lfollow);
1.25 millert 274: xfree(f->posns[2]);
1.27 millert 275: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 276: for (i = 0; i <= k; i++) {
1.1 tholo 277: (f->posns[2])[i] = (f->re[0].lfollow)[i];
278: }
279: if ((f->posns[2])[1] == f->accept)
280: f->out[2] = 1;
1.49 ! millert 281: clear_gototab(f, 2);
1.1 tholo 282: f->curstat = cgoto(f, 2, HAT);
283: if (anchor) {
284: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 285: for (i = 0; i < k; i++) {
1.1 tholo 286: (f->posns[0])[i] = (f->posns[2])[i];
287: }
288:
289: f->out[0] = f->out[2];
290: if (f->curstat != 2)
291: --(*f->posns[f->curstat]);
292: }
293: return f->curstat;
294: }
295:
296: void penter(Node *p) /* set up parent pointers and leaf indices */
297: {
298: switch (type(p)) {
1.15 millert 299: ELEAF
1.1 tholo 300: LEAF
1.7 millert 301: info(p) = poscnt;
1.1 tholo 302: poscnt++;
303: break;
304: UNARY
305: penter(left(p));
306: parent(left(p)) = p;
307: break;
308: case CAT:
309: case OR:
310: penter(left(p));
311: penter(right(p));
312: parent(left(p)) = p;
313: parent(right(p)) = p;
314: break;
1.31 millert 315: case ZERO:
316: break;
1.1 tholo 317: default: /* can't happen */
1.8 millert 318: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 319: break;
320: }
321: }
322:
323: void freetr(Node *p) /* free parse tree */
324: {
325: switch (type(p)) {
1.15 millert 326: ELEAF
1.1 tholo 327: LEAF
328: xfree(p);
329: break;
330: UNARY
1.31 millert 331: case ZERO:
1.1 tholo 332: freetr(left(p));
333: xfree(p);
334: break;
335: case CAT:
336: case OR:
337: freetr(left(p));
338: freetr(right(p));
339: xfree(p);
340: break;
341: default: /* can't happen */
1.8 millert 342: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 343: break;
344: }
345: }
346:
347: /* in the parsing of regular expressions, metacharacters like . have */
348: /* to be seen literally; \056 is not a metacharacter. */
349:
1.38 millert 350: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 351: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 352: const uschar *p;
1.1 tholo 353: int n = 0;
1.2 millert 354: int i;
1.1 tholo 355:
1.38 millert 356: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 357: if (isdigit(*p))
358: n = 16 * n + *p - '0';
359: else if (*p >= 'a' && *p <= 'f')
360: n = 16 * n + *p - 'a' + 10;
361: else if (*p >= 'A' && *p <= 'F')
362: n = 16 * n + *p - 'A' + 10;
363: }
1.28 millert 364: *pp = p;
1.1 tholo 365: return n;
366: }
367:
1.2 millert 368: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 369:
1.28 millert 370: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 371: /* and increment *pp */
372: {
1.28 millert 373: const uschar *p = *pp;
1.1 tholo 374: int c;
375:
1.38 millert 376: /* BUG: should advance by utf-8 char even if makes no sense */
377:
378: if ((c = *p++) == 't') {
1.1 tholo 379: c = '\t';
1.38 millert 380: } else if (c == 'n') {
1.1 tholo 381: c = '\n';
1.38 millert 382: } else if (c == 'f') {
1.1 tholo 383: c = '\f';
1.38 millert 384: } else if (c == 'r') {
1.1 tholo 385: c = '\r';
1.38 millert 386: } else if (c == 'b') {
1.1 tholo 387: c = '\b';
1.38 millert 388: } else if (c == 'v') {
1.25 millert 389: c = '\v';
1.38 millert 390: } else if (c == 'a') {
1.25 millert 391: c = '\a';
1.38 millert 392: } else if (c == '\\') {
1.1 tholo 393: c = '\\';
1.38 millert 394: } else if (c == 'x') { /* 2 hex digits follow */
395: c = hexstr(&p, 2); /* this adds a null if number is invalid */
396: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
397: c = hexstr(&p, 8);
1.1 tholo 398: } else if (isoctdigit(c)) { /* \d \dd \ddd */
399: int n = c - '0';
400: if (isoctdigit(*p)) {
401: n = 8 * n + *p++ - '0';
402: if (isoctdigit(*p))
403: n = 8 * n + *p++ - '0';
404: }
405: c = n;
406: } /* else */
407: /* c = c; */
408: *pp = p;
409: return c;
410: }
411:
1.38 millert 412: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 413: {
414: int i, c, c2;
1.38 millert 415: int n;
416: const uschar *p = (const uschar *) argp;
417: int *bp, *retp;
418: static int *buf = NULL;
1.5 kstailey 419: static int bufsz = 100;
1.1 tholo 420:
1.38 millert 421: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 422: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 423: bp = buf;
1.38 millert 424: for (i = 0; *p != 0; ) {
425: n = u8_rune(&c, p);
426: p += n;
1.1 tholo 427: if (c == '\\') {
1.17 millert 428: c = quoted(&p);
1.5 kstailey 429: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 430: if (*p != 0) {
1.5 kstailey 431: c = bp[-1];
1.38 millert 432: /* c2 = *p++; */
433: n = u8_rune(&c2, p);
434: p += n;
1.1 tholo 435: if (c2 == '\\')
1.38 millert 436: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 437: if (c > c2) { /* empty; ignore */
1.5 kstailey 438: bp--;
1.1 tholo 439: i--;
440: continue;
441: }
442: while (c < c2) {
1.38 millert 443: if (i >= bufsz) {
1.45 millert 444: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 445: if (buf == NULL)
446: FATAL("out of space for character class [%.10s...] 2", p);
447: bufsz *= 2;
448: bp = buf + i;
449: }
1.5 kstailey 450: *bp++ = ++c;
1.1 tholo 451: i++;
452: }
453: continue;
454: }
455: }
1.38 millert 456: if (i >= bufsz) {
1.45 millert 457: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 458: if (buf == NULL)
459: FATAL("out of space for character class [%.10s...] 2", p);
460: bufsz *= 2;
461: bp = buf + i;
462: }
1.5 kstailey 463: *bp++ = c;
1.1 tholo 464: i++;
465: }
1.5 kstailey 466: *bp = 0;
1.38 millert 467: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
468: /* xfree(op); BUG: what are we freeing here? */
469: retp = (int *) calloc(bp-buf+1, sizeof(int));
470: for (i = 0; i < bp-buf+1; i++)
471: retp[i] = buf[i];
472: return retp;
1.1 tholo 473: }
474:
1.11 millert 475: void overflo(const char *s)
1.1 tholo 476: {
1.27 millert 477: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 478: }
479:
480: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
481: {
482: int i;
1.4 millert 483: int *p;
1.1 tholo 484:
485: switch (type(v)) {
1.15 millert 486: ELEAF
1.1 tholo 487: LEAF
1.7 millert 488: f->re[info(v)].ltype = type(v);
489: f->re[info(v)].lval.np = right(v);
1.1 tholo 490: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 491: resizesetvec(__func__);
1.1 tholo 492: }
493: for (i = 0; i <= f->accept; i++)
494: setvec[i] = 0;
495: setcnt = 0;
496: follow(v); /* computes setvec and setcnt */
1.27 millert 497: p = intalloc(setcnt + 1, __func__);
1.7 millert 498: f->re[info(v)].lfollow = p;
1.1 tholo 499: *p = setcnt;
500: for (i = f->accept; i >= 0; i--)
501: if (setvec[i] == 1)
502: *++p = i;
503: break;
504: UNARY
505: cfoll(f,left(v));
506: break;
507: case CAT:
508: case OR:
509: cfoll(f,left(v));
510: cfoll(f,right(v));
511: break;
1.31 millert 512: case ZERO:
513: break;
1.1 tholo 514: default: /* can't happen */
1.8 millert 515: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 516: }
517: }
518:
519: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 520: /* returns 0 if p matches empty string */
1.1 tholo 521: {
1.4 millert 522: int b, lp;
1.1 tholo 523:
524: switch (type(p)) {
1.15 millert 525: ELEAF
1.1 tholo 526: LEAF
1.7 millert 527: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 528: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 529: resizesetvec(__func__);
1.1 tholo 530: }
1.15 millert 531: if (type(p) == EMPTYRE) {
532: setvec[lp] = 0;
533: return(0);
534: }
1.1 tholo 535: if (setvec[lp] != 1) {
536: setvec[lp] = 1;
537: setcnt++;
538: }
1.41 millert 539: if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1 tholo 540: return(0); /* empty CCL */
1.30 millert 541: return(1);
1.1 tholo 542: case PLUS:
1.30 millert 543: if (first(left(p)) == 0)
544: return(0);
1.1 tholo 545: return(1);
546: case STAR:
547: case QUEST:
548: first(left(p));
549: return(0);
550: case CAT:
551: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
552: return(1);
553: case OR:
554: b = first(right(p));
555: if (first(left(p)) == 0 || b == 0) return(0);
556: return(1);
1.31 millert 557: case ZERO:
558: return 0;
1.1 tholo 559: }
1.8 millert 560: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 561: return(-1);
562: }
563:
564: void follow(Node *v) /* collects leaves that can follow v into setvec */
565: {
566: Node *p;
567:
568: if (type(v) == FINAL)
569: return;
570: p = parent(v);
571: switch (type(p)) {
572: case STAR:
573: case PLUS:
574: first(v);
575: follow(p);
576: return;
577:
578: case OR:
579: case QUEST:
580: follow(p);
581: return;
582:
583: case CAT:
584: if (v == left(p)) { /* v is left child of p */
585: if (first(right(p)) == 0) {
586: follow(p);
587: return;
588: }
589: } else /* v is right child */
590: follow(p);
591: return;
592: }
593: }
594:
1.38 millert 595: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 596: {
1.38 millert 597: int *s = (int *) sarg;
1.10 millert 598:
1.1 tholo 599: while (*s)
600: if (c == *s++)
601: return(1);
602: return(0);
603: }
604:
1.49 ! millert 605: static void resize_gototab(fa *f, int state)
! 606: {
! 607: size_t new_size = f->gototab[state].allocated * 2;
! 608: gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
! 609: if (p == NULL)
! 610: overflo(__func__);
! 611:
! 612: // need to initialized the new memory to zero
! 613: size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
! 614: memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
! 615:
! 616: f->gototab[state].allocated = new_size; // update gotottab info
! 617: f->gototab[state].entries = p;
! 618: }
! 619:
1.38 millert 620: static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
621: {
1.49 ! millert 622: gtte key;
! 623: gtte *item;
! 624:
! 625: key.ch = ch;
! 626: key.state = 0; /* irrelevant */
! 627: item = bsearch(& key, f->gototab[state].entries,
! 628: f->gototab[state].inuse, sizeof(gtte),
! 629: entry_cmp);
! 630:
! 631: if (item == NULL)
! 632: return 0;
! 633: else
! 634: return item->state;
1.38 millert 635: }
636:
1.49 ! millert 637: static int entry_cmp(const void *l, const void *r)
1.44 millert 638: {
1.49 ! millert 639: const gtte *left, *right;
! 640:
! 641: left = (const gtte *) l;
! 642: right = (const gtte *) r;
! 643:
! 644: return left->ch - right->ch;
1.44 millert 645: }
646:
1.38 millert 647: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
648: {
1.49 ! millert 649: if (f->gototab[state].inuse == 0) {
! 650: f->gototab[state].entries[0].ch = ch;
! 651: f->gototab[state].entries[0].state = val;
! 652: f->gototab[state].inuse++;
! 653: return val;
! 654: } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
! 655: // not seen yet, insert and return
! 656: gtt *tab = & f->gototab[state];
! 657: if (tab->inuse + 1 >= tab->allocated)
! 658: resize_gototab(f, state);
! 659:
! 660: f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
! 661: f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
! 662: f->gototab[state].inuse++;
! 663: return val;
! 664: } else {
! 665: // maybe we have it, maybe we don't
! 666: gtte key;
! 667: gtte *item;
! 668:
! 669: key.ch = ch;
! 670: key.state = 0; /* irrelevant */
! 671: item = bsearch(& key, f->gototab[state].entries,
! 672: f->gototab[state].inuse, sizeof(gtte),
! 673: entry_cmp);
! 674:
! 675: if (item != NULL) {
! 676: // we have it, update state and return
! 677: item->state = val;
! 678: return item->state;
! 679: }
! 680: // otherwise, fall through to insert and reallocate.
! 681: }
! 682:
! 683: gtt *tab = & f->gototab[state];
! 684: if (tab->inuse + 1 >= tab->allocated)
! 685: resize_gototab(f, state);
! 686: ++tab->inuse;
! 687: f->gototab[state].entries[tab->inuse].ch = ch;
! 688: f->gototab[state].entries[tab->inuse].state = val;
! 689:
! 690: qsort(f->gototab[state].entries,
! 691: f->gototab[state].inuse, sizeof(gtte), entry_cmp);
! 692:
1.38 millert 693: return val; /* not used anywhere at the moment */
694: }
695:
1.49 ! millert 696: static void clear_gototab(fa *f, int state)
! 697: {
! 698: memset(f->gototab[state].entries, 0,
! 699: f->gototab[state].allocated * sizeof(gtte));
! 700: f->gototab[state].inuse = 0;
! 701: }
! 702:
1.11 millert 703: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 704: {
705: int s, ns;
1.38 millert 706: int n;
707: int rune;
1.28 millert 708: const uschar *p = (const uschar *) p0;
1.1 tholo 709:
1.38 millert 710: /* return pmatch(f, p0); does it matter whether longest or shortest? */
711:
1.27 millert 712: s = f->initstat;
713: assert (s < f->state_count);
714:
1.1 tholo 715: if (f->out[s])
716: return(1);
717: do {
1.15 millert 718: /* assert(*p < NCHARS); */
1.38 millert 719: n = u8_rune(&rune, p);
720: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 721: s = ns;
722: else
1.38 millert 723: s = cgoto(f, s, rune);
1.1 tholo 724: if (f->out[s])
725: return(1);
1.38 millert 726: if (*p == 0)
727: break;
728: p += n;
729: } while (1); /* was *p++ != 0 */
1.1 tholo 730: return(0);
731: }
732:
1.11 millert 733: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 734: {
735: int s, ns;
1.38 millert 736: int n;
737: int rune;
1.28 millert 738: const uschar *p = (const uschar *) p0;
739: const uschar *q;
1.1 tholo 740:
1.27 millert 741: s = f->initstat;
742: assert(s < f->state_count);
743:
1.28 millert 744: patbeg = (const char *)p;
1.1 tholo 745: patlen = -1;
746: do {
747: q = p;
748: do {
749: if (f->out[s]) /* final state */
750: patlen = q-p;
1.15 millert 751: /* assert(*q < NCHARS); */
1.38 millert 752: n = u8_rune(&rune, q);
753: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 754: s = ns;
755: else
1.38 millert 756: s = cgoto(f, s, rune);
1.27 millert 757:
758: assert(s < f->state_count);
759:
1.9 deraadt 760: if (s == 1) { /* no transition */
1.1 tholo 761: if (patlen >= 0) {
1.28 millert 762: patbeg = (const char *) p;
1.1 tholo 763: return(1);
764: }
765: else
766: goto nextin; /* no match */
1.9 deraadt 767: }
1.38 millert 768: if (*q == 0)
769: break;
770: q += n;
771: } while (1);
772: q++; /* was *q++ */
1.1 tholo 773: if (f->out[s])
774: patlen = q-p-1; /* don't count $ */
775: if (patlen >= 0) {
1.28 millert 776: patbeg = (const char *) p;
1.1 tholo 777: return(1);
778: }
779: nextin:
780: s = 2;
1.38 millert 781: if (*p == 0)
782: break;
783: n = u8_rune(&rune, p);
784: p += n;
785: } while (1); /* was *p++ */
1.1 tholo 786: return (0);
787: }
788:
1.11 millert 789: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 790: {
791: int s, ns;
1.38 millert 792: int n;
793: int rune;
1.28 millert 794: const uschar *p = (const uschar *) p0;
795: const uschar *q;
1.1 tholo 796:
1.27 millert 797: s = f->initstat;
798: assert(s < f->state_count);
799:
1.28 millert 800: patbeg = (const char *)p;
1.1 tholo 801: patlen = -1;
802: while (*p) {
803: q = p;
804: do {
805: if (f->out[s]) /* final state */
806: patlen = q-p;
1.15 millert 807: /* assert(*q < NCHARS); */
1.38 millert 808: n = u8_rune(&rune, q);
809: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 810: s = ns;
811: else
1.38 millert 812: s = cgoto(f, s, rune);
1.9 deraadt 813: if (s == 1) { /* no transition */
1.1 tholo 814: if (patlen > 0) {
1.28 millert 815: patbeg = (const char *) p;
1.1 tholo 816: return(1);
817: } else
818: goto nnextin; /* no nonempty match */
1.9 deraadt 819: }
1.38 millert 820: if (*q == 0)
821: break;
822: q += n;
823: } while (1);
824: q++;
1.1 tholo 825: if (f->out[s])
826: patlen = q-p-1; /* don't count $ */
827: if (patlen > 0 ) {
1.28 millert 828: patbeg = (const char *) p;
1.1 tholo 829: return(1);
830: }
831: nnextin:
832: s = 2;
833: p++;
834: }
835: return (0);
836: }
837:
1.43 millert 838:
839: #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
840:
1.26 millert 841: /*
842: * NAME
843: * fnematch
844: *
845: * DESCRIPTION
846: * A stream-fed version of nematch which transfers characters to a
847: * null-terminated buffer. All characters up to and including the last
848: * character of the matching text or EOF are placed in the buffer. If
849: * a match is found, patbeg and patlen are set appropriately.
850: *
851: * RETURN VALUES
1.29 millert 852: * false No match found.
853: * true Match found.
1.26 millert 854: */
855:
1.29 millert 856: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 857: {
1.48 millert 858: char *i, *j, *k, *buf = *pbuf;
1.26 millert 859: int bufsize = *pbufsize;
1.48 millert 860: int c, n, ns, s;
1.26 millert 861:
862: s = pfa->initstat;
863: patlen = 0;
864:
865: /*
1.48 millert 866: * buf <= i <= j <= k <= buf+bufsize
1.26 millert 867: *
1.48 millert 868: * i: origin of active substring
869: * j: current character
870: * k: destination of the next getc
1.26 millert 871: */
1.48 millert 872:
873: i = j = k = buf;
874:
875: do {
876: /*
877: * Call u8_rune with at least MAX_UTF_BYTES ahead in
878: * the buffer until EOF interferes.
879: */
880: if (k - j < MAX_UTF_BYTES) {
881: if (k + MAX_UTF_BYTES > buf + bufsize) {
882: adjbuf(&buf, &bufsize,
883: bufsize + MAX_UTF_BYTES,
884: quantum, 0, "fnematch");
1.46 millert 885: }
1.48 millert 886: for (n = MAX_UTF_BYTES ; n > 0; n--) {
887: *k++ = (c = getc(f)) != EOF ? c : 0;
888: if (c == EOF) {
889: if (ferror(f))
890: FATAL("fnematch: getc error");
891: break;
892: }
1.38 millert 893: }
1.48 millert 894: }
1.26 millert 895:
1.48 millert 896: j += u8_rune(&c, (uschar *)j);
897:
898: if ((ns = get_gototab(pfa, s, c)) != 0)
899: s = ns;
900: else
901: s = cgoto(pfa, s, c);
1.26 millert 902:
1.48 millert 903: if (pfa->out[s]) { /* final state */
904: patbeg = i;
905: patlen = j - i;
906: if (c == 0) /* don't count $ */
907: patlen--;
908: }
909:
910: if (c && s != 1)
911: continue; /* origin i still viable, next j */
912: if (patlen)
913: break; /* best match found */
914:
915: /* no match at origin i, next i and start over */
916: i += u8_rune(&c, (uschar *)i);
917: if (c == 0)
918: break; /* no match */
919: j = i;
1.26 millert 920: s = 2;
1.48 millert 921: } while (1);
1.26 millert 922:
923: /* adjbuf() may have relocated a resized buffer. Inform the world. */
924: *pbuf = buf;
925: *pbufsize = bufsize;
926:
927: if (patlen) {
928: /*
929: * Under no circumstances is the last character fed to
930: * the automaton part of the match. It is EOF's nullbyte,
931: * or it sent the automaton into a state with no further
932: * transitions available (s==1), or both. Room for a
933: * terminating nullbyte is guaranteed.
934: *
935: * ungetc any chars after the end of matching text
936: * (except for EOF's nullbyte, if present) and null
937: * terminate the buffer.
938: */
1.48 millert 939: do
940: if (*--k && ungetc(*k, f) == EOF)
941: FATAL("unable to ungetc '%c'", *k);
942: while (k > patbeg + patlen);
943: *k = '\0';
1.29 millert 944: return true;
1.26 millert 945: }
946: else
1.29 millert 947: return false;
1.26 millert 948: }
949:
1.11 millert 950: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 951: { /* uses relex() to scan regular expression */
952: Node *np;
953:
1.32 millert 954: DPRINTF("reparse <%s>\n", p);
1.28 millert 955: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 956: rtok = relex();
1.11 millert 957: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 958: if (rtok == '\0') {
1.11 millert 959: /* FATAL("empty regular expression"); previous */
1.15 millert 960: return(op2(EMPTYRE, NIL, NIL));
961: }
1.1 tholo 962: np = regexp();
963: if (rtok != '\0')
1.8 millert 964: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 965: return(np);
966: }
967:
968: Node *regexp(void) /* top-level parse of reg expr */
969: {
970: return (alt(concat(primary())));
971: }
972:
973: Node *primary(void)
974: {
975: Node *np;
1.23 millert 976: int savelastatom;
1.1 tholo 977:
978: switch (rtok) {
979: case CHAR:
1.23 millert 980: lastatom = starttok;
1.7 millert 981: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 982: rtok = relex();
983: return (unary(np));
984: case ALL:
985: rtok = relex();
986: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 987: case EMPTYRE:
988: rtok = relex();
1.23 millert 989: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 990: case DOT:
1.23 millert 991: lastatom = starttok;
1.1 tholo 992: rtok = relex();
993: return (unary(op2(DOT, NIL, NIL)));
994: case CCL:
1.28 millert 995: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 996: lastatom = starttok;
1.1 tholo 997: rtok = relex();
998: return (unary(np));
999: case NCCL:
1.28 millert 1000: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 1001: lastatom = starttok;
1.1 tholo 1002: rtok = relex();
1003: return (unary(np));
1004: case '^':
1005: rtok = relex();
1.7 millert 1006: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 1007: case '$':
1008: rtok = relex();
1009: return (unary(op2(CHAR, NIL, NIL)));
1010: case '(':
1.23 millert 1011: lastatom = starttok;
1012: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 1013: rtok = relex();
1014: if (rtok == ')') { /* special pleading for () */
1015: rtok = relex();
1.42 millert 1016: return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1 tholo 1017: }
1018: np = regexp();
1019: if (rtok == ')') {
1.23 millert 1020: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 1021: rtok = relex();
1022: return (unary(np));
1023: }
1024: else
1.8 millert 1025: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 1026: default:
1.8 millert 1027: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 1028: }
1029: return 0; /*NOTREACHED*/
1030: }
1031:
1032: Node *concat(Node *np)
1033: {
1034: switch (rtok) {
1.23 millert 1035: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 1036: return (concat(op2(CAT, np, primary())));
1.23 millert 1037: case EMPTYRE:
1038: rtok = relex();
1.42 millert 1039: return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23 millert 1040: primary())));
1.1 tholo 1041: }
1042: return (np);
1043: }
1044:
1045: Node *alt(Node *np)
1046: {
1047: if (rtok == OR) {
1048: rtok = relex();
1049: return (alt(op2(OR, np, concat(primary()))));
1050: }
1051: return (np);
1052: }
1053:
1054: Node *unary(Node *np)
1055: {
1056: switch (rtok) {
1057: case STAR:
1058: rtok = relex();
1059: return (unary(op2(STAR, np, NIL)));
1060: case PLUS:
1061: rtok = relex();
1062: return (unary(op2(PLUS, np, NIL)));
1063: case QUEST:
1064: rtok = relex();
1065: return (unary(op2(QUEST, np, NIL)));
1.31 millert 1066: case ZERO:
1067: rtok = relex();
1068: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1069: default:
1070: return (np);
1071: }
1072: }
1073:
1.11 millert 1074: /*
1075: * Character class definitions conformant to the POSIX locale as
1076: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1077: * and operating character sets are both ASCII (ISO646) or supersets
1078: * thereof.
1079: *
1080: * Note that to avoid overflowing the temporary buffer used in
1081: * relex(), the expanded character class (prior to range expansion)
1082: * must be less than twice the size of their full name.
1083: */
1.12 millert 1084:
1085: /* Because isblank doesn't show up in any of the header files on any
1086: * system i use, it's defined here. if some other locale has a richer
1087: * definition of "blank", define HAS_ISBLANK and provide your own
1088: * version.
1089: * the parentheses here are an attempt to find a path through the maze
1090: * of macro definition and/or function and/or version provided. thanks
1091: * to nelson beebe for the suggestion; let's see if it works everywhere.
1092: */
1093:
1094: #ifndef HAS_ISBLANK
1095:
1.17 millert 1096: int (xisblank)(int c)
1.12 millert 1097: {
1098: return c==' ' || c=='\t';
1099: }
1100:
1101: #endif
1102:
1.31 millert 1103: static const struct charclass {
1.11 millert 1104: const char *cc_name;
1105: int cc_namelen;
1.12 millert 1106: int (*cc_func)(int);
1.11 millert 1107: } charclasses[] = {
1.12 millert 1108: { "alnum", 5, isalnum },
1109: { "alpha", 5, isalpha },
1.17 millert 1110: #ifndef HAS_ISBLANK
1.21 millert 1111: { "blank", 5, xisblank },
1.17 millert 1112: #else
1.12 millert 1113: { "blank", 5, isblank },
1.17 millert 1114: #endif
1.12 millert 1115: { "cntrl", 5, iscntrl },
1116: { "digit", 5, isdigit },
1117: { "graph", 5, isgraph },
1118: { "lower", 5, islower },
1119: { "print", 5, isprint },
1120: { "punct", 5, ispunct },
1121: { "space", 5, isspace },
1122: { "upper", 5, isupper },
1123: { "xdigit", 6, isxdigit },
1.11 millert 1124: { NULL, 0, NULL },
1125: };
1126:
1.23 millert 1127: #define REPEAT_SIMPLE 0
1128: #define REPEAT_PLUS_APPENDED 1
1129: #define REPEAT_WITH_Q 2
1130: #define REPEAT_ZERO 3
1131:
1132: static int
1133: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1134: int atomlen, int firstnum, int secondnum, int special_case)
1135: {
1136: int i, j;
1.26 millert 1137: uschar *buf = NULL;
1.23 millert 1138: int ret = 1;
1.31 millert 1139: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1140: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1141: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1142: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1143: int size = prefix_length + suffix_length;
1144:
1145: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1146: size += atomlen*(firstnum-1);
1147: }
1148:
1149: /* Adjust size of buffer for special cases */
1150: if (special_case == REPEAT_PLUS_APPENDED) {
1151: size++; /* for the final + */
1152: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1153: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1154: } else if (special_case == REPEAT_ZERO) {
1155: size += 2; /* just a null ERE: () */
1156: }
1.35 millert 1157: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1158: FATAL("out of space in reg expr %.10s..", lastre);
1159: memcpy(buf, basestr, prefix_length); /* copy prefix */
1160: j = prefix_length;
1161: if (special_case == REPEAT_ZERO) {
1162: j -= atomlen;
1163: buf[j++] = '(';
1164: buf[j++] = ')';
1165: }
1.30 millert 1166: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1167: memcpy(&buf[j], atom, atomlen);
1168: j += atomlen;
1169: }
1170: if (special_case == REPEAT_PLUS_APPENDED) {
1171: buf[j++] = '+';
1172: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1173: if (init_q)
1174: buf[j++] = '?';
1.30 millert 1175: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1176: memcpy(&buf[j], atom, atomlen);
1177: j += atomlen;
1178: buf[j++] = '?';
1179: }
1180: }
1181: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1182: j += suffix_length;
1183: buf[j] = '\0';
1.23 millert 1184: /* free old basestr */
1185: if (firstbasestr != basestr) {
1186: if (basestr)
1187: xfree(basestr);
1188: }
1189: basestr = buf;
1190: prestr = buf + prefix_length;
1191: if (special_case == REPEAT_ZERO) {
1192: prestr -= atomlen;
1193: ret++;
1194: }
1195: return ret;
1196: }
1197:
1198: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1199: int atomlen, int firstnum, int secondnum)
1200: {
1201: /*
1202: In general, the repetition specifier or "bound" is replaced here
1203: by an equivalent ERE string, repeating the immediately previous atom
1204: and appending ? and + as needed. Note that the first copy of the
1205: atom is left in place, except in the special_case of a zero-repeat
1206: (i.e., {0}).
1207: */
1208: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1209: if (firstnum < 2) {
1210: /* 0 or 1: should be handled before you get here */
1211: FATAL("internal error");
1212: } else {
1213: return replace_repeat(reptok, reptoklen, atom, atomlen,
1214: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1215: }
1216: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1217: if (firstnum == 0) { /* {0} or {0,0} */
1218: /* This case is unusual because the resulting
1219: replacement string might actually be SMALLER than
1220: the original ERE */
1221: return replace_repeat(reptok, reptoklen, atom, atomlen,
1222: firstnum, secondnum, REPEAT_ZERO);
1223: } else { /* (firstnum >= 1) */
1224: return replace_repeat(reptok, reptoklen, atom, atomlen,
1225: firstnum, secondnum, REPEAT_SIMPLE);
1226: }
1227: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1228: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1229: return replace_repeat(reptok, reptoklen, atom, atomlen,
1230: firstnum, secondnum, REPEAT_WITH_Q);
1231: } else { /* Error - shouldn't be here (n>m) */
1232: FATAL("internal error");
1233: }
1234: return 0;
1235: }
1.11 millert 1236:
1.38 millert 1237: extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1238:
1.1 tholo 1239: int relex(void) /* lexical analyzer for reparse */
1240: {
1.5 kstailey 1241: int c, n;
1.1 tholo 1242: int cflag;
1.25 millert 1243: static uschar *buf = NULL;
1.5 kstailey 1244: static int bufsz = 100;
1.10 millert 1245: uschar *bp;
1.31 millert 1246: const struct charclass *cc;
1.12 millert 1247: int i;
1.29 millert 1248: int num, m;
1249: bool commafound, digitfound;
1.23 millert 1250: const uschar *startreptok;
1.24 millert 1251: static int parens = 0;
1.23 millert 1252:
1253: rescan:
1254: starttok = prestr;
1.1 tholo 1255:
1.38 millert 1256: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1257: prestr += n;
1258: starttok = prestr;
1259: return CHAR;
1260: }
1261:
1.1 tholo 1262: switch (c = *prestr++) {
1263: case '|': return OR;
1264: case '*': return STAR;
1265: case '+': return PLUS;
1266: case '?': return QUEST;
1267: case '.': return DOT;
1268: case '\0': prestr--; return '\0';
1269: case '^':
1270: case '$':
1.24 millert 1271: return c;
1.1 tholo 1272: case '(':
1.24 millert 1273: parens++;
1274: return c;
1.1 tholo 1275: case ')':
1.24 millert 1276: if (parens) {
1277: parens--;
1278: return c;
1279: }
1280: /* unmatched close parenthesis; per POSIX, treat as literal */
1281: rlxval = c;
1282: return CHAR;
1.1 tholo 1283: case '\\':
1.17 millert 1284: rlxval = quoted(&prestr);
1.1 tholo 1285: return CHAR;
1286: default:
1287: rlxval = c;
1288: return CHAR;
1.25 millert 1289: case '[':
1.35 millert 1290: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1291: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1292: bp = buf;
1.1 tholo 1293: if (*prestr == '^') {
1294: cflag = 1;
1295: prestr++;
1296: }
1297: else
1298: cflag = 0;
1.38 millert 1299: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1300: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1301: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1302: for (; ; ) {
1.38 millert 1303: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1304: for (i = 0; i < n; i++)
1305: *bp++ = *prestr++;
1306: continue;
1307: }
1.1 tholo 1308: if ((c = *prestr++) == '\\') {
1.5 kstailey 1309: *bp++ = '\\';
1.1 tholo 1310: if ((c = *prestr++) == '\0')
1.8 millert 1311: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1312: *bp++ = c;
1.10 millert 1313: /* } else if (c == '\n') { */
1314: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1315: } else if (c == '[' && *prestr == ':') {
1316: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1317: for (cc = charclasses; cc->cc_name; cc++)
1318: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1319: break;
1320: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1321: prestr[2 + cc->cc_namelen] == ']') {
1322: prestr += cc->cc_namelen + 3;
1.22 millert 1323: /*
1324: * BUG: We begin at 1, instead of 0, since we
1325: * would otherwise prematurely terminate the
1326: * string for classes like [[:cntrl:]]. This
1327: * means that we can't match the NUL character,
1328: * not without first adapting the entire
1329: * program to track each string's length.
1330: */
1.23 millert 1331: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1332: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1333: FATAL("out of space for reg expr %.10s...", lastre);
1334: if (cc->cc_func(i)) {
1.32 millert 1335: /* escape backslash */
1336: if (i == '\\') {
1337: *bp++ = '\\';
1338: n++;
1339: }
1340:
1.12 millert 1341: *bp++ = i;
1342: n++;
1343: }
1344: }
1.11 millert 1345: } else
1346: *bp++ = c;
1.23 millert 1347: } else if (c == '[' && *prestr == '.') {
1348: char collate_char;
1349: prestr++;
1350: collate_char = *prestr++;
1351: if (*prestr == '.' && prestr[1] == ']') {
1352: prestr += 2;
1353: /* Found it: map via locale TBD: for
1354: now, simply return this char. This
1355: is sufficient to pass conformance
1356: test awk.ex 156
1357: */
1358: if (*prestr == ']') {
1359: prestr++;
1360: rlxval = collate_char;
1361: return CHAR;
1362: }
1363: }
1364: } else if (c == '[' && *prestr == '=') {
1365: char equiv_char;
1366: prestr++;
1367: equiv_char = *prestr++;
1368: if (*prestr == '=' && prestr[1] == ']') {
1369: prestr += 2;
1370: /* Found it: map via locale TBD: for now
1371: simply return this char. This is
1372: sufficient to pass conformance test
1373: awk.ex 156
1374: */
1375: if (*prestr == ']') {
1376: prestr++;
1377: rlxval = equiv_char;
1378: return CHAR;
1379: }
1380: }
1.1 tholo 1381: } else if (c == '\0') {
1.8 millert 1382: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1383: } else if (bp == buf) { /* 1st char is special */
1384: *bp++ = c;
1.1 tholo 1385: } else if (c == ']') {
1.5 kstailey 1386: *bp++ = 0;
1.10 millert 1387: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1388: if (cflag == 0)
1389: return CCL;
1390: else
1391: return NCCL;
1392: } else
1.5 kstailey 1393: *bp++ = c;
1.1 tholo 1394: }
1.23 millert 1395: break;
1396: case '{':
1397: if (isdigit(*(prestr))) {
1398: num = 0; /* Process as a repetition */
1399: n = -1; m = -1;
1.29 millert 1400: commafound = false;
1401: digitfound = false;
1.23 millert 1402: startreptok = prestr-1;
1403: /* Remember start of previous atom here ? */
1404: } else { /* just a { char, not a repetition */
1405: rlxval = c;
1406: return CHAR;
1407: }
1408: for (; ; ) {
1409: if ((c = *prestr++) == '}') {
1410: if (commafound) {
1411: if (digitfound) { /* {n,m} */
1412: m = num;
1.30 millert 1413: if (m < n)
1.23 millert 1414: FATAL("illegal repetition expression: class %.20s",
1415: lastre);
1.30 millert 1416: if (n == 0 && m == 1) {
1.23 millert 1417: return QUEST;
1418: }
1419: } else { /* {n,} */
1.30 millert 1420: if (n == 0)
1421: return STAR;
1422: else if (n == 1)
1423: return PLUS;
1.23 millert 1424: }
1425: } else {
1426: if (digitfound) { /* {n} same as {n,n} */
1427: n = num;
1428: m = num;
1429: } else { /* {} */
1430: FATAL("illegal repetition expression: class %.20s",
1431: lastre);
1432: }
1433: }
1434: if (repeat(starttok, prestr-starttok, lastatom,
1435: startreptok - lastatom, n, m) > 0) {
1.30 millert 1436: if (n == 0 && m == 0) {
1.31 millert 1437: return ZERO;
1.23 millert 1438: }
1439: /* must rescan input for next token */
1440: goto rescan;
1441: }
1442: /* Failed to replace: eat up {...} characters
1443: and treat like just PLUS */
1444: return PLUS;
1445: } else if (c == '\0') {
1446: FATAL("nonterminated character class %.20s",
1447: lastre);
1448: } else if (isdigit(c)) {
1449: num = 10 * num + c - '0';
1.29 millert 1450: digitfound = true;
1.23 millert 1451: } else if (c == ',') {
1452: if (commafound)
1453: FATAL("illegal repetition expression: class %.20s",
1454: lastre);
1455: /* looking for {n,} or {n,m} */
1.29 millert 1456: commafound = true;
1.23 millert 1457: n = num;
1.29 millert 1458: digitfound = false; /* reset */
1.23 millert 1459: num = 0;
1460: } else {
1461: FATAL("illegal repetition expression: class %.20s",
1462: lastre);
1463: }
1464: }
1465: break;
1.1 tholo 1466: }
1467: }
1468:
1469: int cgoto(fa *f, int s, int c)
1470: {
1.27 millert 1471: int *p, *q;
1.1 tholo 1472: int i, j, k;
1473:
1.38 millert 1474: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1475: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1476: resizesetvec(__func__);
1.1 tholo 1477: }
1478: for (i = 0; i <= f->accept; i++)
1479: setvec[i] = 0;
1480: setcnt = 0;
1.27 millert 1481: resize_state(f, s);
1.1 tholo 1482: /* compute positions of gototab[s,c] into setvec */
1483: p = f->posns[s];
1484: for (i = 1; i <= *p; i++) {
1485: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1486: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1487: || (k == DOT && c != 0 && c != HAT)
1488: || (k == ALL && c != 0)
1.15 millert 1489: || (k == EMPTYRE && c != 0)
1.38 millert 1490: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1491: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1492: q = f->re[p[i]].lfollow;
1493: for (j = 1; j <= *q; j++) {
1494: if (q[j] >= maxsetvec) {
1.27 millert 1495: resizesetvec(__func__);
1.1 tholo 1496: }
1497: if (setvec[q[j]] == 0) {
1498: setcnt++;
1499: setvec[q[j]] = 1;
1500: }
1501: }
1502: }
1503: }
1504: }
1505: /* determine if setvec is a previous state */
1506: tmpset[0] = setcnt;
1507: j = 1;
1508: for (i = f->accept; i >= 0; i--)
1509: if (setvec[i]) {
1510: tmpset[j++] = i;
1511: }
1.27 millert 1512: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1513: /* tmpset == previous state? */
1514: for (i = 1; i <= f->curstat; i++) {
1515: p = f->posns[i];
1516: if ((k = tmpset[0]) != p[0])
1517: goto different;
1518: for (j = 1; j <= k; j++)
1519: if (tmpset[j] != p[j])
1520: goto different;
1521: /* setvec is state i */
1.30 millert 1522: if (c != HAT)
1.38 millert 1523: set_gototab(f, s, c, i);
1.1 tholo 1524: return i;
1525: different:;
1526: }
1527:
1528: /* add tmpset to current set of states */
1.27 millert 1529: ++(f->curstat);
1530: resize_state(f, f->curstat);
1.49 ! millert 1531: clear_gototab(f, f->curstat);
1.1 tholo 1532: xfree(f->posns[f->curstat]);
1.27 millert 1533: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1534:
1535: f->posns[f->curstat] = p;
1.30 millert 1536: if (c != HAT)
1.38 millert 1537: set_gototab(f, s, c, f->curstat);
1.1 tholo 1538: for (i = 0; i <= setcnt; i++)
1539: p[i] = tmpset[i];
1540: if (setvec[f->accept])
1541: f->out[f->curstat] = 1;
1542: else
1543: f->out[f->curstat] = 0;
1544: return f->curstat;
1545: }
1546:
1547:
1548: void freefa(fa *f) /* free a finite automaton */
1549: {
1550: int i;
1551:
1552: if (f == NULL)
1553: return;
1.27 millert 1554: for (i = 0; i < f->state_count; i++)
1.49 ! millert 1555: xfree(f->gototab[i].entries);
! 1556: xfree(f->gototab);
1.1 tholo 1557: for (i = 0; i <= f->curstat; i++)
1558: xfree(f->posns[i]);
1559: for (i = 0; i <= f->accept; i++) {
1560: xfree(f->re[i].lfollow);
1561: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1562: xfree(f->re[i].lval.np);
1.1 tholo 1563: }
1564: xfree(f->restr);
1.27 millert 1565: xfree(f->out);
1566: xfree(f->posns);
1567: xfree(f->gototab);
1.1 tholo 1568: xfree(f);
1569: }