Annotation of src/usr.bin/awk/b.c, Revision 1.46
1.46 ! millert 1: /* $OpenBSD: b.c,v 1.45 2023/10/30 17:52:54 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.43 millert 84: extern int u8_nextlen(const char *s);
85:
1.38 millert 86:
87: /* utf-8 mechanism:
88:
89: For most of Awk, utf-8 strings just "work", since they look like
90: null-terminated sequences of 8-bit bytes.
91:
92: Functions like length(), index(), and substr() have to operate
93: in units of utf-8 characters. The u8_* functions in run.c
94: handle this.
95:
96: Regular expressions are more complicated, since the basic
97: mechanism of the goto table used 8-bit byte indices into the
98: gototab entries to compute the next state. Unicode is a lot
99: bigger, so the gototab entries are now structs with a character
100: and a next state, and there is a linear search of the characters
101: to find the state. (Yes, this is slower, by a significant
102: amount. Tough.)
103:
104: Throughout the RE mechanism in b.c, utf-8 characters are
105: converted to their utf-32 value. This mostly shows up in
106: cclenter, which expands character class ranges like a-z and now
107: alpha-omega. The size of a gototab array is still about 256.
108: This should be dynamic, but for now things work ok for a single
109: code page of Unicode, which is the most likely case.
110:
111: The code changes are localized in run.c and b.c. I have added a
112: handful of functions to somewhat better hide the implementation,
113: but a lot more could be done.
114:
115: */
116:
117: static int get_gototab(fa*, int, int);
118: static int set_gototab(fa*, int, int, int);
1.44 millert 119: static void reset_gototab(fa*, int);
1.38 millert 120: extern int u8_rune(int *, const uschar *);
121:
1.27 millert 122: static int *
123: intalloc(size_t n, const char *f)
124: {
1.35 millert 125: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 126: if (p == NULL)
127: overflo(f);
128: return p;
129: }
130:
131: static void
132: allocsetvec(const char *f)
133: {
134: maxsetvec = MAXLIN;
1.35 millert 135: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
136: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 137: if (setvec == NULL || tmpset == NULL)
138: overflo(f);
139: }
140:
141: static void
142: resizesetvec(const char *f)
143: {
1.35 millert 144: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
145: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 146: if (setvec == NULL || tmpset == NULL)
147: overflo(f);
148: maxsetvec *= 4;
149: }
150:
151: static void
152: resize_state(fa *f, int state)
153: {
1.38 millert 154: gtt **p;
1.35 millert 155: uschar *p2;
156: int **p3;
1.27 millert 157: int i, new_count;
158:
159: if (++state < f->state_count)
160: return;
161:
162: new_count = state + 10; /* needs to be tuned */
163:
1.38 millert 164: p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 165: if (p == NULL)
166: goto out;
167: f->gototab = p;
168:
1.35 millert 169: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
170: if (p2 == NULL)
1.27 millert 171: goto out;
1.35 millert 172: f->out = p2;
1.27 millert 173:
1.35 millert 174: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
175: if (p3 == NULL)
1.27 millert 176: goto out;
1.35 millert 177: f->posns = p3;
1.27 millert 178:
179: for (i = f->state_count; i < new_count; ++i) {
1.38 millert 180: f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 181: if (f->gototab[i] == NULL)
182: goto out;
183: f->out[i] = 0;
184: f->posns[i] = NULL;
185: }
1.38 millert 186: f->gototab_len = NCHARS; /* should be variable, growable */
1.27 millert 187: f->state_count = new_count;
188: return;
189: out:
190: overflo(__func__);
191: }
192:
1.29 millert 193: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 194: {
195: int i, use, nuse;
196: fa *pfa;
1.6 millert 197: static int now = 1;
1.1 tholo 198:
1.25 millert 199: if (setvec == NULL) { /* first time through any RE */
1.27 millert 200: allocsetvec(__func__);
1.1 tholo 201: }
202:
1.29 millert 203: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 204: return mkdfa(s, anchor);
205: for (i = 0; i < nfatab; i++) /* is it there already? */
206: if (fatab[i]->anchor == anchor
1.11 millert 207: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 208: fatab[i]->use = now++;
1.1 tholo 209: return fatab[i];
1.10 millert 210: }
1.1 tholo 211: pfa = mkdfa(s, anchor);
212: if (nfatab < NFA) { /* room for another */
213: fatab[nfatab] = pfa;
1.6 millert 214: fatab[nfatab]->use = now++;
1.1 tholo 215: nfatab++;
216: return pfa;
217: }
218: use = fatab[0]->use; /* replace least-recently used */
219: nuse = 0;
220: for (i = 1; i < nfatab; i++)
221: if (fatab[i]->use < use) {
222: use = fatab[i]->use;
223: nuse = i;
224: }
225: freefa(fatab[nuse]);
226: fatab[nuse] = pfa;
1.6 millert 227: pfa->use = now++;
1.1 tholo 228: return pfa;
229: }
230:
1.29 millert 231: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 232: /* anchor = true for anchored matches, else false */
1.1 tholo 233: {
234: Node *p, *p1;
235: fa *f;
236:
1.28 millert 237: firstbasestr = (const uschar *) s;
1.23 millert 238: basestr = firstbasestr;
1.1 tholo 239: p = reparse(s);
240: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
241: /* put ALL STAR in front of reg. exp. */
242: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
243: /* put FINAL after reg. exp. */
244:
245: poscnt = 0;
246: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 247: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 248: overflo(__func__);
1.1 tholo 249: f->accept = poscnt-1; /* penter has computed number of positions in re */
250: cfoll(f, p1); /* set up follow sets */
251: freetr(p1);
1.27 millert 252: resize_state(f, 1);
253: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
254: f->posns[1] = intalloc(1, __func__);
1.1 tholo 255: *f->posns[1] = 0;
256: f->initstat = makeinit(f, anchor);
257: f->anchor = anchor;
1.10 millert 258: f->restr = (uschar *) tostring(s);
1.23 millert 259: if (firstbasestr != basestr) {
260: if (basestr)
261: xfree(basestr);
262: }
1.1 tholo 263: return f;
264: }
265:
1.29 millert 266: int makeinit(fa *f, bool anchor)
1.1 tholo 267: {
268: int i, k;
269:
270: f->curstat = 2;
271: f->out[2] = 0;
272: k = *(f->re[0].lfollow);
1.25 millert 273: xfree(f->posns[2]);
1.27 millert 274: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 275: for (i = 0; i <= k; i++) {
1.1 tholo 276: (f->posns[2])[i] = (f->re[0].lfollow)[i];
277: }
278: if ((f->posns[2])[1] == f->accept)
279: f->out[2] = 1;
1.44 millert 280: reset_gototab(f, 2);
1.1 tholo 281: f->curstat = cgoto(f, 2, HAT);
282: if (anchor) {
283: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 284: for (i = 0; i < k; i++) {
1.1 tholo 285: (f->posns[0])[i] = (f->posns[2])[i];
286: }
287:
288: f->out[0] = f->out[2];
289: if (f->curstat != 2)
290: --(*f->posns[f->curstat]);
291: }
292: return f->curstat;
293: }
294:
295: void penter(Node *p) /* set up parent pointers and leaf indices */
296: {
297: switch (type(p)) {
1.15 millert 298: ELEAF
1.1 tholo 299: LEAF
1.7 millert 300: info(p) = poscnt;
1.1 tholo 301: poscnt++;
302: break;
303: UNARY
304: penter(left(p));
305: parent(left(p)) = p;
306: break;
307: case CAT:
308: case OR:
309: penter(left(p));
310: penter(right(p));
311: parent(left(p)) = p;
312: parent(right(p)) = p;
313: break;
1.31 millert 314: case ZERO:
315: break;
1.1 tholo 316: default: /* can't happen */
1.8 millert 317: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 318: break;
319: }
320: }
321:
322: void freetr(Node *p) /* free parse tree */
323: {
324: switch (type(p)) {
1.15 millert 325: ELEAF
1.1 tholo 326: LEAF
327: xfree(p);
328: break;
329: UNARY
1.31 millert 330: case ZERO:
1.1 tholo 331: freetr(left(p));
332: xfree(p);
333: break;
334: case CAT:
335: case OR:
336: freetr(left(p));
337: freetr(right(p));
338: xfree(p);
339: break;
340: default: /* can't happen */
1.8 millert 341: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 342: break;
343: }
344: }
345:
346: /* in the parsing of regular expressions, metacharacters like . have */
347: /* to be seen literally; \056 is not a metacharacter. */
348:
1.38 millert 349: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 350: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 351: const uschar *p;
1.1 tholo 352: int n = 0;
1.2 millert 353: int i;
1.1 tholo 354:
1.38 millert 355: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 356: if (isdigit(*p))
357: n = 16 * n + *p - '0';
358: else if (*p >= 'a' && *p <= 'f')
359: n = 16 * n + *p - 'a' + 10;
360: else if (*p >= 'A' && *p <= 'F')
361: n = 16 * n + *p - 'A' + 10;
362: }
1.28 millert 363: *pp = p;
1.1 tholo 364: return n;
365: }
366:
1.2 millert 367: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 368:
1.28 millert 369: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 370: /* and increment *pp */
371: {
1.28 millert 372: const uschar *p = *pp;
1.1 tholo 373: int c;
374:
1.38 millert 375: /* BUG: should advance by utf-8 char even if makes no sense */
376:
377: if ((c = *p++) == 't') {
1.1 tholo 378: c = '\t';
1.38 millert 379: } else if (c == 'n') {
1.1 tholo 380: c = '\n';
1.38 millert 381: } else if (c == 'f') {
1.1 tholo 382: c = '\f';
1.38 millert 383: } else if (c == 'r') {
1.1 tholo 384: c = '\r';
1.38 millert 385: } else if (c == 'b') {
1.1 tholo 386: c = '\b';
1.38 millert 387: } else if (c == 'v') {
1.25 millert 388: c = '\v';
1.38 millert 389: } else if (c == 'a') {
1.25 millert 390: c = '\a';
1.38 millert 391: } else if (c == '\\') {
1.1 tholo 392: c = '\\';
1.38 millert 393: } else if (c == 'x') { /* 2 hex digits follow */
394: c = hexstr(&p, 2); /* this adds a null if number is invalid */
395: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
396: c = hexstr(&p, 8);
1.1 tholo 397: } else if (isoctdigit(c)) { /* \d \dd \ddd */
398: int n = c - '0';
399: if (isoctdigit(*p)) {
400: n = 8 * n + *p++ - '0';
401: if (isoctdigit(*p))
402: n = 8 * n + *p++ - '0';
403: }
404: c = n;
405: } /* else */
406: /* c = c; */
407: *pp = p;
408: return c;
409: }
410:
1.38 millert 411: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 412: {
413: int i, c, c2;
1.38 millert 414: int n;
415: const uschar *p = (const uschar *) argp;
416: int *bp, *retp;
417: static int *buf = NULL;
1.5 kstailey 418: static int bufsz = 100;
1.1 tholo 419:
1.38 millert 420: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 421: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 422: bp = buf;
1.38 millert 423: for (i = 0; *p != 0; ) {
424: n = u8_rune(&c, p);
425: p += n;
1.1 tholo 426: if (c == '\\') {
1.17 millert 427: c = quoted(&p);
1.5 kstailey 428: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 429: if (*p != 0) {
1.5 kstailey 430: c = bp[-1];
1.38 millert 431: /* c2 = *p++; */
432: n = u8_rune(&c2, p);
433: p += n;
1.1 tholo 434: if (c2 == '\\')
1.38 millert 435: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 436: if (c > c2) { /* empty; ignore */
1.5 kstailey 437: bp--;
1.1 tholo 438: i--;
439: continue;
440: }
441: while (c < c2) {
1.38 millert 442: if (i >= bufsz) {
1.45 millert 443: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 444: if (buf == NULL)
445: FATAL("out of space for character class [%.10s...] 2", p);
446: bufsz *= 2;
447: bp = buf + i;
448: }
1.5 kstailey 449: *bp++ = ++c;
1.1 tholo 450: i++;
451: }
452: continue;
453: }
454: }
1.38 millert 455: if (i >= bufsz) {
1.45 millert 456: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 457: if (buf == NULL)
458: FATAL("out of space for character class [%.10s...] 2", p);
459: bufsz *= 2;
460: bp = buf + i;
461: }
1.5 kstailey 462: *bp++ = c;
1.1 tholo 463: i++;
464: }
1.5 kstailey 465: *bp = 0;
1.38 millert 466: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
467: /* xfree(op); BUG: what are we freeing here? */
468: retp = (int *) calloc(bp-buf+1, sizeof(int));
469: for (i = 0; i < bp-buf+1; i++)
470: retp[i] = buf[i];
471: return retp;
1.1 tholo 472: }
473:
1.11 millert 474: void overflo(const char *s)
1.1 tholo 475: {
1.27 millert 476: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 477: }
478:
479: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
480: {
481: int i;
1.4 millert 482: int *p;
1.1 tholo 483:
484: switch (type(v)) {
1.15 millert 485: ELEAF
1.1 tholo 486: LEAF
1.7 millert 487: f->re[info(v)].ltype = type(v);
488: f->re[info(v)].lval.np = right(v);
1.1 tholo 489: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 490: resizesetvec(__func__);
1.1 tholo 491: }
492: for (i = 0; i <= f->accept; i++)
493: setvec[i] = 0;
494: setcnt = 0;
495: follow(v); /* computes setvec and setcnt */
1.27 millert 496: p = intalloc(setcnt + 1, __func__);
1.7 millert 497: f->re[info(v)].lfollow = p;
1.1 tholo 498: *p = setcnt;
499: for (i = f->accept; i >= 0; i--)
500: if (setvec[i] == 1)
501: *++p = i;
502: break;
503: UNARY
504: cfoll(f,left(v));
505: break;
506: case CAT:
507: case OR:
508: cfoll(f,left(v));
509: cfoll(f,right(v));
510: break;
1.31 millert 511: case ZERO:
512: break;
1.1 tholo 513: default: /* can't happen */
1.8 millert 514: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 515: }
516: }
517:
518: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 519: /* returns 0 if p matches empty string */
1.1 tholo 520: {
1.4 millert 521: int b, lp;
1.1 tholo 522:
523: switch (type(p)) {
1.15 millert 524: ELEAF
1.1 tholo 525: LEAF
1.7 millert 526: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 527: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 528: resizesetvec(__func__);
1.1 tholo 529: }
1.15 millert 530: if (type(p) == EMPTYRE) {
531: setvec[lp] = 0;
532: return(0);
533: }
1.1 tholo 534: if (setvec[lp] != 1) {
535: setvec[lp] = 1;
536: setcnt++;
537: }
1.41 millert 538: if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1 tholo 539: return(0); /* empty CCL */
1.30 millert 540: return(1);
1.1 tholo 541: case PLUS:
1.30 millert 542: if (first(left(p)) == 0)
543: return(0);
1.1 tholo 544: return(1);
545: case STAR:
546: case QUEST:
547: first(left(p));
548: return(0);
549: case CAT:
550: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
551: return(1);
552: case OR:
553: b = first(right(p));
554: if (first(left(p)) == 0 || b == 0) return(0);
555: return(1);
1.31 millert 556: case ZERO:
557: return 0;
1.1 tholo 558: }
1.8 millert 559: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 560: return(-1);
561: }
562:
563: void follow(Node *v) /* collects leaves that can follow v into setvec */
564: {
565: Node *p;
566:
567: if (type(v) == FINAL)
568: return;
569: p = parent(v);
570: switch (type(p)) {
571: case STAR:
572: case PLUS:
573: first(v);
574: follow(p);
575: return;
576:
577: case OR:
578: case QUEST:
579: follow(p);
580: return;
581:
582: case CAT:
583: if (v == left(p)) { /* v is left child of p */
584: if (first(right(p)) == 0) {
585: follow(p);
586: return;
587: }
588: } else /* v is right child */
589: follow(p);
590: return;
591: }
592: }
593:
1.38 millert 594: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 595: {
1.38 millert 596: int *s = (int *) sarg;
1.10 millert 597:
1.1 tholo 598: while (*s)
599: if (c == *s++)
600: return(1);
601: return(0);
602: }
603:
1.38 millert 604: static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
605: {
606: int i;
607: for (i = 0; i < f->gototab_len; i++) {
608: if (f->gototab[state][i].ch == 0)
609: break;
610: if (f->gototab[state][i].ch == ch)
611: return f->gototab[state][i].state;
612: }
613: return 0;
614: }
615:
1.44 millert 616: static void reset_gototab(fa *f, int state) /* hide gototab inplementation */
617: {
618: memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab));
619: }
620:
1.38 millert 621: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
622: {
623: int i;
624: for (i = 0; i < f->gototab_len; i++) {
625: if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
626: f->gototab[state][i].ch = ch;
627: f->gototab[state][i].state = val;
628: return val;
629: }
630: }
631: overflo(__func__);
632: return val; /* not used anywhere at the moment */
633: }
634:
1.11 millert 635: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 636: {
637: int s, ns;
1.38 millert 638: int n;
639: int rune;
1.28 millert 640: const uschar *p = (const uschar *) p0;
1.1 tholo 641:
1.38 millert 642: /* return pmatch(f, p0); does it matter whether longest or shortest? */
643:
1.27 millert 644: s = f->initstat;
645: assert (s < f->state_count);
646:
1.1 tholo 647: if (f->out[s])
648: return(1);
649: do {
1.15 millert 650: /* assert(*p < NCHARS); */
1.38 millert 651: n = u8_rune(&rune, p);
652: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 653: s = ns;
654: else
1.38 millert 655: s = cgoto(f, s, rune);
1.1 tholo 656: if (f->out[s])
657: return(1);
1.38 millert 658: if (*p == 0)
659: break;
660: p += n;
661: } while (1); /* was *p++ != 0 */
1.1 tholo 662: return(0);
663: }
664:
1.11 millert 665: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 666: {
667: int s, ns;
1.38 millert 668: int n;
669: int rune;
1.28 millert 670: const uschar *p = (const uschar *) p0;
671: const uschar *q;
1.1 tholo 672:
1.27 millert 673: s = f->initstat;
674: assert(s < f->state_count);
675:
1.28 millert 676: patbeg = (const char *)p;
1.1 tholo 677: patlen = -1;
678: do {
679: q = p;
680: do {
681: if (f->out[s]) /* final state */
682: patlen = q-p;
1.15 millert 683: /* assert(*q < NCHARS); */
1.38 millert 684: n = u8_rune(&rune, q);
685: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 686: s = ns;
687: else
1.38 millert 688: s = cgoto(f, s, rune);
1.27 millert 689:
690: assert(s < f->state_count);
691:
1.9 deraadt 692: if (s == 1) { /* no transition */
1.1 tholo 693: if (patlen >= 0) {
1.28 millert 694: patbeg = (const char *) p;
1.1 tholo 695: return(1);
696: }
697: else
698: goto nextin; /* no match */
1.9 deraadt 699: }
1.38 millert 700: if (*q == 0)
701: break;
702: q += n;
703: } while (1);
704: q++; /* was *q++ */
1.1 tholo 705: if (f->out[s])
706: patlen = q-p-1; /* don't count $ */
707: if (patlen >= 0) {
1.28 millert 708: patbeg = (const char *) p;
1.1 tholo 709: return(1);
710: }
711: nextin:
712: s = 2;
1.38 millert 713: if (*p == 0)
714: break;
715: n = u8_rune(&rune, p);
716: p += n;
717: } while (1); /* was *p++ */
1.1 tholo 718: return (0);
719: }
720:
1.11 millert 721: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 722: {
723: int s, ns;
1.38 millert 724: int n;
725: int rune;
1.28 millert 726: const uschar *p = (const uschar *) p0;
727: const uschar *q;
1.1 tholo 728:
1.27 millert 729: s = f->initstat;
730: assert(s < f->state_count);
731:
1.28 millert 732: patbeg = (const char *)p;
1.1 tholo 733: patlen = -1;
734: while (*p) {
735: q = p;
736: do {
737: if (f->out[s]) /* final state */
738: patlen = q-p;
1.15 millert 739: /* assert(*q < NCHARS); */
1.38 millert 740: n = u8_rune(&rune, q);
741: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 742: s = ns;
743: else
1.38 millert 744: s = cgoto(f, s, rune);
1.9 deraadt 745: if (s == 1) { /* no transition */
1.1 tholo 746: if (patlen > 0) {
1.28 millert 747: patbeg = (const char *) p;
1.1 tholo 748: return(1);
749: } else
750: goto nnextin; /* no nonempty match */
1.9 deraadt 751: }
1.38 millert 752: if (*q == 0)
753: break;
754: q += n;
755: } while (1);
756: q++;
1.1 tholo 757: if (f->out[s])
758: patlen = q-p-1; /* don't count $ */
759: if (patlen > 0 ) {
1.28 millert 760: patbeg = (const char *) p;
1.1 tholo 761: return(1);
762: }
763: nnextin:
764: s = 2;
765: p++;
766: }
767: return (0);
768: }
769:
1.43 millert 770:
771: #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
772:
773: // Read one rune at a time from the given FILE*. Return both
774: // the bytes and the actual rune.
775:
776: struct runedata {
777: int rune;
778: size_t len;
779: char bytes[6];
780: };
781:
782: struct runedata getrune(FILE *fp)
1.38 millert 783: {
1.43 millert 784: struct runedata result;
785: int c, i, next;
786:
787: memset(&result, 0, sizeof(result));
788:
789: c = getc(fp);
790: if (c == EOF)
791: return result; // result.rune == 0 --> EOF
792: else if (c < 128 || awk_mb_cur_max == 1) {
793: result.bytes[0] = c;
794: result.len = 1;
795: result.rune = c;
796:
797: return result;
798: }
799:
800: // need to get bytes and fill things in
801: result.bytes[0] = c;
802: result.len = 1;
803:
804: next = 1;
805: for (i = 1; i < MAX_UTF_BYTES; i++) {
806: c = getc(fp);
807: if (c == EOF)
1.38 millert 808: break;
1.43 millert 809: result.bytes[next++] = c;
810: result.len++;
811: }
812:
813: // put back any extra input bytes
814: int actual_len = u8_nextlen(result.bytes);
815: while (result.len > actual_len) {
816: ungetc(result.bytes[--result.len], fp);
1.38 millert 817: }
818:
1.43 millert 819: result.bytes[result.len] = '\0';
820: (void) u8_rune(& result.rune, (uschar *) result.bytes);
1.38 millert 821:
1.43 millert 822: return result;
1.38 millert 823: }
824:
1.26 millert 825:
826: /*
827: * NAME
828: * fnematch
829: *
830: * DESCRIPTION
831: * A stream-fed version of nematch which transfers characters to a
832: * null-terminated buffer. All characters up to and including the last
833: * character of the matching text or EOF are placed in the buffer. If
834: * a match is found, patbeg and patlen are set appropriately.
835: *
836: * RETURN VALUES
1.29 millert 837: * false No match found.
838: * true Match found.
1.26 millert 839: */
840:
1.29 millert 841: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 842: {
843: char *buf = *pbuf;
844: int bufsize = *pbufsize;
1.43 millert 845: int i, j, k, ns, s;
846: struct runedata r;
1.26 millert 847:
848: s = pfa->initstat;
849: patlen = 0;
850:
851: /*
852: * All indices relative to buf.
853: * i <= j <= k <= bufsize
854: *
1.43 millert 855: * i: origin of active substring (first byte of first character)
856: * j: current character (last byte of current character)
1.26 millert 857: * k: destination of next getc()
858: */
859: i = -1, k = 0;
860: do {
861: j = i++;
862: do {
1.43 millert 863: r = getrune(f);
1.46 ! millert 864: if (r.len == 0) {
! 865: r.len = 1; // store NUL byte for EOF
! 866: }
! 867: j += r.len;
! 868: if (j >= bufsize) {
! 869: if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch"))
! 870: FATAL("stream '%.30s...' too long", buf);
1.38 millert 871: }
1.43 millert 872: memcpy(buf + k, r.bytes, r.len);
873: k += r.len;
1.26 millert 874:
1.43 millert 875: if ((ns = get_gototab(pfa, s, r.rune)) != 0)
1.26 millert 876: s = ns;
877: else
1.43 millert 878: s = cgoto(pfa, s, r.rune);
1.26 millert 879:
880: if (pfa->out[s]) { /* final state */
881: patlen = j - i + 1;
1.43 millert 882: if (r.rune == 0) /* don't count $ */
1.26 millert 883: patlen--;
884: }
885: } while (buf[j] && s != 1);
886: s = 2;
1.43 millert 887: if (r.len > 1)
888: i += r.len - 1; // i incremented around the loop
1.26 millert 889: } while (buf[i] && !patlen);
890:
891: /* adjbuf() may have relocated a resized buffer. Inform the world. */
892: *pbuf = buf;
893: *pbufsize = bufsize;
894:
895: if (patlen) {
896: patbeg = buf + i;
897: /*
898: * Under no circumstances is the last character fed to
899: * the automaton part of the match. It is EOF's nullbyte,
900: * or it sent the automaton into a state with no further
901: * transitions available (s==1), or both. Room for a
902: * terminating nullbyte is guaranteed.
903: *
904: * ungetc any chars after the end of matching text
905: * (except for EOF's nullbyte, if present) and null
906: * terminate the buffer.
907: */
1.43 millert 908: do {
909: int ii;
910: for (ii = r.len; ii > 0; ii--)
911: if (buf[--k] && ungetc(buf[k], f) == EOF)
912: FATAL("unable to ungetc '%c'", buf[k]);
913: } while (k > i + patlen);
1.30 millert 914: buf[k] = '\0';
1.29 millert 915: return true;
1.26 millert 916: }
917: else
1.29 millert 918: return false;
1.26 millert 919: }
920:
1.11 millert 921: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 922: { /* uses relex() to scan regular expression */
923: Node *np;
924:
1.32 millert 925: DPRINTF("reparse <%s>\n", p);
1.28 millert 926: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 927: rtok = relex();
1.11 millert 928: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 929: if (rtok == '\0') {
1.11 millert 930: /* FATAL("empty regular expression"); previous */
1.15 millert 931: return(op2(EMPTYRE, NIL, NIL));
932: }
1.1 tholo 933: np = regexp();
934: if (rtok != '\0')
1.8 millert 935: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 936: return(np);
937: }
938:
939: Node *regexp(void) /* top-level parse of reg expr */
940: {
941: return (alt(concat(primary())));
942: }
943:
944: Node *primary(void)
945: {
946: Node *np;
1.23 millert 947: int savelastatom;
1.1 tholo 948:
949: switch (rtok) {
950: case CHAR:
1.23 millert 951: lastatom = starttok;
1.7 millert 952: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 953: rtok = relex();
954: return (unary(np));
955: case ALL:
956: rtok = relex();
957: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 958: case EMPTYRE:
959: rtok = relex();
1.23 millert 960: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 961: case DOT:
1.23 millert 962: lastatom = starttok;
1.1 tholo 963: rtok = relex();
964: return (unary(op2(DOT, NIL, NIL)));
965: case CCL:
1.28 millert 966: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 967: lastatom = starttok;
1.1 tholo 968: rtok = relex();
969: return (unary(np));
970: case NCCL:
1.28 millert 971: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 972: lastatom = starttok;
1.1 tholo 973: rtok = relex();
974: return (unary(np));
975: case '^':
976: rtok = relex();
1.7 millert 977: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 978: case '$':
979: rtok = relex();
980: return (unary(op2(CHAR, NIL, NIL)));
981: case '(':
1.23 millert 982: lastatom = starttok;
983: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 984: rtok = relex();
985: if (rtok == ')') { /* special pleading for () */
986: rtok = relex();
1.42 millert 987: return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1 tholo 988: }
989: np = regexp();
990: if (rtok == ')') {
1.23 millert 991: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 992: rtok = relex();
993: return (unary(np));
994: }
995: else
1.8 millert 996: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 997: default:
1.8 millert 998: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 999: }
1000: return 0; /*NOTREACHED*/
1001: }
1002:
1003: Node *concat(Node *np)
1004: {
1005: switch (rtok) {
1.23 millert 1006: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 1007: return (concat(op2(CAT, np, primary())));
1.23 millert 1008: case EMPTYRE:
1009: rtok = relex();
1.42 millert 1010: return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23 millert 1011: primary())));
1.1 tholo 1012: }
1013: return (np);
1014: }
1015:
1016: Node *alt(Node *np)
1017: {
1018: if (rtok == OR) {
1019: rtok = relex();
1020: return (alt(op2(OR, np, concat(primary()))));
1021: }
1022: return (np);
1023: }
1024:
1025: Node *unary(Node *np)
1026: {
1027: switch (rtok) {
1028: case STAR:
1029: rtok = relex();
1030: return (unary(op2(STAR, np, NIL)));
1031: case PLUS:
1032: rtok = relex();
1033: return (unary(op2(PLUS, np, NIL)));
1034: case QUEST:
1035: rtok = relex();
1036: return (unary(op2(QUEST, np, NIL)));
1.31 millert 1037: case ZERO:
1038: rtok = relex();
1039: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1040: default:
1041: return (np);
1042: }
1043: }
1044:
1.11 millert 1045: /*
1046: * Character class definitions conformant to the POSIX locale as
1047: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1048: * and operating character sets are both ASCII (ISO646) or supersets
1049: * thereof.
1050: *
1051: * Note that to avoid overflowing the temporary buffer used in
1052: * relex(), the expanded character class (prior to range expansion)
1053: * must be less than twice the size of their full name.
1054: */
1.12 millert 1055:
1056: /* Because isblank doesn't show up in any of the header files on any
1057: * system i use, it's defined here. if some other locale has a richer
1058: * definition of "blank", define HAS_ISBLANK and provide your own
1059: * version.
1060: * the parentheses here are an attempt to find a path through the maze
1061: * of macro definition and/or function and/or version provided. thanks
1062: * to nelson beebe for the suggestion; let's see if it works everywhere.
1063: */
1064:
1065: #ifndef HAS_ISBLANK
1066:
1.17 millert 1067: int (xisblank)(int c)
1.12 millert 1068: {
1069: return c==' ' || c=='\t';
1070: }
1071:
1072: #endif
1073:
1.31 millert 1074: static const struct charclass {
1.11 millert 1075: const char *cc_name;
1076: int cc_namelen;
1.12 millert 1077: int (*cc_func)(int);
1.11 millert 1078: } charclasses[] = {
1.12 millert 1079: { "alnum", 5, isalnum },
1080: { "alpha", 5, isalpha },
1.17 millert 1081: #ifndef HAS_ISBLANK
1.21 millert 1082: { "blank", 5, xisblank },
1.17 millert 1083: #else
1.12 millert 1084: { "blank", 5, isblank },
1.17 millert 1085: #endif
1.12 millert 1086: { "cntrl", 5, iscntrl },
1087: { "digit", 5, isdigit },
1088: { "graph", 5, isgraph },
1089: { "lower", 5, islower },
1090: { "print", 5, isprint },
1091: { "punct", 5, ispunct },
1092: { "space", 5, isspace },
1093: { "upper", 5, isupper },
1094: { "xdigit", 6, isxdigit },
1.11 millert 1095: { NULL, 0, NULL },
1096: };
1097:
1.23 millert 1098: #define REPEAT_SIMPLE 0
1099: #define REPEAT_PLUS_APPENDED 1
1100: #define REPEAT_WITH_Q 2
1101: #define REPEAT_ZERO 3
1102:
1103: static int
1104: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1105: int atomlen, int firstnum, int secondnum, int special_case)
1106: {
1107: int i, j;
1.26 millert 1108: uschar *buf = NULL;
1.23 millert 1109: int ret = 1;
1.31 millert 1110: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1111: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1112: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1113: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1114: int size = prefix_length + suffix_length;
1115:
1116: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1117: size += atomlen*(firstnum-1);
1118: }
1119:
1120: /* Adjust size of buffer for special cases */
1121: if (special_case == REPEAT_PLUS_APPENDED) {
1122: size++; /* for the final + */
1123: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1124: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1125: } else if (special_case == REPEAT_ZERO) {
1126: size += 2; /* just a null ERE: () */
1127: }
1.35 millert 1128: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1129: FATAL("out of space in reg expr %.10s..", lastre);
1130: memcpy(buf, basestr, prefix_length); /* copy prefix */
1131: j = prefix_length;
1132: if (special_case == REPEAT_ZERO) {
1133: j -= atomlen;
1134: buf[j++] = '(';
1135: buf[j++] = ')';
1136: }
1.30 millert 1137: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1138: memcpy(&buf[j], atom, atomlen);
1139: j += atomlen;
1140: }
1141: if (special_case == REPEAT_PLUS_APPENDED) {
1142: buf[j++] = '+';
1143: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1144: if (init_q)
1145: buf[j++] = '?';
1.30 millert 1146: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1147: memcpy(&buf[j], atom, atomlen);
1148: j += atomlen;
1149: buf[j++] = '?';
1150: }
1151: }
1152: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1153: j += suffix_length;
1154: buf[j] = '\0';
1.23 millert 1155: /* free old basestr */
1156: if (firstbasestr != basestr) {
1157: if (basestr)
1158: xfree(basestr);
1159: }
1160: basestr = buf;
1161: prestr = buf + prefix_length;
1162: if (special_case == REPEAT_ZERO) {
1163: prestr -= atomlen;
1164: ret++;
1165: }
1166: return ret;
1167: }
1168:
1169: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1170: int atomlen, int firstnum, int secondnum)
1171: {
1172: /*
1173: In general, the repetition specifier or "bound" is replaced here
1174: by an equivalent ERE string, repeating the immediately previous atom
1175: and appending ? and + as needed. Note that the first copy of the
1176: atom is left in place, except in the special_case of a zero-repeat
1177: (i.e., {0}).
1178: */
1179: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1180: if (firstnum < 2) {
1181: /* 0 or 1: should be handled before you get here */
1182: FATAL("internal error");
1183: } else {
1184: return replace_repeat(reptok, reptoklen, atom, atomlen,
1185: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1186: }
1187: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1188: if (firstnum == 0) { /* {0} or {0,0} */
1189: /* This case is unusual because the resulting
1190: replacement string might actually be SMALLER than
1191: the original ERE */
1192: return replace_repeat(reptok, reptoklen, atom, atomlen,
1193: firstnum, secondnum, REPEAT_ZERO);
1194: } else { /* (firstnum >= 1) */
1195: return replace_repeat(reptok, reptoklen, atom, atomlen,
1196: firstnum, secondnum, REPEAT_SIMPLE);
1197: }
1198: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1199: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1200: return replace_repeat(reptok, reptoklen, atom, atomlen,
1201: firstnum, secondnum, REPEAT_WITH_Q);
1202: } else { /* Error - shouldn't be here (n>m) */
1203: FATAL("internal error");
1204: }
1205: return 0;
1206: }
1.11 millert 1207:
1.38 millert 1208: extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1209:
1.1 tholo 1210: int relex(void) /* lexical analyzer for reparse */
1211: {
1.5 kstailey 1212: int c, n;
1.1 tholo 1213: int cflag;
1.25 millert 1214: static uschar *buf = NULL;
1.5 kstailey 1215: static int bufsz = 100;
1.10 millert 1216: uschar *bp;
1.31 millert 1217: const struct charclass *cc;
1.12 millert 1218: int i;
1.29 millert 1219: int num, m;
1220: bool commafound, digitfound;
1.23 millert 1221: const uschar *startreptok;
1.24 millert 1222: static int parens = 0;
1.23 millert 1223:
1224: rescan:
1225: starttok = prestr;
1.1 tholo 1226:
1.38 millert 1227: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1228: prestr += n;
1229: starttok = prestr;
1230: return CHAR;
1231: }
1232:
1.1 tholo 1233: switch (c = *prestr++) {
1234: case '|': return OR;
1235: case '*': return STAR;
1236: case '+': return PLUS;
1237: case '?': return QUEST;
1238: case '.': return DOT;
1239: case '\0': prestr--; return '\0';
1240: case '^':
1241: case '$':
1.24 millert 1242: return c;
1.1 tholo 1243: case '(':
1.24 millert 1244: parens++;
1245: return c;
1.1 tholo 1246: case ')':
1.24 millert 1247: if (parens) {
1248: parens--;
1249: return c;
1250: }
1251: /* unmatched close parenthesis; per POSIX, treat as literal */
1252: rlxval = c;
1253: return CHAR;
1.1 tholo 1254: case '\\':
1.17 millert 1255: rlxval = quoted(&prestr);
1.1 tholo 1256: return CHAR;
1257: default:
1258: rlxval = c;
1259: return CHAR;
1.25 millert 1260: case '[':
1.35 millert 1261: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1262: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1263: bp = buf;
1.1 tholo 1264: if (*prestr == '^') {
1265: cflag = 1;
1266: prestr++;
1267: }
1268: else
1269: cflag = 0;
1.38 millert 1270: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1271: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1272: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1273: for (; ; ) {
1.38 millert 1274: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1275: for (i = 0; i < n; i++)
1276: *bp++ = *prestr++;
1277: continue;
1278: }
1.1 tholo 1279: if ((c = *prestr++) == '\\') {
1.5 kstailey 1280: *bp++ = '\\';
1.1 tholo 1281: if ((c = *prestr++) == '\0')
1.8 millert 1282: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1283: *bp++ = c;
1.10 millert 1284: /* } else if (c == '\n') { */
1285: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1286: } else if (c == '[' && *prestr == ':') {
1287: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1288: for (cc = charclasses; cc->cc_name; cc++)
1289: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1290: break;
1291: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1292: prestr[2 + cc->cc_namelen] == ']') {
1293: prestr += cc->cc_namelen + 3;
1.22 millert 1294: /*
1295: * BUG: We begin at 1, instead of 0, since we
1296: * would otherwise prematurely terminate the
1297: * string for classes like [[:cntrl:]]. This
1298: * means that we can't match the NUL character,
1299: * not without first adapting the entire
1300: * program to track each string's length.
1301: */
1.23 millert 1302: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1303: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1304: FATAL("out of space for reg expr %.10s...", lastre);
1305: if (cc->cc_func(i)) {
1.32 millert 1306: /* escape backslash */
1307: if (i == '\\') {
1308: *bp++ = '\\';
1309: n++;
1310: }
1311:
1.12 millert 1312: *bp++ = i;
1313: n++;
1314: }
1315: }
1.11 millert 1316: } else
1317: *bp++ = c;
1.23 millert 1318: } else if (c == '[' && *prestr == '.') {
1319: char collate_char;
1320: prestr++;
1321: collate_char = *prestr++;
1322: if (*prestr == '.' && prestr[1] == ']') {
1323: prestr += 2;
1324: /* Found it: map via locale TBD: for
1325: now, simply return this char. This
1326: is sufficient to pass conformance
1327: test awk.ex 156
1328: */
1329: if (*prestr == ']') {
1330: prestr++;
1331: rlxval = collate_char;
1332: return CHAR;
1333: }
1334: }
1335: } else if (c == '[' && *prestr == '=') {
1336: char equiv_char;
1337: prestr++;
1338: equiv_char = *prestr++;
1339: if (*prestr == '=' && prestr[1] == ']') {
1340: prestr += 2;
1341: /* Found it: map via locale TBD: for now
1342: simply return this char. This is
1343: sufficient to pass conformance test
1344: awk.ex 156
1345: */
1346: if (*prestr == ']') {
1347: prestr++;
1348: rlxval = equiv_char;
1349: return CHAR;
1350: }
1351: }
1.1 tholo 1352: } else if (c == '\0') {
1.8 millert 1353: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1354: } else if (bp == buf) { /* 1st char is special */
1355: *bp++ = c;
1.1 tholo 1356: } else if (c == ']') {
1.5 kstailey 1357: *bp++ = 0;
1.10 millert 1358: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1359: if (cflag == 0)
1360: return CCL;
1361: else
1362: return NCCL;
1363: } else
1.5 kstailey 1364: *bp++ = c;
1.1 tholo 1365: }
1.23 millert 1366: break;
1367: case '{':
1368: if (isdigit(*(prestr))) {
1369: num = 0; /* Process as a repetition */
1370: n = -1; m = -1;
1.29 millert 1371: commafound = false;
1372: digitfound = false;
1.23 millert 1373: startreptok = prestr-1;
1374: /* Remember start of previous atom here ? */
1375: } else { /* just a { char, not a repetition */
1376: rlxval = c;
1377: return CHAR;
1378: }
1379: for (; ; ) {
1380: if ((c = *prestr++) == '}') {
1381: if (commafound) {
1382: if (digitfound) { /* {n,m} */
1383: m = num;
1.30 millert 1384: if (m < n)
1.23 millert 1385: FATAL("illegal repetition expression: class %.20s",
1386: lastre);
1.30 millert 1387: if (n == 0 && m == 1) {
1.23 millert 1388: return QUEST;
1389: }
1390: } else { /* {n,} */
1.30 millert 1391: if (n == 0)
1392: return STAR;
1393: else if (n == 1)
1394: return PLUS;
1.23 millert 1395: }
1396: } else {
1397: if (digitfound) { /* {n} same as {n,n} */
1398: n = num;
1399: m = num;
1400: } else { /* {} */
1401: FATAL("illegal repetition expression: class %.20s",
1402: lastre);
1403: }
1404: }
1405: if (repeat(starttok, prestr-starttok, lastatom,
1406: startreptok - lastatom, n, m) > 0) {
1.30 millert 1407: if (n == 0 && m == 0) {
1.31 millert 1408: return ZERO;
1.23 millert 1409: }
1410: /* must rescan input for next token */
1411: goto rescan;
1412: }
1413: /* Failed to replace: eat up {...} characters
1414: and treat like just PLUS */
1415: return PLUS;
1416: } else if (c == '\0') {
1417: FATAL("nonterminated character class %.20s",
1418: lastre);
1419: } else if (isdigit(c)) {
1420: num = 10 * num + c - '0';
1.29 millert 1421: digitfound = true;
1.23 millert 1422: } else if (c == ',') {
1423: if (commafound)
1424: FATAL("illegal repetition expression: class %.20s",
1425: lastre);
1426: /* looking for {n,} or {n,m} */
1.29 millert 1427: commafound = true;
1.23 millert 1428: n = num;
1.29 millert 1429: digitfound = false; /* reset */
1.23 millert 1430: num = 0;
1431: } else {
1432: FATAL("illegal repetition expression: class %.20s",
1433: lastre);
1434: }
1435: }
1436: break;
1.1 tholo 1437: }
1438: }
1439:
1440: int cgoto(fa *f, int s, int c)
1441: {
1.27 millert 1442: int *p, *q;
1.1 tholo 1443: int i, j, k;
1444:
1.38 millert 1445: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1446: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1447: resizesetvec(__func__);
1.1 tholo 1448: }
1449: for (i = 0; i <= f->accept; i++)
1450: setvec[i] = 0;
1451: setcnt = 0;
1.27 millert 1452: resize_state(f, s);
1.1 tholo 1453: /* compute positions of gototab[s,c] into setvec */
1454: p = f->posns[s];
1455: for (i = 1; i <= *p; i++) {
1456: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1457: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1458: || (k == DOT && c != 0 && c != HAT)
1459: || (k == ALL && c != 0)
1.15 millert 1460: || (k == EMPTYRE && c != 0)
1.38 millert 1461: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1462: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1463: q = f->re[p[i]].lfollow;
1464: for (j = 1; j <= *q; j++) {
1465: if (q[j] >= maxsetvec) {
1.27 millert 1466: resizesetvec(__func__);
1.1 tholo 1467: }
1468: if (setvec[q[j]] == 0) {
1469: setcnt++;
1470: setvec[q[j]] = 1;
1471: }
1472: }
1473: }
1474: }
1475: }
1476: /* determine if setvec is a previous state */
1477: tmpset[0] = setcnt;
1478: j = 1;
1479: for (i = f->accept; i >= 0; i--)
1480: if (setvec[i]) {
1481: tmpset[j++] = i;
1482: }
1.27 millert 1483: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1484: /* tmpset == previous state? */
1485: for (i = 1; i <= f->curstat; i++) {
1486: p = f->posns[i];
1487: if ((k = tmpset[0]) != p[0])
1488: goto different;
1489: for (j = 1; j <= k; j++)
1490: if (tmpset[j] != p[j])
1491: goto different;
1492: /* setvec is state i */
1.30 millert 1493: if (c != HAT)
1.38 millert 1494: set_gototab(f, s, c, i);
1.1 tholo 1495: return i;
1496: different:;
1497: }
1498:
1499: /* add tmpset to current set of states */
1.27 millert 1500: ++(f->curstat);
1501: resize_state(f, f->curstat);
1.1 tholo 1502: xfree(f->posns[f->curstat]);
1.27 millert 1503: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1504:
1505: f->posns[f->curstat] = p;
1.30 millert 1506: if (c != HAT)
1.38 millert 1507: set_gototab(f, s, c, f->curstat);
1.1 tholo 1508: for (i = 0; i <= setcnt; i++)
1509: p[i] = tmpset[i];
1510: if (setvec[f->accept])
1511: f->out[f->curstat] = 1;
1512: else
1513: f->out[f->curstat] = 0;
1514: return f->curstat;
1515: }
1516:
1517:
1518: void freefa(fa *f) /* free a finite automaton */
1519: {
1520: int i;
1521:
1522: if (f == NULL)
1523: return;
1.27 millert 1524: for (i = 0; i < f->state_count; i++)
1525: xfree(f->gototab[i])
1.1 tholo 1526: for (i = 0; i <= f->curstat; i++)
1527: xfree(f->posns[i]);
1528: for (i = 0; i <= f->accept; i++) {
1529: xfree(f->re[i].lfollow);
1530: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1531: xfree(f->re[i].lval.np);
1.1 tholo 1532: }
1533: xfree(f->restr);
1.27 millert 1534: xfree(f->out);
1535: xfree(f->posns);
1536: xfree(f->gototab);
1.1 tholo 1537: xfree(f);
1538: }