Annotation of src/usr.bin/awk/b.c, Revision 1.47
1.47 ! millert 1: /* $OpenBSD: b.c,v 1.46 2023/11/15 18:48:13 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.43 millert 84: extern int u8_nextlen(const char *s);
85:
1.38 millert 86:
87: /* utf-8 mechanism:
88:
89: For most of Awk, utf-8 strings just "work", since they look like
90: null-terminated sequences of 8-bit bytes.
91:
92: Functions like length(), index(), and substr() have to operate
93: in units of utf-8 characters. The u8_* functions in run.c
94: handle this.
95:
96: Regular expressions are more complicated, since the basic
97: mechanism of the goto table used 8-bit byte indices into the
98: gototab entries to compute the next state. Unicode is a lot
99: bigger, so the gototab entries are now structs with a character
100: and a next state, and there is a linear search of the characters
101: to find the state. (Yes, this is slower, by a significant
102: amount. Tough.)
103:
104: Throughout the RE mechanism in b.c, utf-8 characters are
105: converted to their utf-32 value. This mostly shows up in
106: cclenter, which expands character class ranges like a-z and now
107: alpha-omega. The size of a gototab array is still about 256.
108: This should be dynamic, but for now things work ok for a single
109: code page of Unicode, which is the most likely case.
110:
111: The code changes are localized in run.c and b.c. I have added a
112: handful of functions to somewhat better hide the implementation,
113: but a lot more could be done.
114:
115: */
116:
117: static int get_gototab(fa*, int, int);
118: static int set_gototab(fa*, int, int, int);
1.44 millert 119: static void reset_gototab(fa*, int);
1.38 millert 120: extern int u8_rune(int *, const uschar *);
121:
1.27 millert 122: static int *
123: intalloc(size_t n, const char *f)
124: {
1.35 millert 125: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 126: if (p == NULL)
127: overflo(f);
128: return p;
129: }
130:
131: static void
132: allocsetvec(const char *f)
133: {
134: maxsetvec = MAXLIN;
1.35 millert 135: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
136: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 137: if (setvec == NULL || tmpset == NULL)
138: overflo(f);
139: }
140:
141: static void
142: resizesetvec(const char *f)
143: {
1.35 millert 144: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
145: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 146: if (setvec == NULL || tmpset == NULL)
147: overflo(f);
148: maxsetvec *= 4;
149: }
150:
151: static void
152: resize_state(fa *f, int state)
153: {
1.38 millert 154: gtt **p;
1.35 millert 155: uschar *p2;
156: int **p3;
1.27 millert 157: int i, new_count;
158:
159: if (++state < f->state_count)
160: return;
161:
162: new_count = state + 10; /* needs to be tuned */
163:
1.38 millert 164: p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 165: if (p == NULL)
166: goto out;
167: f->gototab = p;
168:
1.35 millert 169: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
170: if (p2 == NULL)
1.27 millert 171: goto out;
1.35 millert 172: f->out = p2;
1.27 millert 173:
1.35 millert 174: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
175: if (p3 == NULL)
1.27 millert 176: goto out;
1.35 millert 177: f->posns = p3;
1.27 millert 178:
179: for (i = f->state_count; i < new_count; ++i) {
1.38 millert 180: f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 181: if (f->gototab[i] == NULL)
182: goto out;
183: f->out[i] = 0;
184: f->posns[i] = NULL;
185: }
1.38 millert 186: f->gototab_len = NCHARS; /* should be variable, growable */
1.27 millert 187: f->state_count = new_count;
188: return;
189: out:
190: overflo(__func__);
191: }
192:
1.29 millert 193: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 194: {
195: int i, use, nuse;
196: fa *pfa;
1.6 millert 197: static int now = 1;
1.1 tholo 198:
1.25 millert 199: if (setvec == NULL) { /* first time through any RE */
1.27 millert 200: allocsetvec(__func__);
1.1 tholo 201: }
202:
1.29 millert 203: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 204: return mkdfa(s, anchor);
205: for (i = 0; i < nfatab; i++) /* is it there already? */
206: if (fatab[i]->anchor == anchor
1.11 millert 207: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 208: fatab[i]->use = now++;
1.1 tholo 209: return fatab[i];
1.10 millert 210: }
1.1 tholo 211: pfa = mkdfa(s, anchor);
212: if (nfatab < NFA) { /* room for another */
213: fatab[nfatab] = pfa;
1.6 millert 214: fatab[nfatab]->use = now++;
1.1 tholo 215: nfatab++;
216: return pfa;
217: }
218: use = fatab[0]->use; /* replace least-recently used */
219: nuse = 0;
220: for (i = 1; i < nfatab; i++)
221: if (fatab[i]->use < use) {
222: use = fatab[i]->use;
223: nuse = i;
224: }
225: freefa(fatab[nuse]);
226: fatab[nuse] = pfa;
1.6 millert 227: pfa->use = now++;
1.1 tholo 228: return pfa;
229: }
230:
1.29 millert 231: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 232: /* anchor = true for anchored matches, else false */
1.1 tholo 233: {
234: Node *p, *p1;
235: fa *f;
236:
1.28 millert 237: firstbasestr = (const uschar *) s;
1.23 millert 238: basestr = firstbasestr;
1.1 tholo 239: p = reparse(s);
240: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
241: /* put ALL STAR in front of reg. exp. */
242: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
243: /* put FINAL after reg. exp. */
244:
245: poscnt = 0;
246: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 247: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 248: overflo(__func__);
1.1 tholo 249: f->accept = poscnt-1; /* penter has computed number of positions in re */
250: cfoll(f, p1); /* set up follow sets */
251: freetr(p1);
1.27 millert 252: resize_state(f, 1);
253: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
254: f->posns[1] = intalloc(1, __func__);
1.1 tholo 255: *f->posns[1] = 0;
256: f->initstat = makeinit(f, anchor);
257: f->anchor = anchor;
1.10 millert 258: f->restr = (uschar *) tostring(s);
1.23 millert 259: if (firstbasestr != basestr) {
260: if (basestr)
261: xfree(basestr);
262: }
1.1 tholo 263: return f;
264: }
265:
1.29 millert 266: int makeinit(fa *f, bool anchor)
1.1 tholo 267: {
268: int i, k;
269:
270: f->curstat = 2;
271: f->out[2] = 0;
272: k = *(f->re[0].lfollow);
1.25 millert 273: xfree(f->posns[2]);
1.27 millert 274: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 275: for (i = 0; i <= k; i++) {
1.1 tholo 276: (f->posns[2])[i] = (f->re[0].lfollow)[i];
277: }
278: if ((f->posns[2])[1] == f->accept)
279: f->out[2] = 1;
1.44 millert 280: reset_gototab(f, 2);
1.1 tholo 281: f->curstat = cgoto(f, 2, HAT);
282: if (anchor) {
283: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 284: for (i = 0; i < k; i++) {
1.1 tholo 285: (f->posns[0])[i] = (f->posns[2])[i];
286: }
287:
288: f->out[0] = f->out[2];
289: if (f->curstat != 2)
290: --(*f->posns[f->curstat]);
291: }
292: return f->curstat;
293: }
294:
295: void penter(Node *p) /* set up parent pointers and leaf indices */
296: {
297: switch (type(p)) {
1.15 millert 298: ELEAF
1.1 tholo 299: LEAF
1.7 millert 300: info(p) = poscnt;
1.1 tholo 301: poscnt++;
302: break;
303: UNARY
304: penter(left(p));
305: parent(left(p)) = p;
306: break;
307: case CAT:
308: case OR:
309: penter(left(p));
310: penter(right(p));
311: parent(left(p)) = p;
312: parent(right(p)) = p;
313: break;
1.31 millert 314: case ZERO:
315: break;
1.1 tholo 316: default: /* can't happen */
1.8 millert 317: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 318: break;
319: }
320: }
321:
322: void freetr(Node *p) /* free parse tree */
323: {
324: switch (type(p)) {
1.15 millert 325: ELEAF
1.1 tholo 326: LEAF
327: xfree(p);
328: break;
329: UNARY
1.31 millert 330: case ZERO:
1.1 tholo 331: freetr(left(p));
332: xfree(p);
333: break;
334: case CAT:
335: case OR:
336: freetr(left(p));
337: freetr(right(p));
338: xfree(p);
339: break;
340: default: /* can't happen */
1.8 millert 341: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 342: break;
343: }
344: }
345:
346: /* in the parsing of regular expressions, metacharacters like . have */
347: /* to be seen literally; \056 is not a metacharacter. */
348:
1.38 millert 349: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 350: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 351: const uschar *p;
1.1 tholo 352: int n = 0;
1.2 millert 353: int i;
1.1 tholo 354:
1.38 millert 355: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 356: if (isdigit(*p))
357: n = 16 * n + *p - '0';
358: else if (*p >= 'a' && *p <= 'f')
359: n = 16 * n + *p - 'a' + 10;
360: else if (*p >= 'A' && *p <= 'F')
361: n = 16 * n + *p - 'A' + 10;
362: }
1.28 millert 363: *pp = p;
1.1 tholo 364: return n;
365: }
366:
1.2 millert 367: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 368:
1.28 millert 369: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 370: /* and increment *pp */
371: {
1.28 millert 372: const uschar *p = *pp;
1.1 tholo 373: int c;
374:
1.38 millert 375: /* BUG: should advance by utf-8 char even if makes no sense */
376:
377: if ((c = *p++) == 't') {
1.1 tholo 378: c = '\t';
1.38 millert 379: } else if (c == 'n') {
1.1 tholo 380: c = '\n';
1.38 millert 381: } else if (c == 'f') {
1.1 tholo 382: c = '\f';
1.38 millert 383: } else if (c == 'r') {
1.1 tholo 384: c = '\r';
1.38 millert 385: } else if (c == 'b') {
1.1 tholo 386: c = '\b';
1.38 millert 387: } else if (c == 'v') {
1.25 millert 388: c = '\v';
1.38 millert 389: } else if (c == 'a') {
1.25 millert 390: c = '\a';
1.38 millert 391: } else if (c == '\\') {
1.1 tholo 392: c = '\\';
1.38 millert 393: } else if (c == 'x') { /* 2 hex digits follow */
394: c = hexstr(&p, 2); /* this adds a null if number is invalid */
395: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
396: c = hexstr(&p, 8);
1.1 tholo 397: } else if (isoctdigit(c)) { /* \d \dd \ddd */
398: int n = c - '0';
399: if (isoctdigit(*p)) {
400: n = 8 * n + *p++ - '0';
401: if (isoctdigit(*p))
402: n = 8 * n + *p++ - '0';
403: }
404: c = n;
405: } /* else */
406: /* c = c; */
407: *pp = p;
408: return c;
409: }
410:
1.38 millert 411: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 412: {
413: int i, c, c2;
1.38 millert 414: int n;
415: const uschar *p = (const uschar *) argp;
416: int *bp, *retp;
417: static int *buf = NULL;
1.5 kstailey 418: static int bufsz = 100;
1.1 tholo 419:
1.38 millert 420: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 421: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 422: bp = buf;
1.38 millert 423: for (i = 0; *p != 0; ) {
424: n = u8_rune(&c, p);
425: p += n;
1.1 tholo 426: if (c == '\\') {
1.17 millert 427: c = quoted(&p);
1.5 kstailey 428: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 429: if (*p != 0) {
1.5 kstailey 430: c = bp[-1];
1.38 millert 431: /* c2 = *p++; */
432: n = u8_rune(&c2, p);
433: p += n;
1.1 tholo 434: if (c2 == '\\')
1.38 millert 435: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 436: if (c > c2) { /* empty; ignore */
1.5 kstailey 437: bp--;
1.1 tholo 438: i--;
439: continue;
440: }
441: while (c < c2) {
1.38 millert 442: if (i >= bufsz) {
1.45 millert 443: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 444: if (buf == NULL)
445: FATAL("out of space for character class [%.10s...] 2", p);
446: bufsz *= 2;
447: bp = buf + i;
448: }
1.5 kstailey 449: *bp++ = ++c;
1.1 tholo 450: i++;
451: }
452: continue;
453: }
454: }
1.38 millert 455: if (i >= bufsz) {
1.45 millert 456: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 457: if (buf == NULL)
458: FATAL("out of space for character class [%.10s...] 2", p);
459: bufsz *= 2;
460: bp = buf + i;
461: }
1.5 kstailey 462: *bp++ = c;
1.1 tholo 463: i++;
464: }
1.5 kstailey 465: *bp = 0;
1.38 millert 466: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
467: /* xfree(op); BUG: what are we freeing here? */
468: retp = (int *) calloc(bp-buf+1, sizeof(int));
469: for (i = 0; i < bp-buf+1; i++)
470: retp[i] = buf[i];
471: return retp;
1.1 tholo 472: }
473:
1.11 millert 474: void overflo(const char *s)
1.1 tholo 475: {
1.27 millert 476: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 477: }
478:
479: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
480: {
481: int i;
1.4 millert 482: int *p;
1.1 tholo 483:
484: switch (type(v)) {
1.15 millert 485: ELEAF
1.1 tholo 486: LEAF
1.7 millert 487: f->re[info(v)].ltype = type(v);
488: f->re[info(v)].lval.np = right(v);
1.1 tholo 489: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 490: resizesetvec(__func__);
1.1 tholo 491: }
492: for (i = 0; i <= f->accept; i++)
493: setvec[i] = 0;
494: setcnt = 0;
495: follow(v); /* computes setvec and setcnt */
1.27 millert 496: p = intalloc(setcnt + 1, __func__);
1.7 millert 497: f->re[info(v)].lfollow = p;
1.1 tholo 498: *p = setcnt;
499: for (i = f->accept; i >= 0; i--)
500: if (setvec[i] == 1)
501: *++p = i;
502: break;
503: UNARY
504: cfoll(f,left(v));
505: break;
506: case CAT:
507: case OR:
508: cfoll(f,left(v));
509: cfoll(f,right(v));
510: break;
1.31 millert 511: case ZERO:
512: break;
1.1 tholo 513: default: /* can't happen */
1.8 millert 514: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 515: }
516: }
517:
518: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 519: /* returns 0 if p matches empty string */
1.1 tholo 520: {
1.4 millert 521: int b, lp;
1.1 tholo 522:
523: switch (type(p)) {
1.15 millert 524: ELEAF
1.1 tholo 525: LEAF
1.7 millert 526: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 527: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 528: resizesetvec(__func__);
1.1 tholo 529: }
1.15 millert 530: if (type(p) == EMPTYRE) {
531: setvec[lp] = 0;
532: return(0);
533: }
1.1 tholo 534: if (setvec[lp] != 1) {
535: setvec[lp] = 1;
536: setcnt++;
537: }
1.41 millert 538: if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1 tholo 539: return(0); /* empty CCL */
1.30 millert 540: return(1);
1.1 tholo 541: case PLUS:
1.30 millert 542: if (first(left(p)) == 0)
543: return(0);
1.1 tholo 544: return(1);
545: case STAR:
546: case QUEST:
547: first(left(p));
548: return(0);
549: case CAT:
550: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
551: return(1);
552: case OR:
553: b = first(right(p));
554: if (first(left(p)) == 0 || b == 0) return(0);
555: return(1);
1.31 millert 556: case ZERO:
557: return 0;
1.1 tholo 558: }
1.8 millert 559: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 560: return(-1);
561: }
562:
563: void follow(Node *v) /* collects leaves that can follow v into setvec */
564: {
565: Node *p;
566:
567: if (type(v) == FINAL)
568: return;
569: p = parent(v);
570: switch (type(p)) {
571: case STAR:
572: case PLUS:
573: first(v);
574: follow(p);
575: return;
576:
577: case OR:
578: case QUEST:
579: follow(p);
580: return;
581:
582: case CAT:
583: if (v == left(p)) { /* v is left child of p */
584: if (first(right(p)) == 0) {
585: follow(p);
586: return;
587: }
588: } else /* v is right child */
589: follow(p);
590: return;
591: }
592: }
593:
1.38 millert 594: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 595: {
1.38 millert 596: int *s = (int *) sarg;
1.10 millert 597:
1.1 tholo 598: while (*s)
599: if (c == *s++)
600: return(1);
601: return(0);
602: }
603:
1.38 millert 604: static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
605: {
606: int i;
607: for (i = 0; i < f->gototab_len; i++) {
608: if (f->gototab[state][i].ch == 0)
609: break;
610: if (f->gototab[state][i].ch == ch)
611: return f->gototab[state][i].state;
612: }
613: return 0;
614: }
615:
1.44 millert 616: static void reset_gototab(fa *f, int state) /* hide gototab inplementation */
617: {
618: memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab));
619: }
620:
1.38 millert 621: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
622: {
623: int i;
624: for (i = 0; i < f->gototab_len; i++) {
625: if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
626: f->gototab[state][i].ch = ch;
627: f->gototab[state][i].state = val;
628: return val;
629: }
630: }
631: overflo(__func__);
632: return val; /* not used anywhere at the moment */
633: }
634:
1.11 millert 635: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 636: {
637: int s, ns;
1.38 millert 638: int n;
639: int rune;
1.28 millert 640: const uschar *p = (const uschar *) p0;
1.1 tholo 641:
1.38 millert 642: /* return pmatch(f, p0); does it matter whether longest or shortest? */
643:
1.27 millert 644: s = f->initstat;
645: assert (s < f->state_count);
646:
1.1 tholo 647: if (f->out[s])
648: return(1);
649: do {
1.15 millert 650: /* assert(*p < NCHARS); */
1.38 millert 651: n = u8_rune(&rune, p);
652: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 653: s = ns;
654: else
1.38 millert 655: s = cgoto(f, s, rune);
1.1 tholo 656: if (f->out[s])
657: return(1);
1.38 millert 658: if (*p == 0)
659: break;
660: p += n;
661: } while (1); /* was *p++ != 0 */
1.1 tholo 662: return(0);
663: }
664:
1.11 millert 665: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 666: {
667: int s, ns;
1.38 millert 668: int n;
669: int rune;
1.28 millert 670: const uschar *p = (const uschar *) p0;
671: const uschar *q;
1.1 tholo 672:
1.27 millert 673: s = f->initstat;
674: assert(s < f->state_count);
675:
1.28 millert 676: patbeg = (const char *)p;
1.1 tholo 677: patlen = -1;
678: do {
679: q = p;
680: do {
681: if (f->out[s]) /* final state */
682: patlen = q-p;
1.15 millert 683: /* assert(*q < NCHARS); */
1.38 millert 684: n = u8_rune(&rune, q);
685: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 686: s = ns;
687: else
1.38 millert 688: s = cgoto(f, s, rune);
1.27 millert 689:
690: assert(s < f->state_count);
691:
1.9 deraadt 692: if (s == 1) { /* no transition */
1.1 tholo 693: if (patlen >= 0) {
1.28 millert 694: patbeg = (const char *) p;
1.1 tholo 695: return(1);
696: }
697: else
698: goto nextin; /* no match */
1.9 deraadt 699: }
1.38 millert 700: if (*q == 0)
701: break;
702: q += n;
703: } while (1);
704: q++; /* was *q++ */
1.1 tholo 705: if (f->out[s])
706: patlen = q-p-1; /* don't count $ */
707: if (patlen >= 0) {
1.28 millert 708: patbeg = (const char *) p;
1.1 tholo 709: return(1);
710: }
711: nextin:
712: s = 2;
1.38 millert 713: if (*p == 0)
714: break;
715: n = u8_rune(&rune, p);
716: p += n;
717: } while (1); /* was *p++ */
1.1 tholo 718: return (0);
719: }
720:
1.11 millert 721: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 722: {
723: int s, ns;
1.38 millert 724: int n;
725: int rune;
1.28 millert 726: const uschar *p = (const uschar *) p0;
727: const uschar *q;
1.1 tholo 728:
1.27 millert 729: s = f->initstat;
730: assert(s < f->state_count);
731:
1.28 millert 732: patbeg = (const char *)p;
1.1 tholo 733: patlen = -1;
734: while (*p) {
735: q = p;
736: do {
737: if (f->out[s]) /* final state */
738: patlen = q-p;
1.15 millert 739: /* assert(*q < NCHARS); */
1.38 millert 740: n = u8_rune(&rune, q);
741: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 742: s = ns;
743: else
1.38 millert 744: s = cgoto(f, s, rune);
1.9 deraadt 745: if (s == 1) { /* no transition */
1.1 tholo 746: if (patlen > 0) {
1.28 millert 747: patbeg = (const char *) p;
1.1 tholo 748: return(1);
749: } else
750: goto nnextin; /* no nonempty match */
1.9 deraadt 751: }
1.38 millert 752: if (*q == 0)
753: break;
754: q += n;
755: } while (1);
756: q++;
1.1 tholo 757: if (f->out[s])
758: patlen = q-p-1; /* don't count $ */
759: if (patlen > 0 ) {
1.28 millert 760: patbeg = (const char *) p;
1.1 tholo 761: return(1);
762: }
763: nnextin:
764: s = 2;
765: p++;
766: }
767: return (0);
768: }
769:
1.43 millert 770:
771: #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
772:
773: // Read one rune at a time from the given FILE*. Return both
774: // the bytes and the actual rune.
775:
776: struct runedata {
777: int rune;
778: size_t len;
779: char bytes[6];
780: };
781:
782: struct runedata getrune(FILE *fp)
1.38 millert 783: {
1.43 millert 784: struct runedata result;
785: int c, i, next;
786:
787: memset(&result, 0, sizeof(result));
788:
789: c = getc(fp);
790: if (c == EOF)
791: return result; // result.rune == 0 --> EOF
792: else if (c < 128 || awk_mb_cur_max == 1) {
793: result.bytes[0] = c;
794: result.len = 1;
795: result.rune = c;
796:
797: return result;
798: }
799:
800: // need to get bytes and fill things in
801: result.bytes[0] = c;
802: result.len = 1;
803:
804: next = 1;
805: for (i = 1; i < MAX_UTF_BYTES; i++) {
806: c = getc(fp);
807: if (c == EOF)
1.38 millert 808: break;
1.43 millert 809: result.bytes[next++] = c;
810: result.len++;
811: }
812:
813: // put back any extra input bytes
814: int actual_len = u8_nextlen(result.bytes);
815: while (result.len > actual_len) {
816: ungetc(result.bytes[--result.len], fp);
1.38 millert 817: }
818:
1.43 millert 819: result.bytes[result.len] = '\0';
820: (void) u8_rune(& result.rune, (uschar *) result.bytes);
1.38 millert 821:
1.43 millert 822: return result;
1.38 millert 823: }
824:
1.26 millert 825:
826: /*
827: * NAME
828: * fnematch
829: *
830: * DESCRIPTION
831: * A stream-fed version of nematch which transfers characters to a
832: * null-terminated buffer. All characters up to and including the last
833: * character of the matching text or EOF are placed in the buffer. If
834: * a match is found, patbeg and patlen are set appropriately.
835: *
836: * RETURN VALUES
1.29 millert 837: * false No match found.
838: * true Match found.
1.26 millert 839: */
840:
1.29 millert 841: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 842: {
843: char *buf = *pbuf;
844: int bufsize = *pbufsize;
1.43 millert 845: int i, j, k, ns, s;
846: struct runedata r;
1.26 millert 847:
848: s = pfa->initstat;
849: patlen = 0;
850:
851: /*
852: * All indices relative to buf.
853: * i <= j <= k <= bufsize
854: *
1.43 millert 855: * i: origin of active substring (first byte of first character)
856: * j: current character (last byte of current character)
1.26 millert 857: * k: destination of next getc()
858: */
859: i = -1, k = 0;
860: do {
861: j = i++;
862: do {
1.43 millert 863: r = getrune(f);
1.46 millert 864: if (r.len == 0) {
865: r.len = 1; // store NUL byte for EOF
866: }
867: j += r.len;
868: if (j >= bufsize) {
869: if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch"))
870: FATAL("stream '%.30s...' too long", buf);
1.38 millert 871: }
1.43 millert 872: memcpy(buf + k, r.bytes, r.len);
873: k += r.len;
1.26 millert 874:
1.43 millert 875: if ((ns = get_gototab(pfa, s, r.rune)) != 0)
1.26 millert 876: s = ns;
877: else
1.43 millert 878: s = cgoto(pfa, s, r.rune);
1.26 millert 879:
880: if (pfa->out[s]) { /* final state */
881: patlen = j - i + 1;
1.43 millert 882: if (r.rune == 0) /* don't count $ */
1.26 millert 883: patlen--;
884: }
885: } while (buf[j] && s != 1);
886: s = 2;
1.43 millert 887: if (r.len > 1)
888: i += r.len - 1; // i incremented around the loop
1.26 millert 889: } while (buf[i] && !patlen);
890:
891: /* adjbuf() may have relocated a resized buffer. Inform the world. */
892: *pbuf = buf;
893: *pbufsize = bufsize;
894:
895: if (patlen) {
896: patbeg = buf + i;
897: /*
898: * Under no circumstances is the last character fed to
899: * the automaton part of the match. It is EOF's nullbyte,
900: * or it sent the automaton into a state with no further
901: * transitions available (s==1), or both. Room for a
902: * terminating nullbyte is guaranteed.
903: *
904: * ungetc any chars after the end of matching text
905: * (except for EOF's nullbyte, if present) and null
906: * terminate the buffer.
907: */
1.47 ! millert 908: for (; r.len > 0; r.len--)
! 909: if (buf[--k] && ungetc(buf[k], f) == EOF)
! 910: FATAL("unable to ungetc '%c'", buf[k]);
! 911: buf[k-patlen] = '\0';
1.29 millert 912: return true;
1.26 millert 913: }
914: else
1.29 millert 915: return false;
1.26 millert 916: }
917:
1.11 millert 918: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 919: { /* uses relex() to scan regular expression */
920: Node *np;
921:
1.32 millert 922: DPRINTF("reparse <%s>\n", p);
1.28 millert 923: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 924: rtok = relex();
1.11 millert 925: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 926: if (rtok == '\0') {
1.11 millert 927: /* FATAL("empty regular expression"); previous */
1.15 millert 928: return(op2(EMPTYRE, NIL, NIL));
929: }
1.1 tholo 930: np = regexp();
931: if (rtok != '\0')
1.8 millert 932: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 933: return(np);
934: }
935:
936: Node *regexp(void) /* top-level parse of reg expr */
937: {
938: return (alt(concat(primary())));
939: }
940:
941: Node *primary(void)
942: {
943: Node *np;
1.23 millert 944: int savelastatom;
1.1 tholo 945:
946: switch (rtok) {
947: case CHAR:
1.23 millert 948: lastatom = starttok;
1.7 millert 949: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 950: rtok = relex();
951: return (unary(np));
952: case ALL:
953: rtok = relex();
954: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 955: case EMPTYRE:
956: rtok = relex();
1.23 millert 957: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 958: case DOT:
1.23 millert 959: lastatom = starttok;
1.1 tholo 960: rtok = relex();
961: return (unary(op2(DOT, NIL, NIL)));
962: case CCL:
1.28 millert 963: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 964: lastatom = starttok;
1.1 tholo 965: rtok = relex();
966: return (unary(np));
967: case NCCL:
1.28 millert 968: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 969: lastatom = starttok;
1.1 tholo 970: rtok = relex();
971: return (unary(np));
972: case '^':
973: rtok = relex();
1.7 millert 974: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 975: case '$':
976: rtok = relex();
977: return (unary(op2(CHAR, NIL, NIL)));
978: case '(':
1.23 millert 979: lastatom = starttok;
980: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 981: rtok = relex();
982: if (rtok == ')') { /* special pleading for () */
983: rtok = relex();
1.42 millert 984: return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1 tholo 985: }
986: np = regexp();
987: if (rtok == ')') {
1.23 millert 988: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 989: rtok = relex();
990: return (unary(np));
991: }
992: else
1.8 millert 993: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 994: default:
1.8 millert 995: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 996: }
997: return 0; /*NOTREACHED*/
998: }
999:
1000: Node *concat(Node *np)
1001: {
1002: switch (rtok) {
1.23 millert 1003: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 1004: return (concat(op2(CAT, np, primary())));
1.23 millert 1005: case EMPTYRE:
1006: rtok = relex();
1.42 millert 1007: return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23 millert 1008: primary())));
1.1 tholo 1009: }
1010: return (np);
1011: }
1012:
1013: Node *alt(Node *np)
1014: {
1015: if (rtok == OR) {
1016: rtok = relex();
1017: return (alt(op2(OR, np, concat(primary()))));
1018: }
1019: return (np);
1020: }
1021:
1022: Node *unary(Node *np)
1023: {
1024: switch (rtok) {
1025: case STAR:
1026: rtok = relex();
1027: return (unary(op2(STAR, np, NIL)));
1028: case PLUS:
1029: rtok = relex();
1030: return (unary(op2(PLUS, np, NIL)));
1031: case QUEST:
1032: rtok = relex();
1033: return (unary(op2(QUEST, np, NIL)));
1.31 millert 1034: case ZERO:
1035: rtok = relex();
1036: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1037: default:
1038: return (np);
1039: }
1040: }
1041:
1.11 millert 1042: /*
1043: * Character class definitions conformant to the POSIX locale as
1044: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1045: * and operating character sets are both ASCII (ISO646) or supersets
1046: * thereof.
1047: *
1048: * Note that to avoid overflowing the temporary buffer used in
1049: * relex(), the expanded character class (prior to range expansion)
1050: * must be less than twice the size of their full name.
1051: */
1.12 millert 1052:
1053: /* Because isblank doesn't show up in any of the header files on any
1054: * system i use, it's defined here. if some other locale has a richer
1055: * definition of "blank", define HAS_ISBLANK and provide your own
1056: * version.
1057: * the parentheses here are an attempt to find a path through the maze
1058: * of macro definition and/or function and/or version provided. thanks
1059: * to nelson beebe for the suggestion; let's see if it works everywhere.
1060: */
1061:
1062: #ifndef HAS_ISBLANK
1063:
1.17 millert 1064: int (xisblank)(int c)
1.12 millert 1065: {
1066: return c==' ' || c=='\t';
1067: }
1068:
1069: #endif
1070:
1.31 millert 1071: static const struct charclass {
1.11 millert 1072: const char *cc_name;
1073: int cc_namelen;
1.12 millert 1074: int (*cc_func)(int);
1.11 millert 1075: } charclasses[] = {
1.12 millert 1076: { "alnum", 5, isalnum },
1077: { "alpha", 5, isalpha },
1.17 millert 1078: #ifndef HAS_ISBLANK
1.21 millert 1079: { "blank", 5, xisblank },
1.17 millert 1080: #else
1.12 millert 1081: { "blank", 5, isblank },
1.17 millert 1082: #endif
1.12 millert 1083: { "cntrl", 5, iscntrl },
1084: { "digit", 5, isdigit },
1085: { "graph", 5, isgraph },
1086: { "lower", 5, islower },
1087: { "print", 5, isprint },
1088: { "punct", 5, ispunct },
1089: { "space", 5, isspace },
1090: { "upper", 5, isupper },
1091: { "xdigit", 6, isxdigit },
1.11 millert 1092: { NULL, 0, NULL },
1093: };
1094:
1.23 millert 1095: #define REPEAT_SIMPLE 0
1096: #define REPEAT_PLUS_APPENDED 1
1097: #define REPEAT_WITH_Q 2
1098: #define REPEAT_ZERO 3
1099:
1100: static int
1101: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1102: int atomlen, int firstnum, int secondnum, int special_case)
1103: {
1104: int i, j;
1.26 millert 1105: uschar *buf = NULL;
1.23 millert 1106: int ret = 1;
1.31 millert 1107: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1108: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1109: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1110: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1111: int size = prefix_length + suffix_length;
1112:
1113: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1114: size += atomlen*(firstnum-1);
1115: }
1116:
1117: /* Adjust size of buffer for special cases */
1118: if (special_case == REPEAT_PLUS_APPENDED) {
1119: size++; /* for the final + */
1120: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1121: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1122: } else if (special_case == REPEAT_ZERO) {
1123: size += 2; /* just a null ERE: () */
1124: }
1.35 millert 1125: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1126: FATAL("out of space in reg expr %.10s..", lastre);
1127: memcpy(buf, basestr, prefix_length); /* copy prefix */
1128: j = prefix_length;
1129: if (special_case == REPEAT_ZERO) {
1130: j -= atomlen;
1131: buf[j++] = '(';
1132: buf[j++] = ')';
1133: }
1.30 millert 1134: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1135: memcpy(&buf[j], atom, atomlen);
1136: j += atomlen;
1137: }
1138: if (special_case == REPEAT_PLUS_APPENDED) {
1139: buf[j++] = '+';
1140: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1141: if (init_q)
1142: buf[j++] = '?';
1.30 millert 1143: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1144: memcpy(&buf[j], atom, atomlen);
1145: j += atomlen;
1146: buf[j++] = '?';
1147: }
1148: }
1149: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1150: j += suffix_length;
1151: buf[j] = '\0';
1.23 millert 1152: /* free old basestr */
1153: if (firstbasestr != basestr) {
1154: if (basestr)
1155: xfree(basestr);
1156: }
1157: basestr = buf;
1158: prestr = buf + prefix_length;
1159: if (special_case == REPEAT_ZERO) {
1160: prestr -= atomlen;
1161: ret++;
1162: }
1163: return ret;
1164: }
1165:
1166: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1167: int atomlen, int firstnum, int secondnum)
1168: {
1169: /*
1170: In general, the repetition specifier or "bound" is replaced here
1171: by an equivalent ERE string, repeating the immediately previous atom
1172: and appending ? and + as needed. Note that the first copy of the
1173: atom is left in place, except in the special_case of a zero-repeat
1174: (i.e., {0}).
1175: */
1176: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1177: if (firstnum < 2) {
1178: /* 0 or 1: should be handled before you get here */
1179: FATAL("internal error");
1180: } else {
1181: return replace_repeat(reptok, reptoklen, atom, atomlen,
1182: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1183: }
1184: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1185: if (firstnum == 0) { /* {0} or {0,0} */
1186: /* This case is unusual because the resulting
1187: replacement string might actually be SMALLER than
1188: the original ERE */
1189: return replace_repeat(reptok, reptoklen, atom, atomlen,
1190: firstnum, secondnum, REPEAT_ZERO);
1191: } else { /* (firstnum >= 1) */
1192: return replace_repeat(reptok, reptoklen, atom, atomlen,
1193: firstnum, secondnum, REPEAT_SIMPLE);
1194: }
1195: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1196: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1197: return replace_repeat(reptok, reptoklen, atom, atomlen,
1198: firstnum, secondnum, REPEAT_WITH_Q);
1199: } else { /* Error - shouldn't be here (n>m) */
1200: FATAL("internal error");
1201: }
1202: return 0;
1203: }
1.11 millert 1204:
1.38 millert 1205: extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1206:
1.1 tholo 1207: int relex(void) /* lexical analyzer for reparse */
1208: {
1.5 kstailey 1209: int c, n;
1.1 tholo 1210: int cflag;
1.25 millert 1211: static uschar *buf = NULL;
1.5 kstailey 1212: static int bufsz = 100;
1.10 millert 1213: uschar *bp;
1.31 millert 1214: const struct charclass *cc;
1.12 millert 1215: int i;
1.29 millert 1216: int num, m;
1217: bool commafound, digitfound;
1.23 millert 1218: const uschar *startreptok;
1.24 millert 1219: static int parens = 0;
1.23 millert 1220:
1221: rescan:
1222: starttok = prestr;
1.1 tholo 1223:
1.38 millert 1224: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1225: prestr += n;
1226: starttok = prestr;
1227: return CHAR;
1228: }
1229:
1.1 tholo 1230: switch (c = *prestr++) {
1231: case '|': return OR;
1232: case '*': return STAR;
1233: case '+': return PLUS;
1234: case '?': return QUEST;
1235: case '.': return DOT;
1236: case '\0': prestr--; return '\0';
1237: case '^':
1238: case '$':
1.24 millert 1239: return c;
1.1 tholo 1240: case '(':
1.24 millert 1241: parens++;
1242: return c;
1.1 tholo 1243: case ')':
1.24 millert 1244: if (parens) {
1245: parens--;
1246: return c;
1247: }
1248: /* unmatched close parenthesis; per POSIX, treat as literal */
1249: rlxval = c;
1250: return CHAR;
1.1 tholo 1251: case '\\':
1.17 millert 1252: rlxval = quoted(&prestr);
1.1 tholo 1253: return CHAR;
1254: default:
1255: rlxval = c;
1256: return CHAR;
1.25 millert 1257: case '[':
1.35 millert 1258: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1259: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1260: bp = buf;
1.1 tholo 1261: if (*prestr == '^') {
1262: cflag = 1;
1263: prestr++;
1264: }
1265: else
1266: cflag = 0;
1.38 millert 1267: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1268: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1269: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1270: for (; ; ) {
1.38 millert 1271: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1272: for (i = 0; i < n; i++)
1273: *bp++ = *prestr++;
1274: continue;
1275: }
1.1 tholo 1276: if ((c = *prestr++) == '\\') {
1.5 kstailey 1277: *bp++ = '\\';
1.1 tholo 1278: if ((c = *prestr++) == '\0')
1.8 millert 1279: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1280: *bp++ = c;
1.10 millert 1281: /* } else if (c == '\n') { */
1282: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1283: } else if (c == '[' && *prestr == ':') {
1284: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1285: for (cc = charclasses; cc->cc_name; cc++)
1286: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1287: break;
1288: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1289: prestr[2 + cc->cc_namelen] == ']') {
1290: prestr += cc->cc_namelen + 3;
1.22 millert 1291: /*
1292: * BUG: We begin at 1, instead of 0, since we
1293: * would otherwise prematurely terminate the
1294: * string for classes like [[:cntrl:]]. This
1295: * means that we can't match the NUL character,
1296: * not without first adapting the entire
1297: * program to track each string's length.
1298: */
1.23 millert 1299: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1300: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1301: FATAL("out of space for reg expr %.10s...", lastre);
1302: if (cc->cc_func(i)) {
1.32 millert 1303: /* escape backslash */
1304: if (i == '\\') {
1305: *bp++ = '\\';
1306: n++;
1307: }
1308:
1.12 millert 1309: *bp++ = i;
1310: n++;
1311: }
1312: }
1.11 millert 1313: } else
1314: *bp++ = c;
1.23 millert 1315: } else if (c == '[' && *prestr == '.') {
1316: char collate_char;
1317: prestr++;
1318: collate_char = *prestr++;
1319: if (*prestr == '.' && prestr[1] == ']') {
1320: prestr += 2;
1321: /* Found it: map via locale TBD: for
1322: now, simply return this char. This
1323: is sufficient to pass conformance
1324: test awk.ex 156
1325: */
1326: if (*prestr == ']') {
1327: prestr++;
1328: rlxval = collate_char;
1329: return CHAR;
1330: }
1331: }
1332: } else if (c == '[' && *prestr == '=') {
1333: char equiv_char;
1334: prestr++;
1335: equiv_char = *prestr++;
1336: if (*prestr == '=' && prestr[1] == ']') {
1337: prestr += 2;
1338: /* Found it: map via locale TBD: for now
1339: simply return this char. This is
1340: sufficient to pass conformance test
1341: awk.ex 156
1342: */
1343: if (*prestr == ']') {
1344: prestr++;
1345: rlxval = equiv_char;
1346: return CHAR;
1347: }
1348: }
1.1 tholo 1349: } else if (c == '\0') {
1.8 millert 1350: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1351: } else if (bp == buf) { /* 1st char is special */
1352: *bp++ = c;
1.1 tholo 1353: } else if (c == ']') {
1.5 kstailey 1354: *bp++ = 0;
1.10 millert 1355: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1356: if (cflag == 0)
1357: return CCL;
1358: else
1359: return NCCL;
1360: } else
1.5 kstailey 1361: *bp++ = c;
1.1 tholo 1362: }
1.23 millert 1363: break;
1364: case '{':
1365: if (isdigit(*(prestr))) {
1366: num = 0; /* Process as a repetition */
1367: n = -1; m = -1;
1.29 millert 1368: commafound = false;
1369: digitfound = false;
1.23 millert 1370: startreptok = prestr-1;
1371: /* Remember start of previous atom here ? */
1372: } else { /* just a { char, not a repetition */
1373: rlxval = c;
1374: return CHAR;
1375: }
1376: for (; ; ) {
1377: if ((c = *prestr++) == '}') {
1378: if (commafound) {
1379: if (digitfound) { /* {n,m} */
1380: m = num;
1.30 millert 1381: if (m < n)
1.23 millert 1382: FATAL("illegal repetition expression: class %.20s",
1383: lastre);
1.30 millert 1384: if (n == 0 && m == 1) {
1.23 millert 1385: return QUEST;
1386: }
1387: } else { /* {n,} */
1.30 millert 1388: if (n == 0)
1389: return STAR;
1390: else if (n == 1)
1391: return PLUS;
1.23 millert 1392: }
1393: } else {
1394: if (digitfound) { /* {n} same as {n,n} */
1395: n = num;
1396: m = num;
1397: } else { /* {} */
1398: FATAL("illegal repetition expression: class %.20s",
1399: lastre);
1400: }
1401: }
1402: if (repeat(starttok, prestr-starttok, lastatom,
1403: startreptok - lastatom, n, m) > 0) {
1.30 millert 1404: if (n == 0 && m == 0) {
1.31 millert 1405: return ZERO;
1.23 millert 1406: }
1407: /* must rescan input for next token */
1408: goto rescan;
1409: }
1410: /* Failed to replace: eat up {...} characters
1411: and treat like just PLUS */
1412: return PLUS;
1413: } else if (c == '\0') {
1414: FATAL("nonterminated character class %.20s",
1415: lastre);
1416: } else if (isdigit(c)) {
1417: num = 10 * num + c - '0';
1.29 millert 1418: digitfound = true;
1.23 millert 1419: } else if (c == ',') {
1420: if (commafound)
1421: FATAL("illegal repetition expression: class %.20s",
1422: lastre);
1423: /* looking for {n,} or {n,m} */
1.29 millert 1424: commafound = true;
1.23 millert 1425: n = num;
1.29 millert 1426: digitfound = false; /* reset */
1.23 millert 1427: num = 0;
1428: } else {
1429: FATAL("illegal repetition expression: class %.20s",
1430: lastre);
1431: }
1432: }
1433: break;
1.1 tholo 1434: }
1435: }
1436:
1437: int cgoto(fa *f, int s, int c)
1438: {
1.27 millert 1439: int *p, *q;
1.1 tholo 1440: int i, j, k;
1441:
1.38 millert 1442: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1443: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1444: resizesetvec(__func__);
1.1 tholo 1445: }
1446: for (i = 0; i <= f->accept; i++)
1447: setvec[i] = 0;
1448: setcnt = 0;
1.27 millert 1449: resize_state(f, s);
1.1 tholo 1450: /* compute positions of gototab[s,c] into setvec */
1451: p = f->posns[s];
1452: for (i = 1; i <= *p; i++) {
1453: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1454: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1455: || (k == DOT && c != 0 && c != HAT)
1456: || (k == ALL && c != 0)
1.15 millert 1457: || (k == EMPTYRE && c != 0)
1.38 millert 1458: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1459: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1460: q = f->re[p[i]].lfollow;
1461: for (j = 1; j <= *q; j++) {
1462: if (q[j] >= maxsetvec) {
1.27 millert 1463: resizesetvec(__func__);
1.1 tholo 1464: }
1465: if (setvec[q[j]] == 0) {
1466: setcnt++;
1467: setvec[q[j]] = 1;
1468: }
1469: }
1470: }
1471: }
1472: }
1473: /* determine if setvec is a previous state */
1474: tmpset[0] = setcnt;
1475: j = 1;
1476: for (i = f->accept; i >= 0; i--)
1477: if (setvec[i]) {
1478: tmpset[j++] = i;
1479: }
1.27 millert 1480: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1481: /* tmpset == previous state? */
1482: for (i = 1; i <= f->curstat; i++) {
1483: p = f->posns[i];
1484: if ((k = tmpset[0]) != p[0])
1485: goto different;
1486: for (j = 1; j <= k; j++)
1487: if (tmpset[j] != p[j])
1488: goto different;
1489: /* setvec is state i */
1.30 millert 1490: if (c != HAT)
1.38 millert 1491: set_gototab(f, s, c, i);
1.1 tholo 1492: return i;
1493: different:;
1494: }
1495:
1496: /* add tmpset to current set of states */
1.27 millert 1497: ++(f->curstat);
1498: resize_state(f, f->curstat);
1.1 tholo 1499: xfree(f->posns[f->curstat]);
1.27 millert 1500: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1501:
1502: f->posns[f->curstat] = p;
1.30 millert 1503: if (c != HAT)
1.38 millert 1504: set_gototab(f, s, c, f->curstat);
1.1 tholo 1505: for (i = 0; i <= setcnt; i++)
1506: p[i] = tmpset[i];
1507: if (setvec[f->accept])
1508: f->out[f->curstat] = 1;
1509: else
1510: f->out[f->curstat] = 0;
1511: return f->curstat;
1512: }
1513:
1514:
1515: void freefa(fa *f) /* free a finite automaton */
1516: {
1517: int i;
1518:
1519: if (f == NULL)
1520: return;
1.27 millert 1521: for (i = 0; i < f->state_count; i++)
1522: xfree(f->gototab[i])
1.1 tholo 1523: for (i = 0; i <= f->curstat; i++)
1524: xfree(f->posns[i]);
1525: for (i = 0; i <= f->accept; i++) {
1526: xfree(f->re[i].lfollow);
1527: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1528: xfree(f->re[i].lval.np);
1.1 tholo 1529: }
1530: xfree(f->restr);
1.27 millert 1531: xfree(f->out);
1532: xfree(f->posns);
1533: xfree(f->gototab);
1.1 tholo 1534: xfree(f);
1535: }