Annotation of src/usr.bin/awk/b.c, Revision 1.44
1.44 ! millert 1: /* $OpenBSD: b.c,v 1.43 2023/10/06 22:29:24 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.43 millert 84: extern int u8_nextlen(const char *s);
85:
1.38 millert 86:
87: /* utf-8 mechanism:
88:
89: For most of Awk, utf-8 strings just "work", since they look like
90: null-terminated sequences of 8-bit bytes.
91:
92: Functions like length(), index(), and substr() have to operate
93: in units of utf-8 characters. The u8_* functions in run.c
94: handle this.
95:
96: Regular expressions are more complicated, since the basic
97: mechanism of the goto table used 8-bit byte indices into the
98: gototab entries to compute the next state. Unicode is a lot
99: bigger, so the gototab entries are now structs with a character
100: and a next state, and there is a linear search of the characters
101: to find the state. (Yes, this is slower, by a significant
102: amount. Tough.)
103:
104: Throughout the RE mechanism in b.c, utf-8 characters are
105: converted to their utf-32 value. This mostly shows up in
106: cclenter, which expands character class ranges like a-z and now
107: alpha-omega. The size of a gototab array is still about 256.
108: This should be dynamic, but for now things work ok for a single
109: code page of Unicode, which is the most likely case.
110:
111: The code changes are localized in run.c and b.c. I have added a
112: handful of functions to somewhat better hide the implementation,
113: but a lot more could be done.
114:
115: */
116:
117: static int get_gototab(fa*, int, int);
118: static int set_gototab(fa*, int, int, int);
1.44 ! millert 119: static void reset_gototab(fa*, int);
1.38 millert 120: extern int u8_rune(int *, const uschar *);
121:
1.27 millert 122: static int *
123: intalloc(size_t n, const char *f)
124: {
1.35 millert 125: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 126: if (p == NULL)
127: overflo(f);
128: return p;
129: }
130:
131: static void
132: allocsetvec(const char *f)
133: {
134: maxsetvec = MAXLIN;
1.35 millert 135: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
136: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 137: if (setvec == NULL || tmpset == NULL)
138: overflo(f);
139: }
140:
141: static void
142: resizesetvec(const char *f)
143: {
1.35 millert 144: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
145: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 146: if (setvec == NULL || tmpset == NULL)
147: overflo(f);
148: maxsetvec *= 4;
149: }
150:
151: static void
152: resize_state(fa *f, int state)
153: {
1.38 millert 154: gtt **p;
1.35 millert 155: uschar *p2;
156: int **p3;
1.27 millert 157: int i, new_count;
158:
159: if (++state < f->state_count)
160: return;
161:
162: new_count = state + 10; /* needs to be tuned */
163:
1.38 millert 164: p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 165: if (p == NULL)
166: goto out;
167: f->gototab = p;
168:
1.35 millert 169: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
170: if (p2 == NULL)
1.27 millert 171: goto out;
1.35 millert 172: f->out = p2;
1.27 millert 173:
1.35 millert 174: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
175: if (p3 == NULL)
1.27 millert 176: goto out;
1.35 millert 177: f->posns = p3;
1.27 millert 178:
179: for (i = f->state_count; i < new_count; ++i) {
1.38 millert 180: f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 181: if (f->gototab[i] == NULL)
182: goto out;
183: f->out[i] = 0;
184: f->posns[i] = NULL;
185: }
1.38 millert 186: f->gototab_len = NCHARS; /* should be variable, growable */
1.27 millert 187: f->state_count = new_count;
188: return;
189: out:
190: overflo(__func__);
191: }
192:
1.29 millert 193: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 194: {
195: int i, use, nuse;
196: fa *pfa;
1.6 millert 197: static int now = 1;
1.1 tholo 198:
1.25 millert 199: if (setvec == NULL) { /* first time through any RE */
1.27 millert 200: allocsetvec(__func__);
1.1 tholo 201: }
202:
1.29 millert 203: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 204: return mkdfa(s, anchor);
205: for (i = 0; i < nfatab; i++) /* is it there already? */
206: if (fatab[i]->anchor == anchor
1.11 millert 207: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 208: fatab[i]->use = now++;
1.1 tholo 209: return fatab[i];
1.10 millert 210: }
1.1 tholo 211: pfa = mkdfa(s, anchor);
212: if (nfatab < NFA) { /* room for another */
213: fatab[nfatab] = pfa;
1.6 millert 214: fatab[nfatab]->use = now++;
1.1 tholo 215: nfatab++;
216: return pfa;
217: }
218: use = fatab[0]->use; /* replace least-recently used */
219: nuse = 0;
220: for (i = 1; i < nfatab; i++)
221: if (fatab[i]->use < use) {
222: use = fatab[i]->use;
223: nuse = i;
224: }
225: freefa(fatab[nuse]);
226: fatab[nuse] = pfa;
1.6 millert 227: pfa->use = now++;
1.1 tholo 228: return pfa;
229: }
230:
1.29 millert 231: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 232: /* anchor = true for anchored matches, else false */
1.1 tholo 233: {
234: Node *p, *p1;
235: fa *f;
236:
1.28 millert 237: firstbasestr = (const uschar *) s;
1.23 millert 238: basestr = firstbasestr;
1.1 tholo 239: p = reparse(s);
240: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
241: /* put ALL STAR in front of reg. exp. */
242: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
243: /* put FINAL after reg. exp. */
244:
245: poscnt = 0;
246: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 247: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 248: overflo(__func__);
1.1 tholo 249: f->accept = poscnt-1; /* penter has computed number of positions in re */
250: cfoll(f, p1); /* set up follow sets */
251: freetr(p1);
1.27 millert 252: resize_state(f, 1);
253: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
254: f->posns[1] = intalloc(1, __func__);
1.1 tholo 255: *f->posns[1] = 0;
256: f->initstat = makeinit(f, anchor);
257: f->anchor = anchor;
1.10 millert 258: f->restr = (uschar *) tostring(s);
1.23 millert 259: if (firstbasestr != basestr) {
260: if (basestr)
261: xfree(basestr);
262: }
1.1 tholo 263: return f;
264: }
265:
1.29 millert 266: int makeinit(fa *f, bool anchor)
1.1 tholo 267: {
268: int i, k;
269:
270: f->curstat = 2;
271: f->out[2] = 0;
272: k = *(f->re[0].lfollow);
1.25 millert 273: xfree(f->posns[2]);
1.27 millert 274: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 275: for (i = 0; i <= k; i++) {
1.1 tholo 276: (f->posns[2])[i] = (f->re[0].lfollow)[i];
277: }
278: if ((f->posns[2])[1] == f->accept)
279: f->out[2] = 1;
1.44 ! millert 280: reset_gototab(f, 2);
1.1 tholo 281: f->curstat = cgoto(f, 2, HAT);
282: if (anchor) {
283: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 284: for (i = 0; i < k; i++) {
1.1 tholo 285: (f->posns[0])[i] = (f->posns[2])[i];
286: }
287:
288: f->out[0] = f->out[2];
289: if (f->curstat != 2)
290: --(*f->posns[f->curstat]);
291: }
292: return f->curstat;
293: }
294:
295: void penter(Node *p) /* set up parent pointers and leaf indices */
296: {
297: switch (type(p)) {
1.15 millert 298: ELEAF
1.1 tholo 299: LEAF
1.7 millert 300: info(p) = poscnt;
1.1 tholo 301: poscnt++;
302: break;
303: UNARY
304: penter(left(p));
305: parent(left(p)) = p;
306: break;
307: case CAT:
308: case OR:
309: penter(left(p));
310: penter(right(p));
311: parent(left(p)) = p;
312: parent(right(p)) = p;
313: break;
1.31 millert 314: case ZERO:
315: break;
1.1 tholo 316: default: /* can't happen */
1.8 millert 317: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 318: break;
319: }
320: }
321:
322: void freetr(Node *p) /* free parse tree */
323: {
324: switch (type(p)) {
1.15 millert 325: ELEAF
1.1 tholo 326: LEAF
327: xfree(p);
328: break;
329: UNARY
1.31 millert 330: case ZERO:
1.1 tholo 331: freetr(left(p));
332: xfree(p);
333: break;
334: case CAT:
335: case OR:
336: freetr(left(p));
337: freetr(right(p));
338: xfree(p);
339: break;
340: default: /* can't happen */
1.8 millert 341: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 342: break;
343: }
344: }
345:
346: /* in the parsing of regular expressions, metacharacters like . have */
347: /* to be seen literally; \056 is not a metacharacter. */
348:
1.38 millert 349: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 350: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 351: const uschar *p;
1.1 tholo 352: int n = 0;
1.2 millert 353: int i;
1.1 tholo 354:
1.38 millert 355: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 356: if (isdigit(*p))
357: n = 16 * n + *p - '0';
358: else if (*p >= 'a' && *p <= 'f')
359: n = 16 * n + *p - 'a' + 10;
360: else if (*p >= 'A' && *p <= 'F')
361: n = 16 * n + *p - 'A' + 10;
362: }
1.28 millert 363: *pp = p;
1.1 tholo 364: return n;
365: }
366:
1.2 millert 367: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 368:
1.28 millert 369: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 370: /* and increment *pp */
371: {
1.28 millert 372: const uschar *p = *pp;
1.1 tholo 373: int c;
374:
1.38 millert 375: /* BUG: should advance by utf-8 char even if makes no sense */
376:
377: if ((c = *p++) == 't') {
1.1 tholo 378: c = '\t';
1.38 millert 379: } else if (c == 'n') {
1.1 tholo 380: c = '\n';
1.38 millert 381: } else if (c == 'f') {
1.1 tholo 382: c = '\f';
1.38 millert 383: } else if (c == 'r') {
1.1 tholo 384: c = '\r';
1.38 millert 385: } else if (c == 'b') {
1.1 tholo 386: c = '\b';
1.38 millert 387: } else if (c == 'v') {
1.25 millert 388: c = '\v';
1.38 millert 389: } else if (c == 'a') {
1.25 millert 390: c = '\a';
1.38 millert 391: } else if (c == '\\') {
1.1 tholo 392: c = '\\';
1.38 millert 393: } else if (c == 'x') { /* 2 hex digits follow */
394: c = hexstr(&p, 2); /* this adds a null if number is invalid */
395: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
396: c = hexstr(&p, 8);
1.1 tholo 397: } else if (isoctdigit(c)) { /* \d \dd \ddd */
398: int n = c - '0';
399: if (isoctdigit(*p)) {
400: n = 8 * n + *p++ - '0';
401: if (isoctdigit(*p))
402: n = 8 * n + *p++ - '0';
403: }
404: c = n;
405: } /* else */
406: /* c = c; */
407: *pp = p;
408: return c;
409: }
410:
1.38 millert 411: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 412: {
413: int i, c, c2;
1.38 millert 414: int n;
415: const uschar *p = (const uschar *) argp;
416: int *bp, *retp;
417: static int *buf = NULL;
1.5 kstailey 418: static int bufsz = 100;
1.1 tholo 419:
1.38 millert 420: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 421: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 422: bp = buf;
1.38 millert 423: for (i = 0; *p != 0; ) {
424: n = u8_rune(&c, p);
425: p += n;
1.1 tholo 426: if (c == '\\') {
1.17 millert 427: c = quoted(&p);
1.5 kstailey 428: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 429: if (*p != 0) {
1.5 kstailey 430: c = bp[-1];
1.38 millert 431: /* c2 = *p++; */
432: n = u8_rune(&c2, p);
433: p += n;
1.1 tholo 434: if (c2 == '\\')
1.38 millert 435: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 436: if (c > c2) { /* empty; ignore */
1.5 kstailey 437: bp--;
1.1 tholo 438: i--;
439: continue;
440: }
441: while (c < c2) {
1.38 millert 442: if (i >= bufsz) {
443: buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
444: if (buf == NULL)
445: FATAL("out of space for character class [%.10s...] 2", p);
446: bufsz *= 2;
447: bp = buf + i;
448: }
1.5 kstailey 449: *bp++ = ++c;
1.1 tholo 450: i++;
451: }
452: continue;
453: }
454: }
1.38 millert 455: if (i >= bufsz) {
456: buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
457: if (buf == NULL)
458: FATAL("out of space for character class [%.10s...] 2", p);
459: bufsz *= 2;
460: bp = buf + i;
461: }
1.5 kstailey 462: *bp++ = c;
1.1 tholo 463: i++;
464: }
1.5 kstailey 465: *bp = 0;
1.38 millert 466: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
467: /* xfree(op); BUG: what are we freeing here? */
468: retp = (int *) calloc(bp-buf+1, sizeof(int));
469: for (i = 0; i < bp-buf+1; i++)
470: retp[i] = buf[i];
471: return retp;
1.1 tholo 472: }
473:
1.11 millert 474: void overflo(const char *s)
1.1 tholo 475: {
1.27 millert 476: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 477: }
478:
479: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
480: {
481: int i;
1.4 millert 482: int *p;
1.1 tholo 483:
484: switch (type(v)) {
1.15 millert 485: ELEAF
1.1 tholo 486: LEAF
1.7 millert 487: f->re[info(v)].ltype = type(v);
488: f->re[info(v)].lval.np = right(v);
1.1 tholo 489: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 490: resizesetvec(__func__);
1.1 tholo 491: }
492: for (i = 0; i <= f->accept; i++)
493: setvec[i] = 0;
494: setcnt = 0;
495: follow(v); /* computes setvec and setcnt */
1.27 millert 496: p = intalloc(setcnt + 1, __func__);
1.7 millert 497: f->re[info(v)].lfollow = p;
1.1 tholo 498: *p = setcnt;
499: for (i = f->accept; i >= 0; i--)
500: if (setvec[i] == 1)
501: *++p = i;
502: break;
503: UNARY
504: cfoll(f,left(v));
505: break;
506: case CAT:
507: case OR:
508: cfoll(f,left(v));
509: cfoll(f,right(v));
510: break;
1.31 millert 511: case ZERO:
512: break;
1.1 tholo 513: default: /* can't happen */
1.8 millert 514: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 515: }
516: }
517:
518: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 519: /* returns 0 if p matches empty string */
1.1 tholo 520: {
1.4 millert 521: int b, lp;
1.1 tholo 522:
523: switch (type(p)) {
1.15 millert 524: ELEAF
1.1 tholo 525: LEAF
1.7 millert 526: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 527: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 528: resizesetvec(__func__);
1.1 tholo 529: }
1.15 millert 530: if (type(p) == EMPTYRE) {
531: setvec[lp] = 0;
532: return(0);
533: }
1.1 tholo 534: if (setvec[lp] != 1) {
535: setvec[lp] = 1;
536: setcnt++;
537: }
1.41 millert 538: if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1 tholo 539: return(0); /* empty CCL */
1.30 millert 540: return(1);
1.1 tholo 541: case PLUS:
1.30 millert 542: if (first(left(p)) == 0)
543: return(0);
1.1 tholo 544: return(1);
545: case STAR:
546: case QUEST:
547: first(left(p));
548: return(0);
549: case CAT:
550: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
551: return(1);
552: case OR:
553: b = first(right(p));
554: if (first(left(p)) == 0 || b == 0) return(0);
555: return(1);
1.31 millert 556: case ZERO:
557: return 0;
1.1 tholo 558: }
1.8 millert 559: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 560: return(-1);
561: }
562:
563: void follow(Node *v) /* collects leaves that can follow v into setvec */
564: {
565: Node *p;
566:
567: if (type(v) == FINAL)
568: return;
569: p = parent(v);
570: switch (type(p)) {
571: case STAR:
572: case PLUS:
573: first(v);
574: follow(p);
575: return;
576:
577: case OR:
578: case QUEST:
579: follow(p);
580: return;
581:
582: case CAT:
583: if (v == left(p)) { /* v is left child of p */
584: if (first(right(p)) == 0) {
585: follow(p);
586: return;
587: }
588: } else /* v is right child */
589: follow(p);
590: return;
591: }
592: }
593:
1.38 millert 594: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 595: {
1.38 millert 596: int *s = (int *) sarg;
1.10 millert 597:
1.1 tholo 598: while (*s)
599: if (c == *s++)
600: return(1);
601: return(0);
602: }
603:
1.38 millert 604: static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
605: {
606: int i;
607: for (i = 0; i < f->gototab_len; i++) {
608: if (f->gototab[state][i].ch == 0)
609: break;
610: if (f->gototab[state][i].ch == ch)
611: return f->gototab[state][i].state;
612: }
613: return 0;
614: }
615:
1.44 ! millert 616: static void reset_gototab(fa *f, int state) /* hide gototab inplementation */
! 617: {
! 618: memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab));
! 619: }
! 620:
1.38 millert 621: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
622: {
623: int i;
624: for (i = 0; i < f->gototab_len; i++) {
625: if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
626: f->gototab[state][i].ch = ch;
627: f->gototab[state][i].state = val;
628: return val;
629: }
630: }
631: overflo(__func__);
632: return val; /* not used anywhere at the moment */
633: }
634:
1.11 millert 635: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 636: {
637: int s, ns;
1.38 millert 638: int n;
639: int rune;
1.28 millert 640: const uschar *p = (const uschar *) p0;
1.1 tholo 641:
1.38 millert 642: /* return pmatch(f, p0); does it matter whether longest or shortest? */
643:
1.27 millert 644: s = f->initstat;
645: assert (s < f->state_count);
646:
1.1 tholo 647: if (f->out[s])
648: return(1);
649: do {
1.15 millert 650: /* assert(*p < NCHARS); */
1.38 millert 651: n = u8_rune(&rune, p);
652: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 653: s = ns;
654: else
1.38 millert 655: s = cgoto(f, s, rune);
1.1 tholo 656: if (f->out[s])
657: return(1);
1.38 millert 658: if (*p == 0)
659: break;
660: p += n;
661: } while (1); /* was *p++ != 0 */
1.1 tholo 662: return(0);
663: }
664:
1.11 millert 665: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 666: {
667: int s, ns;
1.38 millert 668: int n;
669: int rune;
1.28 millert 670: const uschar *p = (const uschar *) p0;
671: const uschar *q;
1.1 tholo 672:
1.27 millert 673: s = f->initstat;
674: assert(s < f->state_count);
675:
1.28 millert 676: patbeg = (const char *)p;
1.1 tholo 677: patlen = -1;
678: do {
679: q = p;
680: do {
681: if (f->out[s]) /* final state */
682: patlen = q-p;
1.15 millert 683: /* assert(*q < NCHARS); */
1.38 millert 684: n = u8_rune(&rune, q);
685: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 686: s = ns;
687: else
1.38 millert 688: s = cgoto(f, s, rune);
1.27 millert 689:
690: assert(s < f->state_count);
691:
1.9 deraadt 692: if (s == 1) { /* no transition */
1.1 tholo 693: if (patlen >= 0) {
1.28 millert 694: patbeg = (const char *) p;
1.1 tholo 695: return(1);
696: }
697: else
698: goto nextin; /* no match */
1.9 deraadt 699: }
1.38 millert 700: if (*q == 0)
701: break;
702: q += n;
703: } while (1);
704: q++; /* was *q++ */
1.1 tholo 705: if (f->out[s])
706: patlen = q-p-1; /* don't count $ */
707: if (patlen >= 0) {
1.28 millert 708: patbeg = (const char *) p;
1.1 tholo 709: return(1);
710: }
711: nextin:
712: s = 2;
1.38 millert 713: if (*p == 0)
714: break;
715: n = u8_rune(&rune, p);
716: p += n;
717: } while (1); /* was *p++ */
1.1 tholo 718: return (0);
719: }
720:
1.11 millert 721: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 722: {
723: int s, ns;
1.38 millert 724: int n;
725: int rune;
1.28 millert 726: const uschar *p = (const uschar *) p0;
727: const uschar *q;
1.1 tholo 728:
1.27 millert 729: s = f->initstat;
730: assert(s < f->state_count);
731:
1.28 millert 732: patbeg = (const char *)p;
1.1 tholo 733: patlen = -1;
734: while (*p) {
735: q = p;
736: do {
737: if (f->out[s]) /* final state */
738: patlen = q-p;
1.15 millert 739: /* assert(*q < NCHARS); */
1.38 millert 740: n = u8_rune(&rune, q);
741: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 742: s = ns;
743: else
1.38 millert 744: s = cgoto(f, s, rune);
1.9 deraadt 745: if (s == 1) { /* no transition */
1.1 tholo 746: if (patlen > 0) {
1.28 millert 747: patbeg = (const char *) p;
1.1 tholo 748: return(1);
749: } else
750: goto nnextin; /* no nonempty match */
1.9 deraadt 751: }
1.38 millert 752: if (*q == 0)
753: break;
754: q += n;
755: } while (1);
756: q++;
1.1 tholo 757: if (f->out[s])
758: patlen = q-p-1; /* don't count $ */
759: if (patlen > 0 ) {
1.28 millert 760: patbeg = (const char *) p;
1.1 tholo 761: return(1);
762: }
763: nnextin:
764: s = 2;
765: p++;
766: }
767: return (0);
768: }
769:
1.43 millert 770:
771: #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
772:
773: // Read one rune at a time from the given FILE*. Return both
774: // the bytes and the actual rune.
775:
776: struct runedata {
777: int rune;
778: size_t len;
779: char bytes[6];
780: };
781:
782: struct runedata getrune(FILE *fp)
1.38 millert 783: {
1.43 millert 784: struct runedata result;
785: int c, i, next;
786:
787: memset(&result, 0, sizeof(result));
788:
789: c = getc(fp);
790: if (c == EOF)
791: return result; // result.rune == 0 --> EOF
792: else if (c < 128 || awk_mb_cur_max == 1) {
793: result.bytes[0] = c;
794: result.len = 1;
795: result.rune = c;
796:
797: return result;
798: }
799:
800: // need to get bytes and fill things in
801: result.bytes[0] = c;
802: result.len = 1;
803:
804: next = 1;
805: for (i = 1; i < MAX_UTF_BYTES; i++) {
806: c = getc(fp);
807: if (c == EOF)
1.38 millert 808: break;
1.43 millert 809: result.bytes[next++] = c;
810: result.len++;
811: }
812:
813: // put back any extra input bytes
814: int actual_len = u8_nextlen(result.bytes);
815: while (result.len > actual_len) {
816: ungetc(result.bytes[--result.len], fp);
1.38 millert 817: }
818:
1.43 millert 819: result.bytes[result.len] = '\0';
820: (void) u8_rune(& result.rune, (uschar *) result.bytes);
1.38 millert 821:
1.43 millert 822: return result;
1.38 millert 823: }
824:
1.26 millert 825:
826: /*
827: * NAME
828: * fnematch
829: *
830: * DESCRIPTION
831: * A stream-fed version of nematch which transfers characters to a
832: * null-terminated buffer. All characters up to and including the last
833: * character of the matching text or EOF are placed in the buffer. If
834: * a match is found, patbeg and patlen are set appropriately.
835: *
836: * RETURN VALUES
1.29 millert 837: * false No match found.
838: * true Match found.
1.26 millert 839: */
840:
1.29 millert 841: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 842: {
843: char *buf = *pbuf;
844: int bufsize = *pbufsize;
1.43 millert 845: int i, j, k, ns, s;
846: struct runedata r;
1.26 millert 847:
848: s = pfa->initstat;
849: patlen = 0;
850:
851: /*
852: * All indices relative to buf.
853: * i <= j <= k <= bufsize
854: *
1.43 millert 855: * i: origin of active substring (first byte of first character)
856: * j: current character (last byte of current character)
1.26 millert 857: * k: destination of next getc()
858: */
859: i = -1, k = 0;
860: do {
861: j = i++;
862: do {
1.43 millert 863: r = getrune(f);
864: if ((++j + r.len) >= k) {
865: if (k >= bufsize)
1.26 millert 866: if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
867: FATAL("stream '%.30s...' too long", buf);
1.38 millert 868: }
1.43 millert 869: memcpy(buf + k, r.bytes, r.len);
870: j += r.len - 1; // incremented next time around the loop
871: k += r.len;
1.26 millert 872:
1.43 millert 873: if ((ns = get_gototab(pfa, s, r.rune)) != 0)
1.26 millert 874: s = ns;
875: else
1.43 millert 876: s = cgoto(pfa, s, r.rune);
1.26 millert 877:
878: if (pfa->out[s]) { /* final state */
879: patlen = j - i + 1;
1.43 millert 880: if (r.rune == 0) /* don't count $ */
1.26 millert 881: patlen--;
882: }
883: } while (buf[j] && s != 1);
884: s = 2;
1.43 millert 885: if (r.len > 1)
886: i += r.len - 1; // i incremented around the loop
1.26 millert 887: } while (buf[i] && !patlen);
888:
889: /* adjbuf() may have relocated a resized buffer. Inform the world. */
890: *pbuf = buf;
891: *pbufsize = bufsize;
892:
893: if (patlen) {
894: patbeg = buf + i;
895: /*
896: * Under no circumstances is the last character fed to
897: * the automaton part of the match. It is EOF's nullbyte,
898: * or it sent the automaton into a state with no further
899: * transitions available (s==1), or both. Room for a
900: * terminating nullbyte is guaranteed.
901: *
902: * ungetc any chars after the end of matching text
903: * (except for EOF's nullbyte, if present) and null
904: * terminate the buffer.
905: */
1.43 millert 906: do {
907: int ii;
908: for (ii = r.len; ii > 0; ii--)
909: if (buf[--k] && ungetc(buf[k], f) == EOF)
910: FATAL("unable to ungetc '%c'", buf[k]);
911: } while (k > i + patlen);
1.30 millert 912: buf[k] = '\0';
1.29 millert 913: return true;
1.26 millert 914: }
915: else
1.29 millert 916: return false;
1.26 millert 917: }
918:
1.11 millert 919: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 920: { /* uses relex() to scan regular expression */
921: Node *np;
922:
1.32 millert 923: DPRINTF("reparse <%s>\n", p);
1.28 millert 924: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 925: rtok = relex();
1.11 millert 926: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 927: if (rtok == '\0') {
1.11 millert 928: /* FATAL("empty regular expression"); previous */
1.15 millert 929: return(op2(EMPTYRE, NIL, NIL));
930: }
1.1 tholo 931: np = regexp();
932: if (rtok != '\0')
1.8 millert 933: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 934: return(np);
935: }
936:
937: Node *regexp(void) /* top-level parse of reg expr */
938: {
939: return (alt(concat(primary())));
940: }
941:
942: Node *primary(void)
943: {
944: Node *np;
1.23 millert 945: int savelastatom;
1.1 tholo 946:
947: switch (rtok) {
948: case CHAR:
1.23 millert 949: lastatom = starttok;
1.7 millert 950: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 951: rtok = relex();
952: return (unary(np));
953: case ALL:
954: rtok = relex();
955: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 956: case EMPTYRE:
957: rtok = relex();
1.23 millert 958: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 959: case DOT:
1.23 millert 960: lastatom = starttok;
1.1 tholo 961: rtok = relex();
962: return (unary(op2(DOT, NIL, NIL)));
963: case CCL:
1.28 millert 964: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 965: lastatom = starttok;
1.1 tholo 966: rtok = relex();
967: return (unary(np));
968: case NCCL:
1.28 millert 969: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 970: lastatom = starttok;
1.1 tholo 971: rtok = relex();
972: return (unary(np));
973: case '^':
974: rtok = relex();
1.7 millert 975: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 976: case '$':
977: rtok = relex();
978: return (unary(op2(CHAR, NIL, NIL)));
979: case '(':
1.23 millert 980: lastatom = starttok;
981: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 982: rtok = relex();
983: if (rtok == ')') { /* special pleading for () */
984: rtok = relex();
1.42 millert 985: return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1 tholo 986: }
987: np = regexp();
988: if (rtok == ')') {
1.23 millert 989: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 990: rtok = relex();
991: return (unary(np));
992: }
993: else
1.8 millert 994: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 995: default:
1.8 millert 996: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 997: }
998: return 0; /*NOTREACHED*/
999: }
1000:
1001: Node *concat(Node *np)
1002: {
1003: switch (rtok) {
1.23 millert 1004: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 1005: return (concat(op2(CAT, np, primary())));
1.23 millert 1006: case EMPTYRE:
1007: rtok = relex();
1.42 millert 1008: return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23 millert 1009: primary())));
1.1 tholo 1010: }
1011: return (np);
1012: }
1013:
1014: Node *alt(Node *np)
1015: {
1016: if (rtok == OR) {
1017: rtok = relex();
1018: return (alt(op2(OR, np, concat(primary()))));
1019: }
1020: return (np);
1021: }
1022:
1023: Node *unary(Node *np)
1024: {
1025: switch (rtok) {
1026: case STAR:
1027: rtok = relex();
1028: return (unary(op2(STAR, np, NIL)));
1029: case PLUS:
1030: rtok = relex();
1031: return (unary(op2(PLUS, np, NIL)));
1032: case QUEST:
1033: rtok = relex();
1034: return (unary(op2(QUEST, np, NIL)));
1.31 millert 1035: case ZERO:
1036: rtok = relex();
1037: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1038: default:
1039: return (np);
1040: }
1041: }
1042:
1.11 millert 1043: /*
1044: * Character class definitions conformant to the POSIX locale as
1045: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1046: * and operating character sets are both ASCII (ISO646) or supersets
1047: * thereof.
1048: *
1049: * Note that to avoid overflowing the temporary buffer used in
1050: * relex(), the expanded character class (prior to range expansion)
1051: * must be less than twice the size of their full name.
1052: */
1.12 millert 1053:
1054: /* Because isblank doesn't show up in any of the header files on any
1055: * system i use, it's defined here. if some other locale has a richer
1056: * definition of "blank", define HAS_ISBLANK and provide your own
1057: * version.
1058: * the parentheses here are an attempt to find a path through the maze
1059: * of macro definition and/or function and/or version provided. thanks
1060: * to nelson beebe for the suggestion; let's see if it works everywhere.
1061: */
1062:
1063: #ifndef HAS_ISBLANK
1064:
1.17 millert 1065: int (xisblank)(int c)
1.12 millert 1066: {
1067: return c==' ' || c=='\t';
1068: }
1069:
1070: #endif
1071:
1.31 millert 1072: static const struct charclass {
1.11 millert 1073: const char *cc_name;
1074: int cc_namelen;
1.12 millert 1075: int (*cc_func)(int);
1.11 millert 1076: } charclasses[] = {
1.12 millert 1077: { "alnum", 5, isalnum },
1078: { "alpha", 5, isalpha },
1.17 millert 1079: #ifndef HAS_ISBLANK
1.21 millert 1080: { "blank", 5, xisblank },
1.17 millert 1081: #else
1.12 millert 1082: { "blank", 5, isblank },
1.17 millert 1083: #endif
1.12 millert 1084: { "cntrl", 5, iscntrl },
1085: { "digit", 5, isdigit },
1086: { "graph", 5, isgraph },
1087: { "lower", 5, islower },
1088: { "print", 5, isprint },
1089: { "punct", 5, ispunct },
1090: { "space", 5, isspace },
1091: { "upper", 5, isupper },
1092: { "xdigit", 6, isxdigit },
1.11 millert 1093: { NULL, 0, NULL },
1094: };
1095:
1.23 millert 1096: #define REPEAT_SIMPLE 0
1097: #define REPEAT_PLUS_APPENDED 1
1098: #define REPEAT_WITH_Q 2
1099: #define REPEAT_ZERO 3
1100:
1101: static int
1102: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1103: int atomlen, int firstnum, int secondnum, int special_case)
1104: {
1105: int i, j;
1.26 millert 1106: uschar *buf = NULL;
1.23 millert 1107: int ret = 1;
1.31 millert 1108: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1109: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1110: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1111: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1112: int size = prefix_length + suffix_length;
1113:
1114: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1115: size += atomlen*(firstnum-1);
1116: }
1117:
1118: /* Adjust size of buffer for special cases */
1119: if (special_case == REPEAT_PLUS_APPENDED) {
1120: size++; /* for the final + */
1121: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1122: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1123: } else if (special_case == REPEAT_ZERO) {
1124: size += 2; /* just a null ERE: () */
1125: }
1.35 millert 1126: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1127: FATAL("out of space in reg expr %.10s..", lastre);
1128: memcpy(buf, basestr, prefix_length); /* copy prefix */
1129: j = prefix_length;
1130: if (special_case == REPEAT_ZERO) {
1131: j -= atomlen;
1132: buf[j++] = '(';
1133: buf[j++] = ')';
1134: }
1.30 millert 1135: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1136: memcpy(&buf[j], atom, atomlen);
1137: j += atomlen;
1138: }
1139: if (special_case == REPEAT_PLUS_APPENDED) {
1140: buf[j++] = '+';
1141: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1142: if (init_q)
1143: buf[j++] = '?';
1.30 millert 1144: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1145: memcpy(&buf[j], atom, atomlen);
1146: j += atomlen;
1147: buf[j++] = '?';
1148: }
1149: }
1150: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1151: j += suffix_length;
1152: buf[j] = '\0';
1.23 millert 1153: /* free old basestr */
1154: if (firstbasestr != basestr) {
1155: if (basestr)
1156: xfree(basestr);
1157: }
1158: basestr = buf;
1159: prestr = buf + prefix_length;
1160: if (special_case == REPEAT_ZERO) {
1161: prestr -= atomlen;
1162: ret++;
1163: }
1164: return ret;
1165: }
1166:
1167: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1168: int atomlen, int firstnum, int secondnum)
1169: {
1170: /*
1171: In general, the repetition specifier or "bound" is replaced here
1172: by an equivalent ERE string, repeating the immediately previous atom
1173: and appending ? and + as needed. Note that the first copy of the
1174: atom is left in place, except in the special_case of a zero-repeat
1175: (i.e., {0}).
1176: */
1177: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1178: if (firstnum < 2) {
1179: /* 0 or 1: should be handled before you get here */
1180: FATAL("internal error");
1181: } else {
1182: return replace_repeat(reptok, reptoklen, atom, atomlen,
1183: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1184: }
1185: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1186: if (firstnum == 0) { /* {0} or {0,0} */
1187: /* This case is unusual because the resulting
1188: replacement string might actually be SMALLER than
1189: the original ERE */
1190: return replace_repeat(reptok, reptoklen, atom, atomlen,
1191: firstnum, secondnum, REPEAT_ZERO);
1192: } else { /* (firstnum >= 1) */
1193: return replace_repeat(reptok, reptoklen, atom, atomlen,
1194: firstnum, secondnum, REPEAT_SIMPLE);
1195: }
1196: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1197: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1198: return replace_repeat(reptok, reptoklen, atom, atomlen,
1199: firstnum, secondnum, REPEAT_WITH_Q);
1200: } else { /* Error - shouldn't be here (n>m) */
1201: FATAL("internal error");
1202: }
1203: return 0;
1204: }
1.11 millert 1205:
1.38 millert 1206: extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1207:
1.1 tholo 1208: int relex(void) /* lexical analyzer for reparse */
1209: {
1.5 kstailey 1210: int c, n;
1.1 tholo 1211: int cflag;
1.25 millert 1212: static uschar *buf = NULL;
1.5 kstailey 1213: static int bufsz = 100;
1.10 millert 1214: uschar *bp;
1.31 millert 1215: const struct charclass *cc;
1.12 millert 1216: int i;
1.29 millert 1217: int num, m;
1218: bool commafound, digitfound;
1.23 millert 1219: const uschar *startreptok;
1.24 millert 1220: static int parens = 0;
1.23 millert 1221:
1222: rescan:
1223: starttok = prestr;
1.1 tholo 1224:
1.38 millert 1225: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1226: prestr += n;
1227: starttok = prestr;
1228: return CHAR;
1229: }
1230:
1.1 tholo 1231: switch (c = *prestr++) {
1232: case '|': return OR;
1233: case '*': return STAR;
1234: case '+': return PLUS;
1235: case '?': return QUEST;
1236: case '.': return DOT;
1237: case '\0': prestr--; return '\0';
1238: case '^':
1239: case '$':
1.24 millert 1240: return c;
1.1 tholo 1241: case '(':
1.24 millert 1242: parens++;
1243: return c;
1.1 tholo 1244: case ')':
1.24 millert 1245: if (parens) {
1246: parens--;
1247: return c;
1248: }
1249: /* unmatched close parenthesis; per POSIX, treat as literal */
1250: rlxval = c;
1251: return CHAR;
1.1 tholo 1252: case '\\':
1.17 millert 1253: rlxval = quoted(&prestr);
1.1 tholo 1254: return CHAR;
1255: default:
1256: rlxval = c;
1257: return CHAR;
1.25 millert 1258: case '[':
1.35 millert 1259: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1260: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1261: bp = buf;
1.1 tholo 1262: if (*prestr == '^') {
1263: cflag = 1;
1264: prestr++;
1265: }
1266: else
1267: cflag = 0;
1.38 millert 1268: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1269: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1270: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1271: for (; ; ) {
1.38 millert 1272: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1273: for (i = 0; i < n; i++)
1274: *bp++ = *prestr++;
1275: continue;
1276: }
1.1 tholo 1277: if ((c = *prestr++) == '\\') {
1.5 kstailey 1278: *bp++ = '\\';
1.1 tholo 1279: if ((c = *prestr++) == '\0')
1.8 millert 1280: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1281: *bp++ = c;
1.10 millert 1282: /* } else if (c == '\n') { */
1283: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1284: } else if (c == '[' && *prestr == ':') {
1285: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1286: for (cc = charclasses; cc->cc_name; cc++)
1287: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1288: break;
1289: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1290: prestr[2 + cc->cc_namelen] == ']') {
1291: prestr += cc->cc_namelen + 3;
1.22 millert 1292: /*
1293: * BUG: We begin at 1, instead of 0, since we
1294: * would otherwise prematurely terminate the
1295: * string for classes like [[:cntrl:]]. This
1296: * means that we can't match the NUL character,
1297: * not without first adapting the entire
1298: * program to track each string's length.
1299: */
1.23 millert 1300: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1301: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1302: FATAL("out of space for reg expr %.10s...", lastre);
1303: if (cc->cc_func(i)) {
1.32 millert 1304: /* escape backslash */
1305: if (i == '\\') {
1306: *bp++ = '\\';
1307: n++;
1308: }
1309:
1.12 millert 1310: *bp++ = i;
1311: n++;
1312: }
1313: }
1.11 millert 1314: } else
1315: *bp++ = c;
1.23 millert 1316: } else if (c == '[' && *prestr == '.') {
1317: char collate_char;
1318: prestr++;
1319: collate_char = *prestr++;
1320: if (*prestr == '.' && prestr[1] == ']') {
1321: prestr += 2;
1322: /* Found it: map via locale TBD: for
1323: now, simply return this char. This
1324: is sufficient to pass conformance
1325: test awk.ex 156
1326: */
1327: if (*prestr == ']') {
1328: prestr++;
1329: rlxval = collate_char;
1330: return CHAR;
1331: }
1332: }
1333: } else if (c == '[' && *prestr == '=') {
1334: char equiv_char;
1335: prestr++;
1336: equiv_char = *prestr++;
1337: if (*prestr == '=' && prestr[1] == ']') {
1338: prestr += 2;
1339: /* Found it: map via locale TBD: for now
1340: simply return this char. This is
1341: sufficient to pass conformance test
1342: awk.ex 156
1343: */
1344: if (*prestr == ']') {
1345: prestr++;
1346: rlxval = equiv_char;
1347: return CHAR;
1348: }
1349: }
1.1 tholo 1350: } else if (c == '\0') {
1.8 millert 1351: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1352: } else if (bp == buf) { /* 1st char is special */
1353: *bp++ = c;
1.1 tholo 1354: } else if (c == ']') {
1.5 kstailey 1355: *bp++ = 0;
1.10 millert 1356: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1357: if (cflag == 0)
1358: return CCL;
1359: else
1360: return NCCL;
1361: } else
1.5 kstailey 1362: *bp++ = c;
1.1 tholo 1363: }
1.23 millert 1364: break;
1365: case '{':
1366: if (isdigit(*(prestr))) {
1367: num = 0; /* Process as a repetition */
1368: n = -1; m = -1;
1.29 millert 1369: commafound = false;
1370: digitfound = false;
1.23 millert 1371: startreptok = prestr-1;
1372: /* Remember start of previous atom here ? */
1373: } else { /* just a { char, not a repetition */
1374: rlxval = c;
1375: return CHAR;
1376: }
1377: for (; ; ) {
1378: if ((c = *prestr++) == '}') {
1379: if (commafound) {
1380: if (digitfound) { /* {n,m} */
1381: m = num;
1.30 millert 1382: if (m < n)
1.23 millert 1383: FATAL("illegal repetition expression: class %.20s",
1384: lastre);
1.30 millert 1385: if (n == 0 && m == 1) {
1.23 millert 1386: return QUEST;
1387: }
1388: } else { /* {n,} */
1.30 millert 1389: if (n == 0)
1390: return STAR;
1391: else if (n == 1)
1392: return PLUS;
1.23 millert 1393: }
1394: } else {
1395: if (digitfound) { /* {n} same as {n,n} */
1396: n = num;
1397: m = num;
1398: } else { /* {} */
1399: FATAL("illegal repetition expression: class %.20s",
1400: lastre);
1401: }
1402: }
1403: if (repeat(starttok, prestr-starttok, lastatom,
1404: startreptok - lastatom, n, m) > 0) {
1.30 millert 1405: if (n == 0 && m == 0) {
1.31 millert 1406: return ZERO;
1.23 millert 1407: }
1408: /* must rescan input for next token */
1409: goto rescan;
1410: }
1411: /* Failed to replace: eat up {...} characters
1412: and treat like just PLUS */
1413: return PLUS;
1414: } else if (c == '\0') {
1415: FATAL("nonterminated character class %.20s",
1416: lastre);
1417: } else if (isdigit(c)) {
1418: num = 10 * num + c - '0';
1.29 millert 1419: digitfound = true;
1.23 millert 1420: } else if (c == ',') {
1421: if (commafound)
1422: FATAL("illegal repetition expression: class %.20s",
1423: lastre);
1424: /* looking for {n,} or {n,m} */
1.29 millert 1425: commafound = true;
1.23 millert 1426: n = num;
1.29 millert 1427: digitfound = false; /* reset */
1.23 millert 1428: num = 0;
1429: } else {
1430: FATAL("illegal repetition expression: class %.20s",
1431: lastre);
1432: }
1433: }
1434: break;
1.1 tholo 1435: }
1436: }
1437:
1438: int cgoto(fa *f, int s, int c)
1439: {
1.27 millert 1440: int *p, *q;
1.1 tholo 1441: int i, j, k;
1442:
1.38 millert 1443: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1444: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1445: resizesetvec(__func__);
1.1 tholo 1446: }
1447: for (i = 0; i <= f->accept; i++)
1448: setvec[i] = 0;
1449: setcnt = 0;
1.27 millert 1450: resize_state(f, s);
1.1 tholo 1451: /* compute positions of gototab[s,c] into setvec */
1452: p = f->posns[s];
1453: for (i = 1; i <= *p; i++) {
1454: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1455: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1456: || (k == DOT && c != 0 && c != HAT)
1457: || (k == ALL && c != 0)
1.15 millert 1458: || (k == EMPTYRE && c != 0)
1.38 millert 1459: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1460: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1461: q = f->re[p[i]].lfollow;
1462: for (j = 1; j <= *q; j++) {
1463: if (q[j] >= maxsetvec) {
1.27 millert 1464: resizesetvec(__func__);
1.1 tholo 1465: }
1466: if (setvec[q[j]] == 0) {
1467: setcnt++;
1468: setvec[q[j]] = 1;
1469: }
1470: }
1471: }
1472: }
1473: }
1474: /* determine if setvec is a previous state */
1475: tmpset[0] = setcnt;
1476: j = 1;
1477: for (i = f->accept; i >= 0; i--)
1478: if (setvec[i]) {
1479: tmpset[j++] = i;
1480: }
1.27 millert 1481: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1482: /* tmpset == previous state? */
1483: for (i = 1; i <= f->curstat; i++) {
1484: p = f->posns[i];
1485: if ((k = tmpset[0]) != p[0])
1486: goto different;
1487: for (j = 1; j <= k; j++)
1488: if (tmpset[j] != p[j])
1489: goto different;
1490: /* setvec is state i */
1.30 millert 1491: if (c != HAT)
1.38 millert 1492: set_gototab(f, s, c, i);
1.1 tholo 1493: return i;
1494: different:;
1495: }
1496:
1497: /* add tmpset to current set of states */
1.27 millert 1498: ++(f->curstat);
1499: resize_state(f, f->curstat);
1.1 tholo 1500: xfree(f->posns[f->curstat]);
1.27 millert 1501: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1502:
1503: f->posns[f->curstat] = p;
1.30 millert 1504: if (c != HAT)
1.38 millert 1505: set_gototab(f, s, c, f->curstat);
1.1 tholo 1506: for (i = 0; i <= setcnt; i++)
1507: p[i] = tmpset[i];
1508: if (setvec[f->accept])
1509: f->out[f->curstat] = 1;
1510: else
1511: f->out[f->curstat] = 0;
1512: return f->curstat;
1513: }
1514:
1515:
1516: void freefa(fa *f) /* free a finite automaton */
1517: {
1518: int i;
1519:
1520: if (f == NULL)
1521: return;
1.27 millert 1522: for (i = 0; i < f->state_count; i++)
1523: xfree(f->gototab[i])
1.1 tholo 1524: for (i = 0; i <= f->curstat; i++)
1525: xfree(f->posns[i]);
1526: for (i = 0; i <= f->accept; i++) {
1527: xfree(f->re[i].lfollow);
1528: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1529: xfree(f->re[i].lval.np);
1.1 tholo 1530: }
1531: xfree(f->restr);
1.27 millert 1532: xfree(f->out);
1533: xfree(f->posns);
1534: xfree(f->gototab);
1.1 tholo 1535: xfree(f);
1536: }