Annotation of src/usr.bin/awk/b.c, Revision 1.39
1.39 ! millert 1: /* $OpenBSD: b.c,v 1.38 2023/09/17 14:49:44 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.38 millert 84:
85: /* utf-8 mechanism:
86:
87: For most of Awk, utf-8 strings just "work", since they look like
88: null-terminated sequences of 8-bit bytes.
89:
90: Functions like length(), index(), and substr() have to operate
91: in units of utf-8 characters. The u8_* functions in run.c
92: handle this.
93:
94: Regular expressions are more complicated, since the basic
95: mechanism of the goto table used 8-bit byte indices into the
96: gototab entries to compute the next state. Unicode is a lot
97: bigger, so the gototab entries are now structs with a character
98: and a next state, and there is a linear search of the characters
99: to find the state. (Yes, this is slower, by a significant
100: amount. Tough.)
101:
102: Throughout the RE mechanism in b.c, utf-8 characters are
103: converted to their utf-32 value. This mostly shows up in
104: cclenter, which expands character class ranges like a-z and now
105: alpha-omega. The size of a gototab array is still about 256.
106: This should be dynamic, but for now things work ok for a single
107: code page of Unicode, which is the most likely case.
108:
109: The code changes are localized in run.c and b.c. I have added a
110: handful of functions to somewhat better hide the implementation,
111: but a lot more could be done.
112:
113: */
114:
115: static int get_gototab(fa*, int, int);
116: static int set_gototab(fa*, int, int, int);
117: extern int u8_rune(int *, const uschar *);
118:
1.27 millert 119: static int *
120: intalloc(size_t n, const char *f)
121: {
1.35 millert 122: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 123: if (p == NULL)
124: overflo(f);
125: return p;
126: }
127:
128: static void
129: allocsetvec(const char *f)
130: {
131: maxsetvec = MAXLIN;
1.35 millert 132: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
133: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 134: if (setvec == NULL || tmpset == NULL)
135: overflo(f);
136: }
137:
138: static void
139: resizesetvec(const char *f)
140: {
1.35 millert 141: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
142: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 143: if (setvec == NULL || tmpset == NULL)
144: overflo(f);
145: maxsetvec *= 4;
146: }
147:
148: static void
149: resize_state(fa *f, int state)
150: {
1.38 millert 151: gtt **p;
1.35 millert 152: uschar *p2;
153: int **p3;
1.27 millert 154: int i, new_count;
155:
156: if (++state < f->state_count)
157: return;
158:
159: new_count = state + 10; /* needs to be tuned */
160:
1.38 millert 161: p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 162: if (p == NULL)
163: goto out;
164: f->gototab = p;
165:
1.35 millert 166: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
167: if (p2 == NULL)
1.27 millert 168: goto out;
1.35 millert 169: f->out = p2;
1.27 millert 170:
1.35 millert 171: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
172: if (p3 == NULL)
1.27 millert 173: goto out;
1.35 millert 174: f->posns = p3;
1.27 millert 175:
176: for (i = f->state_count; i < new_count; ++i) {
1.38 millert 177: f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 178: if (f->gototab[i] == NULL)
179: goto out;
180: f->out[i] = 0;
181: f->posns[i] = NULL;
182: }
1.38 millert 183: f->gototab_len = NCHARS; /* should be variable, growable */
1.27 millert 184: f->state_count = new_count;
185: return;
186: out:
187: overflo(__func__);
188: }
189:
1.29 millert 190: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 191: {
192: int i, use, nuse;
193: fa *pfa;
1.6 millert 194: static int now = 1;
1.1 tholo 195:
1.25 millert 196: if (setvec == NULL) { /* first time through any RE */
1.27 millert 197: allocsetvec(__func__);
1.1 tholo 198: }
199:
1.29 millert 200: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 201: return mkdfa(s, anchor);
202: for (i = 0; i < nfatab; i++) /* is it there already? */
203: if (fatab[i]->anchor == anchor
1.11 millert 204: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 205: fatab[i]->use = now++;
1.1 tholo 206: return fatab[i];
1.10 millert 207: }
1.1 tholo 208: pfa = mkdfa(s, anchor);
209: if (nfatab < NFA) { /* room for another */
210: fatab[nfatab] = pfa;
1.6 millert 211: fatab[nfatab]->use = now++;
1.1 tholo 212: nfatab++;
213: return pfa;
214: }
215: use = fatab[0]->use; /* replace least-recently used */
216: nuse = 0;
217: for (i = 1; i < nfatab; i++)
218: if (fatab[i]->use < use) {
219: use = fatab[i]->use;
220: nuse = i;
221: }
222: freefa(fatab[nuse]);
223: fatab[nuse] = pfa;
1.6 millert 224: pfa->use = now++;
1.1 tholo 225: return pfa;
226: }
227:
1.29 millert 228: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 229: /* anchor = true for anchored matches, else false */
1.1 tholo 230: {
231: Node *p, *p1;
232: fa *f;
233:
1.28 millert 234: firstbasestr = (const uschar *) s;
1.23 millert 235: basestr = firstbasestr;
1.1 tholo 236: p = reparse(s);
237: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
238: /* put ALL STAR in front of reg. exp. */
239: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
240: /* put FINAL after reg. exp. */
241:
242: poscnt = 0;
243: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 244: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 245: overflo(__func__);
1.1 tholo 246: f->accept = poscnt-1; /* penter has computed number of positions in re */
247: cfoll(f, p1); /* set up follow sets */
248: freetr(p1);
1.27 millert 249: resize_state(f, 1);
250: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
251: f->posns[1] = intalloc(1, __func__);
1.1 tholo 252: *f->posns[1] = 0;
253: f->initstat = makeinit(f, anchor);
254: f->anchor = anchor;
1.10 millert 255: f->restr = (uschar *) tostring(s);
1.23 millert 256: if (firstbasestr != basestr) {
257: if (basestr)
258: xfree(basestr);
259: }
1.1 tholo 260: return f;
261: }
262:
1.29 millert 263: int makeinit(fa *f, bool anchor)
1.1 tholo 264: {
265: int i, k;
266:
267: f->curstat = 2;
268: f->out[2] = 0;
269: k = *(f->re[0].lfollow);
1.25 millert 270: xfree(f->posns[2]);
1.27 millert 271: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 272: for (i = 0; i <= k; i++) {
1.1 tholo 273: (f->posns[2])[i] = (f->re[0].lfollow)[i];
274: }
275: if ((f->posns[2])[1] == f->accept)
276: f->out[2] = 1;
1.30 millert 277: for (i = 0; i < NCHARS; i++)
1.38 millert 278: set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
1.1 tholo 279: f->curstat = cgoto(f, 2, HAT);
280: if (anchor) {
281: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 282: for (i = 0; i < k; i++) {
1.1 tholo 283: (f->posns[0])[i] = (f->posns[2])[i];
284: }
285:
286: f->out[0] = f->out[2];
287: if (f->curstat != 2)
288: --(*f->posns[f->curstat]);
289: }
290: return f->curstat;
291: }
292:
293: void penter(Node *p) /* set up parent pointers and leaf indices */
294: {
295: switch (type(p)) {
1.15 millert 296: ELEAF
1.1 tholo 297: LEAF
1.7 millert 298: info(p) = poscnt;
1.1 tholo 299: poscnt++;
300: break;
301: UNARY
302: penter(left(p));
303: parent(left(p)) = p;
304: break;
305: case CAT:
306: case OR:
307: penter(left(p));
308: penter(right(p));
309: parent(left(p)) = p;
310: parent(right(p)) = p;
311: break;
1.31 millert 312: case ZERO:
313: break;
1.1 tholo 314: default: /* can't happen */
1.8 millert 315: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 316: break;
317: }
318: }
319:
320: void freetr(Node *p) /* free parse tree */
321: {
322: switch (type(p)) {
1.15 millert 323: ELEAF
1.1 tholo 324: LEAF
325: xfree(p);
326: break;
327: UNARY
1.31 millert 328: case ZERO:
1.1 tholo 329: freetr(left(p));
330: xfree(p);
331: break;
332: case CAT:
333: case OR:
334: freetr(left(p));
335: freetr(right(p));
336: xfree(p);
337: break;
338: default: /* can't happen */
1.8 millert 339: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 340: break;
341: }
342: }
343:
344: /* in the parsing of regular expressions, metacharacters like . have */
345: /* to be seen literally; \056 is not a metacharacter. */
346:
1.38 millert 347: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 348: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 349: const uschar *p;
1.1 tholo 350: int n = 0;
1.2 millert 351: int i;
1.1 tholo 352:
1.38 millert 353: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 354: if (isdigit(*p))
355: n = 16 * n + *p - '0';
356: else if (*p >= 'a' && *p <= 'f')
357: n = 16 * n + *p - 'a' + 10;
358: else if (*p >= 'A' && *p <= 'F')
359: n = 16 * n + *p - 'A' + 10;
360: }
1.28 millert 361: *pp = p;
1.1 tholo 362: return n;
363: }
364:
1.2 millert 365: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 366:
1.28 millert 367: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 368: /* and increment *pp */
369: {
1.28 millert 370: const uschar *p = *pp;
1.1 tholo 371: int c;
372:
1.38 millert 373: /* BUG: should advance by utf-8 char even if makes no sense */
374:
375: if ((c = *p++) == 't') {
1.1 tholo 376: c = '\t';
1.38 millert 377: } else if (c == 'n') {
1.1 tholo 378: c = '\n';
1.38 millert 379: } else if (c == 'f') {
1.1 tholo 380: c = '\f';
1.38 millert 381: } else if (c == 'r') {
1.1 tholo 382: c = '\r';
1.38 millert 383: } else if (c == 'b') {
1.1 tholo 384: c = '\b';
1.38 millert 385: } else if (c == 'v') {
1.25 millert 386: c = '\v';
1.38 millert 387: } else if (c == 'a') {
1.25 millert 388: c = '\a';
1.38 millert 389: } else if (c == '\\') {
1.1 tholo 390: c = '\\';
1.38 millert 391: } else if (c == 'x') { /* 2 hex digits follow */
392: c = hexstr(&p, 2); /* this adds a null if number is invalid */
393: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
394: c = hexstr(&p, 8);
1.1 tholo 395: } else if (isoctdigit(c)) { /* \d \dd \ddd */
396: int n = c - '0';
397: if (isoctdigit(*p)) {
398: n = 8 * n + *p++ - '0';
399: if (isoctdigit(*p))
400: n = 8 * n + *p++ - '0';
401: }
402: c = n;
403: } /* else */
404: /* c = c; */
405: *pp = p;
406: return c;
407: }
408:
1.38 millert 409: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 410: {
411: int i, c, c2;
1.38 millert 412: int n;
413: const uschar *p = (const uschar *) argp;
414: int *bp, *retp;
415: static int *buf = NULL;
1.5 kstailey 416: static int bufsz = 100;
1.1 tholo 417:
1.38 millert 418: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 419: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 420: bp = buf;
1.38 millert 421: for (i = 0; *p != 0; ) {
422: n = u8_rune(&c, p);
423: p += n;
1.1 tholo 424: if (c == '\\') {
1.17 millert 425: c = quoted(&p);
1.5 kstailey 426: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 427: if (*p != 0) {
1.5 kstailey 428: c = bp[-1];
1.38 millert 429: /* c2 = *p++; */
430: n = u8_rune(&c2, p);
431: p += n;
1.1 tholo 432: if (c2 == '\\')
1.38 millert 433: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 434: if (c > c2) { /* empty; ignore */
1.5 kstailey 435: bp--;
1.1 tholo 436: i--;
437: continue;
438: }
439: while (c < c2) {
1.38 millert 440: if (i >= bufsz) {
441: buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
442: if (buf == NULL)
443: FATAL("out of space for character class [%.10s...] 2", p);
444: bufsz *= 2;
445: bp = buf + i;
446: }
1.5 kstailey 447: *bp++ = ++c;
1.1 tholo 448: i++;
449: }
450: continue;
451: }
452: }
1.38 millert 453: if (i >= bufsz) {
454: buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
455: if (buf == NULL)
456: FATAL("out of space for character class [%.10s...] 2", p);
457: bufsz *= 2;
458: bp = buf + i;
459: }
1.5 kstailey 460: *bp++ = c;
1.1 tholo 461: i++;
462: }
1.5 kstailey 463: *bp = 0;
1.38 millert 464: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
465: /* xfree(op); BUG: what are we freeing here? */
466: retp = (int *) calloc(bp-buf+1, sizeof(int));
467: for (i = 0; i < bp-buf+1; i++)
468: retp[i] = buf[i];
469: return retp;
1.1 tholo 470: }
471:
1.11 millert 472: void overflo(const char *s)
1.1 tholo 473: {
1.27 millert 474: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 475: }
476:
477: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
478: {
479: int i;
1.4 millert 480: int *p;
1.1 tholo 481:
482: switch (type(v)) {
1.15 millert 483: ELEAF
1.1 tholo 484: LEAF
1.7 millert 485: f->re[info(v)].ltype = type(v);
486: f->re[info(v)].lval.np = right(v);
1.1 tholo 487: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 488: resizesetvec(__func__);
1.1 tholo 489: }
490: for (i = 0; i <= f->accept; i++)
491: setvec[i] = 0;
492: setcnt = 0;
493: follow(v); /* computes setvec and setcnt */
1.27 millert 494: p = intalloc(setcnt + 1, __func__);
1.7 millert 495: f->re[info(v)].lfollow = p;
1.1 tholo 496: *p = setcnt;
497: for (i = f->accept; i >= 0; i--)
498: if (setvec[i] == 1)
499: *++p = i;
500: break;
501: UNARY
502: cfoll(f,left(v));
503: break;
504: case CAT:
505: case OR:
506: cfoll(f,left(v));
507: cfoll(f,right(v));
508: break;
1.31 millert 509: case ZERO:
510: break;
1.1 tholo 511: default: /* can't happen */
1.8 millert 512: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 513: }
514: }
515:
516: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 517: /* returns 0 if p matches empty string */
1.1 tholo 518: {
1.4 millert 519: int b, lp;
1.1 tholo 520:
521: switch (type(p)) {
1.15 millert 522: ELEAF
1.1 tholo 523: LEAF
1.7 millert 524: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 525: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 526: resizesetvec(__func__);
1.1 tholo 527: }
1.15 millert 528: if (type(p) == EMPTYRE) {
529: setvec[lp] = 0;
530: return(0);
531: }
1.1 tholo 532: if (setvec[lp] != 1) {
533: setvec[lp] = 1;
534: setcnt++;
535: }
536: if (type(p) == CCL && (*(char *) right(p)) == '\0')
537: return(0); /* empty CCL */
1.30 millert 538: return(1);
1.1 tholo 539: case PLUS:
1.30 millert 540: if (first(left(p)) == 0)
541: return(0);
1.1 tholo 542: return(1);
543: case STAR:
544: case QUEST:
545: first(left(p));
546: return(0);
547: case CAT:
548: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
549: return(1);
550: case OR:
551: b = first(right(p));
552: if (first(left(p)) == 0 || b == 0) return(0);
553: return(1);
1.31 millert 554: case ZERO:
555: return 0;
1.1 tholo 556: }
1.8 millert 557: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 558: return(-1);
559: }
560:
561: void follow(Node *v) /* collects leaves that can follow v into setvec */
562: {
563: Node *p;
564:
565: if (type(v) == FINAL)
566: return;
567: p = parent(v);
568: switch (type(p)) {
569: case STAR:
570: case PLUS:
571: first(v);
572: follow(p);
573: return;
574:
575: case OR:
576: case QUEST:
577: follow(p);
578: return;
579:
580: case CAT:
581: if (v == left(p)) { /* v is left child of p */
582: if (first(right(p)) == 0) {
583: follow(p);
584: return;
585: }
586: } else /* v is right child */
587: follow(p);
588: return;
589: }
590: }
591:
1.38 millert 592: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 593: {
1.38 millert 594: int *s = (int *) sarg;
1.10 millert 595:
1.1 tholo 596: while (*s)
597: if (c == *s++)
598: return(1);
599: return(0);
600: }
601:
1.38 millert 602: static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
603: {
604: int i;
605: for (i = 0; i < f->gototab_len; i++) {
606: if (f->gototab[state][i].ch == 0)
607: break;
608: if (f->gototab[state][i].ch == ch)
609: return f->gototab[state][i].state;
610: }
611: return 0;
612: }
613:
614: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
615: {
616: int i;
617: for (i = 0; i < f->gototab_len; i++) {
618: if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
619: f->gototab[state][i].ch = ch;
620: f->gototab[state][i].state = val;
621: return val;
622: }
623: }
624: overflo(__func__);
625: return val; /* not used anywhere at the moment */
626: }
627:
1.11 millert 628: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 629: {
630: int s, ns;
1.38 millert 631: int n;
632: int rune;
1.28 millert 633: const uschar *p = (const uschar *) p0;
1.1 tholo 634:
1.38 millert 635: /* return pmatch(f, p0); does it matter whether longest or shortest? */
636:
1.27 millert 637: s = f->initstat;
638: assert (s < f->state_count);
639:
1.1 tholo 640: if (f->out[s])
641: return(1);
642: do {
1.15 millert 643: /* assert(*p < NCHARS); */
1.38 millert 644: n = u8_rune(&rune, p);
645: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 646: s = ns;
647: else
1.38 millert 648: s = cgoto(f, s, rune);
1.1 tholo 649: if (f->out[s])
650: return(1);
1.38 millert 651: if (*p == 0)
652: break;
653: p += n;
654: } while (1); /* was *p++ != 0 */
1.1 tholo 655: return(0);
656: }
657:
1.11 millert 658: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 659: {
660: int s, ns;
1.38 millert 661: int n;
662: int rune;
1.28 millert 663: const uschar *p = (const uschar *) p0;
664: const uschar *q;
1.1 tholo 665:
1.27 millert 666: s = f->initstat;
667: assert(s < f->state_count);
668:
1.28 millert 669: patbeg = (const char *)p;
1.1 tholo 670: patlen = -1;
671: do {
672: q = p;
673: do {
674: if (f->out[s]) /* final state */
675: patlen = q-p;
1.15 millert 676: /* assert(*q < NCHARS); */
1.38 millert 677: n = u8_rune(&rune, q);
678: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 679: s = ns;
680: else
1.38 millert 681: s = cgoto(f, s, rune);
1.27 millert 682:
683: assert(s < f->state_count);
684:
1.9 deraadt 685: if (s == 1) { /* no transition */
1.1 tholo 686: if (patlen >= 0) {
1.28 millert 687: patbeg = (const char *) p;
1.1 tholo 688: return(1);
689: }
690: else
691: goto nextin; /* no match */
1.9 deraadt 692: }
1.38 millert 693: if (*q == 0)
694: break;
695: q += n;
696: } while (1);
697: q++; /* was *q++ */
1.1 tholo 698: if (f->out[s])
699: patlen = q-p-1; /* don't count $ */
700: if (patlen >= 0) {
1.28 millert 701: patbeg = (const char *) p;
1.1 tholo 702: return(1);
703: }
704: nextin:
705: s = 2;
1.38 millert 706: if (*p == 0)
707: break;
708: n = u8_rune(&rune, p);
709: p += n;
710: } while (1); /* was *p++ */
1.1 tholo 711: return (0);
712: }
713:
1.11 millert 714: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 715: {
716: int s, ns;
1.38 millert 717: int n;
718: int rune;
1.28 millert 719: const uschar *p = (const uschar *) p0;
720: const uschar *q;
1.1 tholo 721:
1.27 millert 722: s = f->initstat;
723: assert(s < f->state_count);
724:
1.28 millert 725: patbeg = (const char *)p;
1.1 tholo 726: patlen = -1;
727: while (*p) {
728: q = p;
729: do {
730: if (f->out[s]) /* final state */
731: patlen = q-p;
1.15 millert 732: /* assert(*q < NCHARS); */
1.38 millert 733: n = u8_rune(&rune, q);
734: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 735: s = ns;
736: else
1.38 millert 737: s = cgoto(f, s, rune);
1.9 deraadt 738: if (s == 1) { /* no transition */
1.1 tholo 739: if (patlen > 0) {
1.28 millert 740: patbeg = (const char *) p;
1.1 tholo 741: return(1);
742: } else
743: goto nnextin; /* no nonempty match */
1.9 deraadt 744: }
1.38 millert 745: if (*q == 0)
746: break;
747: q += n;
748: } while (1);
749: q++;
1.1 tholo 750: if (f->out[s])
751: patlen = q-p-1; /* don't count $ */
752: if (patlen > 0 ) {
1.28 millert 753: patbeg = (const char *) p;
1.1 tholo 754: return(1);
755: }
756: nnextin:
757: s = 2;
758: p++;
759: }
760: return (0);
761: }
762:
1.38 millert 763: static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum,
764: int *curpos, int *lastpos)
765: {
766: int c = 0;
767: char *buf = *pbuf;
768: static const int max_bytes = 4; // max multiple bytes in UTF-8 is 4
769: int i, rune;
770: uschar private_buf[max_bytes + 1];
771:
772: for (i = 0; i <= max_bytes; i++) {
773: if (++*curpos == *lastpos) {
774: if (*lastpos == *pbufsize)
775: if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune"))
776: FATAL("stream '%.30s...' too long", buf);
777: buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0;
778: private_buf[i] = c;
779: }
780: if (c == 0 || c < 128 || (c >> 6) == 4) { // 10xxxxxx starts a new character
781: ungetc(c, fp);
782: private_buf[i] = 0;
783: break;
784: }
785: }
786:
787: u8_rune(& rune, private_buf);
788:
789: return rune;
790: }
791:
1.26 millert 792:
793: /*
794: * NAME
795: * fnematch
796: *
797: * DESCRIPTION
798: * A stream-fed version of nematch which transfers characters to a
799: * null-terminated buffer. All characters up to and including the last
800: * character of the matching text or EOF are placed in the buffer. If
801: * a match is found, patbeg and patlen are set appropriately.
802: *
803: * RETURN VALUES
1.29 millert 804: * false No match found.
805: * true Match found.
1.26 millert 806: */
807:
1.29 millert 808: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 809: {
810: char *buf = *pbuf;
811: int bufsize = *pbufsize;
812: int c, i, j, k, ns, s;
1.38 millert 813: int rune;
1.26 millert 814:
815: s = pfa->initstat;
816: patlen = 0;
817:
818: /*
819: * All indices relative to buf.
820: * i <= j <= k <= bufsize
821: *
822: * i: origin of active substring
823: * j: current character
824: * k: destination of next getc()
825: */
826: i = -1, k = 0;
827: do {
828: j = i++;
829: do {
830: if (++j == k) {
831: if (k == bufsize)
832: if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
833: FATAL("stream '%.30s...' too long", buf);
834: buf[k++] = (c = getc(f)) != EOF ? c : 0;
835: }
1.33 millert 836: c = (uschar)buf[j];
1.39 ! millert 837: if (c < 128 || awk_mb_cur_max == 1)
1.38 millert 838: rune = c;
839: else {
840: j--;
841: k--;
842: ungetc(c, f);
843: rune = getrune(f, &buf, &bufsize, quantum, &j, &k);
844: }
1.26 millert 845:
1.38 millert 846: if ((ns = get_gototab(pfa, s, rune)) != 0)
1.26 millert 847: s = ns;
848: else
1.38 millert 849: s = cgoto(pfa, s, rune);
1.26 millert 850:
851: if (pfa->out[s]) { /* final state */
852: patlen = j - i + 1;
853: if (c == 0) /* don't count $ */
854: patlen--;
855: }
856: } while (buf[j] && s != 1);
857: s = 2;
858: } while (buf[i] && !patlen);
859:
860: /* adjbuf() may have relocated a resized buffer. Inform the world. */
861: *pbuf = buf;
862: *pbufsize = bufsize;
863:
864: if (patlen) {
865: patbeg = buf + i;
866: /*
867: * Under no circumstances is the last character fed to
868: * the automaton part of the match. It is EOF's nullbyte,
869: * or it sent the automaton into a state with no further
870: * transitions available (s==1), or both. Room for a
871: * terminating nullbyte is guaranteed.
872: *
873: * ungetc any chars after the end of matching text
874: * (except for EOF's nullbyte, if present) and null
875: * terminate the buffer.
876: */
877: do
878: if (buf[--k] && ungetc(buf[k], f) == EOF)
879: FATAL("unable to ungetc '%c'", buf[k]);
880: while (k > i + patlen);
1.30 millert 881: buf[k] = '\0';
1.29 millert 882: return true;
1.26 millert 883: }
884: else
1.29 millert 885: return false;
1.26 millert 886: }
887:
1.11 millert 888: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 889: { /* uses relex() to scan regular expression */
890: Node *np;
891:
1.32 millert 892: DPRINTF("reparse <%s>\n", p);
1.28 millert 893: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 894: rtok = relex();
1.11 millert 895: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 896: if (rtok == '\0') {
1.11 millert 897: /* FATAL("empty regular expression"); previous */
1.15 millert 898: return(op2(EMPTYRE, NIL, NIL));
899: }
1.1 tholo 900: np = regexp();
901: if (rtok != '\0')
1.8 millert 902: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 903: return(np);
904: }
905:
906: Node *regexp(void) /* top-level parse of reg expr */
907: {
908: return (alt(concat(primary())));
909: }
910:
911: Node *primary(void)
912: {
913: Node *np;
1.23 millert 914: int savelastatom;
1.1 tholo 915:
916: switch (rtok) {
917: case CHAR:
1.23 millert 918: lastatom = starttok;
1.7 millert 919: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 920: rtok = relex();
921: return (unary(np));
922: case ALL:
923: rtok = relex();
924: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 925: case EMPTYRE:
926: rtok = relex();
1.23 millert 927: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 928: case DOT:
1.23 millert 929: lastatom = starttok;
1.1 tholo 930: rtok = relex();
931: return (unary(op2(DOT, NIL, NIL)));
932: case CCL:
1.28 millert 933: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 934: lastatom = starttok;
1.1 tholo 935: rtok = relex();
936: return (unary(np));
937: case NCCL:
1.28 millert 938: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 939: lastatom = starttok;
1.1 tholo 940: rtok = relex();
941: return (unary(np));
942: case '^':
943: rtok = relex();
1.7 millert 944: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 945: case '$':
946: rtok = relex();
947: return (unary(op2(CHAR, NIL, NIL)));
948: case '(':
1.23 millert 949: lastatom = starttok;
950: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 951: rtok = relex();
952: if (rtok == ')') { /* special pleading for () */
953: rtok = relex();
954: return unary(op2(CCL, NIL, (Node *) tostring("")));
955: }
956: np = regexp();
957: if (rtok == ')') {
1.23 millert 958: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 959: rtok = relex();
960: return (unary(np));
961: }
962: else
1.8 millert 963: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 964: default:
1.8 millert 965: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 966: }
967: return 0; /*NOTREACHED*/
968: }
969:
970: Node *concat(Node *np)
971: {
972: switch (rtok) {
1.23 millert 973: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 974: return (concat(op2(CAT, np, primary())));
1.23 millert 975: case EMPTYRE:
976: rtok = relex();
977: return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
978: primary())));
1.1 tholo 979: }
980: return (np);
981: }
982:
983: Node *alt(Node *np)
984: {
985: if (rtok == OR) {
986: rtok = relex();
987: return (alt(op2(OR, np, concat(primary()))));
988: }
989: return (np);
990: }
991:
992: Node *unary(Node *np)
993: {
994: switch (rtok) {
995: case STAR:
996: rtok = relex();
997: return (unary(op2(STAR, np, NIL)));
998: case PLUS:
999: rtok = relex();
1000: return (unary(op2(PLUS, np, NIL)));
1001: case QUEST:
1002: rtok = relex();
1003: return (unary(op2(QUEST, np, NIL)));
1.31 millert 1004: case ZERO:
1005: rtok = relex();
1006: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1007: default:
1008: return (np);
1009: }
1010: }
1011:
1.11 millert 1012: /*
1013: * Character class definitions conformant to the POSIX locale as
1014: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1015: * and operating character sets are both ASCII (ISO646) or supersets
1016: * thereof.
1017: *
1018: * Note that to avoid overflowing the temporary buffer used in
1019: * relex(), the expanded character class (prior to range expansion)
1020: * must be less than twice the size of their full name.
1021: */
1.12 millert 1022:
1023: /* Because isblank doesn't show up in any of the header files on any
1024: * system i use, it's defined here. if some other locale has a richer
1025: * definition of "blank", define HAS_ISBLANK and provide your own
1026: * version.
1027: * the parentheses here are an attempt to find a path through the maze
1028: * of macro definition and/or function and/or version provided. thanks
1029: * to nelson beebe for the suggestion; let's see if it works everywhere.
1030: */
1031:
1032: #ifndef HAS_ISBLANK
1033:
1.17 millert 1034: int (xisblank)(int c)
1.12 millert 1035: {
1036: return c==' ' || c=='\t';
1037: }
1038:
1039: #endif
1040:
1.31 millert 1041: static const struct charclass {
1.11 millert 1042: const char *cc_name;
1043: int cc_namelen;
1.12 millert 1044: int (*cc_func)(int);
1.11 millert 1045: } charclasses[] = {
1.12 millert 1046: { "alnum", 5, isalnum },
1047: { "alpha", 5, isalpha },
1.17 millert 1048: #ifndef HAS_ISBLANK
1.21 millert 1049: { "blank", 5, xisblank },
1.17 millert 1050: #else
1.12 millert 1051: { "blank", 5, isblank },
1.17 millert 1052: #endif
1.12 millert 1053: { "cntrl", 5, iscntrl },
1054: { "digit", 5, isdigit },
1055: { "graph", 5, isgraph },
1056: { "lower", 5, islower },
1057: { "print", 5, isprint },
1058: { "punct", 5, ispunct },
1059: { "space", 5, isspace },
1060: { "upper", 5, isupper },
1061: { "xdigit", 6, isxdigit },
1.11 millert 1062: { NULL, 0, NULL },
1063: };
1064:
1.23 millert 1065: #define REPEAT_SIMPLE 0
1066: #define REPEAT_PLUS_APPENDED 1
1067: #define REPEAT_WITH_Q 2
1068: #define REPEAT_ZERO 3
1069:
1070: static int
1071: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1072: int atomlen, int firstnum, int secondnum, int special_case)
1073: {
1074: int i, j;
1.26 millert 1075: uschar *buf = NULL;
1.23 millert 1076: int ret = 1;
1.31 millert 1077: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1078: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1079: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1080: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1081: int size = prefix_length + suffix_length;
1082:
1083: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1084: size += atomlen*(firstnum-1);
1085: }
1086:
1087: /* Adjust size of buffer for special cases */
1088: if (special_case == REPEAT_PLUS_APPENDED) {
1089: size++; /* for the final + */
1090: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1091: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1092: } else if (special_case == REPEAT_ZERO) {
1093: size += 2; /* just a null ERE: () */
1094: }
1.35 millert 1095: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1096: FATAL("out of space in reg expr %.10s..", lastre);
1097: memcpy(buf, basestr, prefix_length); /* copy prefix */
1098: j = prefix_length;
1099: if (special_case == REPEAT_ZERO) {
1100: j -= atomlen;
1101: buf[j++] = '(';
1102: buf[j++] = ')';
1103: }
1.30 millert 1104: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1105: memcpy(&buf[j], atom, atomlen);
1106: j += atomlen;
1107: }
1108: if (special_case == REPEAT_PLUS_APPENDED) {
1109: buf[j++] = '+';
1110: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1111: if (init_q)
1112: buf[j++] = '?';
1.30 millert 1113: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1114: memcpy(&buf[j], atom, atomlen);
1115: j += atomlen;
1116: buf[j++] = '?';
1117: }
1118: }
1119: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1120: j += suffix_length;
1121: buf[j] = '\0';
1.23 millert 1122: /* free old basestr */
1123: if (firstbasestr != basestr) {
1124: if (basestr)
1125: xfree(basestr);
1126: }
1127: basestr = buf;
1128: prestr = buf + prefix_length;
1129: if (special_case == REPEAT_ZERO) {
1130: prestr -= atomlen;
1131: ret++;
1132: }
1133: return ret;
1134: }
1135:
1136: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1137: int atomlen, int firstnum, int secondnum)
1138: {
1139: /*
1140: In general, the repetition specifier or "bound" is replaced here
1141: by an equivalent ERE string, repeating the immediately previous atom
1142: and appending ? and + as needed. Note that the first copy of the
1143: atom is left in place, except in the special_case of a zero-repeat
1144: (i.e., {0}).
1145: */
1146: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1147: if (firstnum < 2) {
1148: /* 0 or 1: should be handled before you get here */
1149: FATAL("internal error");
1150: } else {
1151: return replace_repeat(reptok, reptoklen, atom, atomlen,
1152: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1153: }
1154: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1155: if (firstnum == 0) { /* {0} or {0,0} */
1156: /* This case is unusual because the resulting
1157: replacement string might actually be SMALLER than
1158: the original ERE */
1159: return replace_repeat(reptok, reptoklen, atom, atomlen,
1160: firstnum, secondnum, REPEAT_ZERO);
1161: } else { /* (firstnum >= 1) */
1162: return replace_repeat(reptok, reptoklen, atom, atomlen,
1163: firstnum, secondnum, REPEAT_SIMPLE);
1164: }
1165: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1166: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1167: return replace_repeat(reptok, reptoklen, atom, atomlen,
1168: firstnum, secondnum, REPEAT_WITH_Q);
1169: } else { /* Error - shouldn't be here (n>m) */
1170: FATAL("internal error");
1171: }
1172: return 0;
1173: }
1.11 millert 1174:
1.38 millert 1175: extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1176:
1.1 tholo 1177: int relex(void) /* lexical analyzer for reparse */
1178: {
1.5 kstailey 1179: int c, n;
1.1 tholo 1180: int cflag;
1.25 millert 1181: static uschar *buf = NULL;
1.5 kstailey 1182: static int bufsz = 100;
1.10 millert 1183: uschar *bp;
1.31 millert 1184: const struct charclass *cc;
1.12 millert 1185: int i;
1.29 millert 1186: int num, m;
1187: bool commafound, digitfound;
1.23 millert 1188: const uschar *startreptok;
1.24 millert 1189: static int parens = 0;
1.23 millert 1190:
1191: rescan:
1192: starttok = prestr;
1.1 tholo 1193:
1.38 millert 1194: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1195: prestr += n;
1196: starttok = prestr;
1197: return CHAR;
1198: }
1199:
1.1 tholo 1200: switch (c = *prestr++) {
1201: case '|': return OR;
1202: case '*': return STAR;
1203: case '+': return PLUS;
1204: case '?': return QUEST;
1205: case '.': return DOT;
1206: case '\0': prestr--; return '\0';
1207: case '^':
1208: case '$':
1.24 millert 1209: return c;
1.1 tholo 1210: case '(':
1.24 millert 1211: parens++;
1212: return c;
1.1 tholo 1213: case ')':
1.24 millert 1214: if (parens) {
1215: parens--;
1216: return c;
1217: }
1218: /* unmatched close parenthesis; per POSIX, treat as literal */
1219: rlxval = c;
1220: return CHAR;
1.1 tholo 1221: case '\\':
1.17 millert 1222: rlxval = quoted(&prestr);
1.1 tholo 1223: return CHAR;
1224: default:
1225: rlxval = c;
1226: return CHAR;
1.25 millert 1227: case '[':
1.35 millert 1228: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1229: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1230: bp = buf;
1.1 tholo 1231: if (*prestr == '^') {
1232: cflag = 1;
1233: prestr++;
1234: }
1235: else
1236: cflag = 0;
1.38 millert 1237: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1238: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1239: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1240: for (; ; ) {
1.38 millert 1241: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1242: for (i = 0; i < n; i++)
1243: *bp++ = *prestr++;
1244: continue;
1245: }
1.1 tholo 1246: if ((c = *prestr++) == '\\') {
1.5 kstailey 1247: *bp++ = '\\';
1.1 tholo 1248: if ((c = *prestr++) == '\0')
1.8 millert 1249: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1250: *bp++ = c;
1.10 millert 1251: /* } else if (c == '\n') { */
1252: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1253: } else if (c == '[' && *prestr == ':') {
1254: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1255: for (cc = charclasses; cc->cc_name; cc++)
1256: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1257: break;
1258: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1259: prestr[2 + cc->cc_namelen] == ']') {
1260: prestr += cc->cc_namelen + 3;
1.22 millert 1261: /*
1262: * BUG: We begin at 1, instead of 0, since we
1263: * would otherwise prematurely terminate the
1264: * string for classes like [[:cntrl:]]. This
1265: * means that we can't match the NUL character,
1266: * not without first adapting the entire
1267: * program to track each string's length.
1268: */
1.23 millert 1269: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1270: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1271: FATAL("out of space for reg expr %.10s...", lastre);
1272: if (cc->cc_func(i)) {
1.32 millert 1273: /* escape backslash */
1274: if (i == '\\') {
1275: *bp++ = '\\';
1276: n++;
1277: }
1278:
1.12 millert 1279: *bp++ = i;
1280: n++;
1281: }
1282: }
1.11 millert 1283: } else
1284: *bp++ = c;
1.23 millert 1285: } else if (c == '[' && *prestr == '.') {
1286: char collate_char;
1287: prestr++;
1288: collate_char = *prestr++;
1289: if (*prestr == '.' && prestr[1] == ']') {
1290: prestr += 2;
1291: /* Found it: map via locale TBD: for
1292: now, simply return this char. This
1293: is sufficient to pass conformance
1294: test awk.ex 156
1295: */
1296: if (*prestr == ']') {
1297: prestr++;
1298: rlxval = collate_char;
1299: return CHAR;
1300: }
1301: }
1302: } else if (c == '[' && *prestr == '=') {
1303: char equiv_char;
1304: prestr++;
1305: equiv_char = *prestr++;
1306: if (*prestr == '=' && prestr[1] == ']') {
1307: prestr += 2;
1308: /* Found it: map via locale TBD: for now
1309: simply return this char. This is
1310: sufficient to pass conformance test
1311: awk.ex 156
1312: */
1313: if (*prestr == ']') {
1314: prestr++;
1315: rlxval = equiv_char;
1316: return CHAR;
1317: }
1318: }
1.1 tholo 1319: } else if (c == '\0') {
1.8 millert 1320: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1321: } else if (bp == buf) { /* 1st char is special */
1322: *bp++ = c;
1.1 tholo 1323: } else if (c == ']') {
1.5 kstailey 1324: *bp++ = 0;
1.10 millert 1325: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1326: if (cflag == 0)
1327: return CCL;
1328: else
1329: return NCCL;
1330: } else
1.5 kstailey 1331: *bp++ = c;
1.1 tholo 1332: }
1.23 millert 1333: break;
1334: case '{':
1335: if (isdigit(*(prestr))) {
1336: num = 0; /* Process as a repetition */
1337: n = -1; m = -1;
1.29 millert 1338: commafound = false;
1339: digitfound = false;
1.23 millert 1340: startreptok = prestr-1;
1341: /* Remember start of previous atom here ? */
1342: } else { /* just a { char, not a repetition */
1343: rlxval = c;
1344: return CHAR;
1345: }
1346: for (; ; ) {
1347: if ((c = *prestr++) == '}') {
1348: if (commafound) {
1349: if (digitfound) { /* {n,m} */
1350: m = num;
1.30 millert 1351: if (m < n)
1.23 millert 1352: FATAL("illegal repetition expression: class %.20s",
1353: lastre);
1.30 millert 1354: if (n == 0 && m == 1) {
1.23 millert 1355: return QUEST;
1356: }
1357: } else { /* {n,} */
1.30 millert 1358: if (n == 0)
1359: return STAR;
1360: else if (n == 1)
1361: return PLUS;
1.23 millert 1362: }
1363: } else {
1364: if (digitfound) { /* {n} same as {n,n} */
1365: n = num;
1366: m = num;
1367: } else { /* {} */
1368: FATAL("illegal repetition expression: class %.20s",
1369: lastre);
1370: }
1371: }
1372: if (repeat(starttok, prestr-starttok, lastatom,
1373: startreptok - lastatom, n, m) > 0) {
1.30 millert 1374: if (n == 0 && m == 0) {
1.31 millert 1375: return ZERO;
1.23 millert 1376: }
1377: /* must rescan input for next token */
1378: goto rescan;
1379: }
1380: /* Failed to replace: eat up {...} characters
1381: and treat like just PLUS */
1382: return PLUS;
1383: } else if (c == '\0') {
1384: FATAL("nonterminated character class %.20s",
1385: lastre);
1386: } else if (isdigit(c)) {
1387: num = 10 * num + c - '0';
1.29 millert 1388: digitfound = true;
1.23 millert 1389: } else if (c == ',') {
1390: if (commafound)
1391: FATAL("illegal repetition expression: class %.20s",
1392: lastre);
1393: /* looking for {n,} or {n,m} */
1.29 millert 1394: commafound = true;
1.23 millert 1395: n = num;
1.29 millert 1396: digitfound = false; /* reset */
1.23 millert 1397: num = 0;
1398: } else {
1399: FATAL("illegal repetition expression: class %.20s",
1400: lastre);
1401: }
1402: }
1403: break;
1.1 tholo 1404: }
1405: }
1406:
1407: int cgoto(fa *f, int s, int c)
1408: {
1.27 millert 1409: int *p, *q;
1.1 tholo 1410: int i, j, k;
1411:
1.38 millert 1412: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1413: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1414: resizesetvec(__func__);
1.1 tholo 1415: }
1416: for (i = 0; i <= f->accept; i++)
1417: setvec[i] = 0;
1418: setcnt = 0;
1.27 millert 1419: resize_state(f, s);
1.1 tholo 1420: /* compute positions of gototab[s,c] into setvec */
1421: p = f->posns[s];
1422: for (i = 1; i <= *p; i++) {
1423: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1424: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1425: || (k == DOT && c != 0 && c != HAT)
1426: || (k == ALL && c != 0)
1.15 millert 1427: || (k == EMPTYRE && c != 0)
1.38 millert 1428: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1429: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1430: q = f->re[p[i]].lfollow;
1431: for (j = 1; j <= *q; j++) {
1432: if (q[j] >= maxsetvec) {
1.27 millert 1433: resizesetvec(__func__);
1.1 tholo 1434: }
1435: if (setvec[q[j]] == 0) {
1436: setcnt++;
1437: setvec[q[j]] = 1;
1438: }
1439: }
1440: }
1441: }
1442: }
1443: /* determine if setvec is a previous state */
1444: tmpset[0] = setcnt;
1445: j = 1;
1446: for (i = f->accept; i >= 0; i--)
1447: if (setvec[i]) {
1448: tmpset[j++] = i;
1449: }
1.27 millert 1450: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1451: /* tmpset == previous state? */
1452: for (i = 1; i <= f->curstat; i++) {
1453: p = f->posns[i];
1454: if ((k = tmpset[0]) != p[0])
1455: goto different;
1456: for (j = 1; j <= k; j++)
1457: if (tmpset[j] != p[j])
1458: goto different;
1459: /* setvec is state i */
1.30 millert 1460: if (c != HAT)
1.38 millert 1461: set_gototab(f, s, c, i);
1.1 tholo 1462: return i;
1463: different:;
1464: }
1465:
1466: /* add tmpset to current set of states */
1.27 millert 1467: ++(f->curstat);
1468: resize_state(f, f->curstat);
1.1 tholo 1469: for (i = 0; i < NCHARS; i++)
1.38 millert 1470: set_gototab(f, f->curstat, 0, 0);
1.1 tholo 1471: xfree(f->posns[f->curstat]);
1.27 millert 1472: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1473:
1474: f->posns[f->curstat] = p;
1.30 millert 1475: if (c != HAT)
1.38 millert 1476: set_gototab(f, s, c, f->curstat);
1.1 tholo 1477: for (i = 0; i <= setcnt; i++)
1478: p[i] = tmpset[i];
1479: if (setvec[f->accept])
1480: f->out[f->curstat] = 1;
1481: else
1482: f->out[f->curstat] = 0;
1483: return f->curstat;
1484: }
1485:
1486:
1487: void freefa(fa *f) /* free a finite automaton */
1488: {
1489: int i;
1490:
1491: if (f == NULL)
1492: return;
1.27 millert 1493: for (i = 0; i < f->state_count; i++)
1494: xfree(f->gototab[i])
1.1 tholo 1495: for (i = 0; i <= f->curstat; i++)
1496: xfree(f->posns[i]);
1497: for (i = 0; i <= f->accept; i++) {
1498: xfree(f->re[i].lfollow);
1499: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1500: xfree(f->re[i].lval.np);
1.1 tholo 1501: }
1502: xfree(f->restr);
1.27 millert 1503: xfree(f->out);
1504: xfree(f->posns);
1505: xfree(f->gototab);
1.1 tholo 1506: xfree(f);
1507: }