Annotation of src/usr.bin/awk/b.c, Revision 1.43
1.43 ! millert 1: /* $OpenBSD: b.c,v 1.42 2023/09/21 17:19:06 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.43 ! millert 84: extern int u8_nextlen(const char *s);
! 85:
1.38 millert 86:
87: /* utf-8 mechanism:
88:
89: For most of Awk, utf-8 strings just "work", since they look like
90: null-terminated sequences of 8-bit bytes.
91:
92: Functions like length(), index(), and substr() have to operate
93: in units of utf-8 characters. The u8_* functions in run.c
94: handle this.
95:
96: Regular expressions are more complicated, since the basic
97: mechanism of the goto table used 8-bit byte indices into the
98: gototab entries to compute the next state. Unicode is a lot
99: bigger, so the gototab entries are now structs with a character
100: and a next state, and there is a linear search of the characters
101: to find the state. (Yes, this is slower, by a significant
102: amount. Tough.)
103:
104: Throughout the RE mechanism in b.c, utf-8 characters are
105: converted to their utf-32 value. This mostly shows up in
106: cclenter, which expands character class ranges like a-z and now
107: alpha-omega. The size of a gototab array is still about 256.
108: This should be dynamic, but for now things work ok for a single
109: code page of Unicode, which is the most likely case.
110:
111: The code changes are localized in run.c and b.c. I have added a
112: handful of functions to somewhat better hide the implementation,
113: but a lot more could be done.
114:
115: */
116:
117: static int get_gototab(fa*, int, int);
118: static int set_gototab(fa*, int, int, int);
119: extern int u8_rune(int *, const uschar *);
120:
1.27 millert 121: static int *
122: intalloc(size_t n, const char *f)
123: {
1.35 millert 124: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 125: if (p == NULL)
126: overflo(f);
127: return p;
128: }
129:
130: static void
131: allocsetvec(const char *f)
132: {
133: maxsetvec = MAXLIN;
1.35 millert 134: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
135: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 136: if (setvec == NULL || tmpset == NULL)
137: overflo(f);
138: }
139:
140: static void
141: resizesetvec(const char *f)
142: {
1.35 millert 143: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
144: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 145: if (setvec == NULL || tmpset == NULL)
146: overflo(f);
147: maxsetvec *= 4;
148: }
149:
150: static void
151: resize_state(fa *f, int state)
152: {
1.38 millert 153: gtt **p;
1.35 millert 154: uschar *p2;
155: int **p3;
1.27 millert 156: int i, new_count;
157:
158: if (++state < f->state_count)
159: return;
160:
161: new_count = state + 10; /* needs to be tuned */
162:
1.38 millert 163: p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 164: if (p == NULL)
165: goto out;
166: f->gototab = p;
167:
1.35 millert 168: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
169: if (p2 == NULL)
1.27 millert 170: goto out;
1.35 millert 171: f->out = p2;
1.27 millert 172:
1.35 millert 173: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
174: if (p3 == NULL)
1.27 millert 175: goto out;
1.35 millert 176: f->posns = p3;
1.27 millert 177:
178: for (i = f->state_count; i < new_count; ++i) {
1.38 millert 179: f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 180: if (f->gototab[i] == NULL)
181: goto out;
182: f->out[i] = 0;
183: f->posns[i] = NULL;
184: }
1.38 millert 185: f->gototab_len = NCHARS; /* should be variable, growable */
1.27 millert 186: f->state_count = new_count;
187: return;
188: out:
189: overflo(__func__);
190: }
191:
1.29 millert 192: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 193: {
194: int i, use, nuse;
195: fa *pfa;
1.6 millert 196: static int now = 1;
1.1 tholo 197:
1.25 millert 198: if (setvec == NULL) { /* first time through any RE */
1.27 millert 199: allocsetvec(__func__);
1.1 tholo 200: }
201:
1.29 millert 202: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 203: return mkdfa(s, anchor);
204: for (i = 0; i < nfatab; i++) /* is it there already? */
205: if (fatab[i]->anchor == anchor
1.11 millert 206: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 207: fatab[i]->use = now++;
1.1 tholo 208: return fatab[i];
1.10 millert 209: }
1.1 tholo 210: pfa = mkdfa(s, anchor);
211: if (nfatab < NFA) { /* room for another */
212: fatab[nfatab] = pfa;
1.6 millert 213: fatab[nfatab]->use = now++;
1.1 tholo 214: nfatab++;
215: return pfa;
216: }
217: use = fatab[0]->use; /* replace least-recently used */
218: nuse = 0;
219: for (i = 1; i < nfatab; i++)
220: if (fatab[i]->use < use) {
221: use = fatab[i]->use;
222: nuse = i;
223: }
224: freefa(fatab[nuse]);
225: fatab[nuse] = pfa;
1.6 millert 226: pfa->use = now++;
1.1 tholo 227: return pfa;
228: }
229:
1.29 millert 230: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 231: /* anchor = true for anchored matches, else false */
1.1 tholo 232: {
233: Node *p, *p1;
234: fa *f;
235:
1.28 millert 236: firstbasestr = (const uschar *) s;
1.23 millert 237: basestr = firstbasestr;
1.1 tholo 238: p = reparse(s);
239: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
240: /* put ALL STAR in front of reg. exp. */
241: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
242: /* put FINAL after reg. exp. */
243:
244: poscnt = 0;
245: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 246: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 247: overflo(__func__);
1.1 tholo 248: f->accept = poscnt-1; /* penter has computed number of positions in re */
249: cfoll(f, p1); /* set up follow sets */
250: freetr(p1);
1.27 millert 251: resize_state(f, 1);
252: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
253: f->posns[1] = intalloc(1, __func__);
1.1 tholo 254: *f->posns[1] = 0;
255: f->initstat = makeinit(f, anchor);
256: f->anchor = anchor;
1.10 millert 257: f->restr = (uschar *) tostring(s);
1.23 millert 258: if (firstbasestr != basestr) {
259: if (basestr)
260: xfree(basestr);
261: }
1.1 tholo 262: return f;
263: }
264:
1.29 millert 265: int makeinit(fa *f, bool anchor)
1.1 tholo 266: {
267: int i, k;
268:
269: f->curstat = 2;
270: f->out[2] = 0;
271: k = *(f->re[0].lfollow);
1.25 millert 272: xfree(f->posns[2]);
1.27 millert 273: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 274: for (i = 0; i <= k; i++) {
1.1 tholo 275: (f->posns[2])[i] = (f->re[0].lfollow)[i];
276: }
277: if ((f->posns[2])[1] == f->accept)
278: f->out[2] = 1;
1.30 millert 279: for (i = 0; i < NCHARS; i++)
1.38 millert 280: set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
1.1 tholo 281: f->curstat = cgoto(f, 2, HAT);
282: if (anchor) {
283: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 284: for (i = 0; i < k; i++) {
1.1 tholo 285: (f->posns[0])[i] = (f->posns[2])[i];
286: }
287:
288: f->out[0] = f->out[2];
289: if (f->curstat != 2)
290: --(*f->posns[f->curstat]);
291: }
292: return f->curstat;
293: }
294:
295: void penter(Node *p) /* set up parent pointers and leaf indices */
296: {
297: switch (type(p)) {
1.15 millert 298: ELEAF
1.1 tholo 299: LEAF
1.7 millert 300: info(p) = poscnt;
1.1 tholo 301: poscnt++;
302: break;
303: UNARY
304: penter(left(p));
305: parent(left(p)) = p;
306: break;
307: case CAT:
308: case OR:
309: penter(left(p));
310: penter(right(p));
311: parent(left(p)) = p;
312: parent(right(p)) = p;
313: break;
1.31 millert 314: case ZERO:
315: break;
1.1 tholo 316: default: /* can't happen */
1.8 millert 317: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 318: break;
319: }
320: }
321:
322: void freetr(Node *p) /* free parse tree */
323: {
324: switch (type(p)) {
1.15 millert 325: ELEAF
1.1 tholo 326: LEAF
327: xfree(p);
328: break;
329: UNARY
1.31 millert 330: case ZERO:
1.1 tholo 331: freetr(left(p));
332: xfree(p);
333: break;
334: case CAT:
335: case OR:
336: freetr(left(p));
337: freetr(right(p));
338: xfree(p);
339: break;
340: default: /* can't happen */
1.8 millert 341: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 342: break;
343: }
344: }
345:
346: /* in the parsing of regular expressions, metacharacters like . have */
347: /* to be seen literally; \056 is not a metacharacter. */
348:
1.38 millert 349: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 350: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 351: const uschar *p;
1.1 tholo 352: int n = 0;
1.2 millert 353: int i;
1.1 tholo 354:
1.38 millert 355: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 356: if (isdigit(*p))
357: n = 16 * n + *p - '0';
358: else if (*p >= 'a' && *p <= 'f')
359: n = 16 * n + *p - 'a' + 10;
360: else if (*p >= 'A' && *p <= 'F')
361: n = 16 * n + *p - 'A' + 10;
362: }
1.28 millert 363: *pp = p;
1.1 tholo 364: return n;
365: }
366:
1.2 millert 367: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 368:
1.28 millert 369: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 370: /* and increment *pp */
371: {
1.28 millert 372: const uschar *p = *pp;
1.1 tholo 373: int c;
374:
1.38 millert 375: /* BUG: should advance by utf-8 char even if makes no sense */
376:
377: if ((c = *p++) == 't') {
1.1 tholo 378: c = '\t';
1.38 millert 379: } else if (c == 'n') {
1.1 tholo 380: c = '\n';
1.38 millert 381: } else if (c == 'f') {
1.1 tholo 382: c = '\f';
1.38 millert 383: } else if (c == 'r') {
1.1 tholo 384: c = '\r';
1.38 millert 385: } else if (c == 'b') {
1.1 tholo 386: c = '\b';
1.38 millert 387: } else if (c == 'v') {
1.25 millert 388: c = '\v';
1.38 millert 389: } else if (c == 'a') {
1.25 millert 390: c = '\a';
1.38 millert 391: } else if (c == '\\') {
1.1 tholo 392: c = '\\';
1.38 millert 393: } else if (c == 'x') { /* 2 hex digits follow */
394: c = hexstr(&p, 2); /* this adds a null if number is invalid */
395: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
396: c = hexstr(&p, 8);
1.1 tholo 397: } else if (isoctdigit(c)) { /* \d \dd \ddd */
398: int n = c - '0';
399: if (isoctdigit(*p)) {
400: n = 8 * n + *p++ - '0';
401: if (isoctdigit(*p))
402: n = 8 * n + *p++ - '0';
403: }
404: c = n;
405: } /* else */
406: /* c = c; */
407: *pp = p;
408: return c;
409: }
410:
1.38 millert 411: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 412: {
413: int i, c, c2;
1.38 millert 414: int n;
415: const uschar *p = (const uschar *) argp;
416: int *bp, *retp;
417: static int *buf = NULL;
1.5 kstailey 418: static int bufsz = 100;
1.1 tholo 419:
1.38 millert 420: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 421: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 422: bp = buf;
1.38 millert 423: for (i = 0; *p != 0; ) {
424: n = u8_rune(&c, p);
425: p += n;
1.1 tholo 426: if (c == '\\') {
1.17 millert 427: c = quoted(&p);
1.5 kstailey 428: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 429: if (*p != 0) {
1.5 kstailey 430: c = bp[-1];
1.38 millert 431: /* c2 = *p++; */
432: n = u8_rune(&c2, p);
433: p += n;
1.1 tholo 434: if (c2 == '\\')
1.38 millert 435: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 436: if (c > c2) { /* empty; ignore */
1.5 kstailey 437: bp--;
1.1 tholo 438: i--;
439: continue;
440: }
441: while (c < c2) {
1.38 millert 442: if (i >= bufsz) {
443: buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
444: if (buf == NULL)
445: FATAL("out of space for character class [%.10s...] 2", p);
446: bufsz *= 2;
447: bp = buf + i;
448: }
1.5 kstailey 449: *bp++ = ++c;
1.1 tholo 450: i++;
451: }
452: continue;
453: }
454: }
1.38 millert 455: if (i >= bufsz) {
456: buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
457: if (buf == NULL)
458: FATAL("out of space for character class [%.10s...] 2", p);
459: bufsz *= 2;
460: bp = buf + i;
461: }
1.5 kstailey 462: *bp++ = c;
1.1 tholo 463: i++;
464: }
1.5 kstailey 465: *bp = 0;
1.38 millert 466: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
467: /* xfree(op); BUG: what are we freeing here? */
468: retp = (int *) calloc(bp-buf+1, sizeof(int));
469: for (i = 0; i < bp-buf+1; i++)
470: retp[i] = buf[i];
471: return retp;
1.1 tholo 472: }
473:
1.11 millert 474: void overflo(const char *s)
1.1 tholo 475: {
1.27 millert 476: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 477: }
478:
479: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
480: {
481: int i;
1.4 millert 482: int *p;
1.1 tholo 483:
484: switch (type(v)) {
1.15 millert 485: ELEAF
1.1 tholo 486: LEAF
1.7 millert 487: f->re[info(v)].ltype = type(v);
488: f->re[info(v)].lval.np = right(v);
1.1 tholo 489: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 490: resizesetvec(__func__);
1.1 tholo 491: }
492: for (i = 0; i <= f->accept; i++)
493: setvec[i] = 0;
494: setcnt = 0;
495: follow(v); /* computes setvec and setcnt */
1.27 millert 496: p = intalloc(setcnt + 1, __func__);
1.7 millert 497: f->re[info(v)].lfollow = p;
1.1 tholo 498: *p = setcnt;
499: for (i = f->accept; i >= 0; i--)
500: if (setvec[i] == 1)
501: *++p = i;
502: break;
503: UNARY
504: cfoll(f,left(v));
505: break;
506: case CAT:
507: case OR:
508: cfoll(f,left(v));
509: cfoll(f,right(v));
510: break;
1.31 millert 511: case ZERO:
512: break;
1.1 tholo 513: default: /* can't happen */
1.8 millert 514: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 515: }
516: }
517:
518: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 519: /* returns 0 if p matches empty string */
1.1 tholo 520: {
1.4 millert 521: int b, lp;
1.1 tholo 522:
523: switch (type(p)) {
1.15 millert 524: ELEAF
1.1 tholo 525: LEAF
1.7 millert 526: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 527: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 528: resizesetvec(__func__);
1.1 tholo 529: }
1.15 millert 530: if (type(p) == EMPTYRE) {
531: setvec[lp] = 0;
532: return(0);
533: }
1.1 tholo 534: if (setvec[lp] != 1) {
535: setvec[lp] = 1;
536: setcnt++;
537: }
1.41 millert 538: if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1 tholo 539: return(0); /* empty CCL */
1.30 millert 540: return(1);
1.1 tholo 541: case PLUS:
1.30 millert 542: if (first(left(p)) == 0)
543: return(0);
1.1 tholo 544: return(1);
545: case STAR:
546: case QUEST:
547: first(left(p));
548: return(0);
549: case CAT:
550: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
551: return(1);
552: case OR:
553: b = first(right(p));
554: if (first(left(p)) == 0 || b == 0) return(0);
555: return(1);
1.31 millert 556: case ZERO:
557: return 0;
1.1 tholo 558: }
1.8 millert 559: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 560: return(-1);
561: }
562:
563: void follow(Node *v) /* collects leaves that can follow v into setvec */
564: {
565: Node *p;
566:
567: if (type(v) == FINAL)
568: return;
569: p = parent(v);
570: switch (type(p)) {
571: case STAR:
572: case PLUS:
573: first(v);
574: follow(p);
575: return;
576:
577: case OR:
578: case QUEST:
579: follow(p);
580: return;
581:
582: case CAT:
583: if (v == left(p)) { /* v is left child of p */
584: if (first(right(p)) == 0) {
585: follow(p);
586: return;
587: }
588: } else /* v is right child */
589: follow(p);
590: return;
591: }
592: }
593:
1.38 millert 594: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 595: {
1.38 millert 596: int *s = (int *) sarg;
1.10 millert 597:
1.1 tholo 598: while (*s)
599: if (c == *s++)
600: return(1);
601: return(0);
602: }
603:
1.38 millert 604: static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
605: {
606: int i;
607: for (i = 0; i < f->gototab_len; i++) {
608: if (f->gototab[state][i].ch == 0)
609: break;
610: if (f->gototab[state][i].ch == ch)
611: return f->gototab[state][i].state;
612: }
613: return 0;
614: }
615:
616: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
617: {
618: int i;
619: for (i = 0; i < f->gototab_len; i++) {
620: if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
621: f->gototab[state][i].ch = ch;
622: f->gototab[state][i].state = val;
623: return val;
624: }
625: }
626: overflo(__func__);
627: return val; /* not used anywhere at the moment */
628: }
629:
1.11 millert 630: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 631: {
632: int s, ns;
1.38 millert 633: int n;
634: int rune;
1.28 millert 635: const uschar *p = (const uschar *) p0;
1.1 tholo 636:
1.38 millert 637: /* return pmatch(f, p0); does it matter whether longest or shortest? */
638:
1.27 millert 639: s = f->initstat;
640: assert (s < f->state_count);
641:
1.1 tholo 642: if (f->out[s])
643: return(1);
644: do {
1.15 millert 645: /* assert(*p < NCHARS); */
1.38 millert 646: n = u8_rune(&rune, p);
647: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 648: s = ns;
649: else
1.38 millert 650: s = cgoto(f, s, rune);
1.1 tholo 651: if (f->out[s])
652: return(1);
1.38 millert 653: if (*p == 0)
654: break;
655: p += n;
656: } while (1); /* was *p++ != 0 */
1.1 tholo 657: return(0);
658: }
659:
1.11 millert 660: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 661: {
662: int s, ns;
1.38 millert 663: int n;
664: int rune;
1.28 millert 665: const uschar *p = (const uschar *) p0;
666: const uschar *q;
1.1 tholo 667:
1.27 millert 668: s = f->initstat;
669: assert(s < f->state_count);
670:
1.28 millert 671: patbeg = (const char *)p;
1.1 tholo 672: patlen = -1;
673: do {
674: q = p;
675: do {
676: if (f->out[s]) /* final state */
677: patlen = q-p;
1.15 millert 678: /* assert(*q < NCHARS); */
1.38 millert 679: n = u8_rune(&rune, q);
680: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 681: s = ns;
682: else
1.38 millert 683: s = cgoto(f, s, rune);
1.27 millert 684:
685: assert(s < f->state_count);
686:
1.9 deraadt 687: if (s == 1) { /* no transition */
1.1 tholo 688: if (patlen >= 0) {
1.28 millert 689: patbeg = (const char *) p;
1.1 tholo 690: return(1);
691: }
692: else
693: goto nextin; /* no match */
1.9 deraadt 694: }
1.38 millert 695: if (*q == 0)
696: break;
697: q += n;
698: } while (1);
699: q++; /* was *q++ */
1.1 tholo 700: if (f->out[s])
701: patlen = q-p-1; /* don't count $ */
702: if (patlen >= 0) {
1.28 millert 703: patbeg = (const char *) p;
1.1 tholo 704: return(1);
705: }
706: nextin:
707: s = 2;
1.38 millert 708: if (*p == 0)
709: break;
710: n = u8_rune(&rune, p);
711: p += n;
712: } while (1); /* was *p++ */
1.1 tholo 713: return (0);
714: }
715:
1.11 millert 716: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 717: {
718: int s, ns;
1.38 millert 719: int n;
720: int rune;
1.28 millert 721: const uschar *p = (const uschar *) p0;
722: const uschar *q;
1.1 tholo 723:
1.27 millert 724: s = f->initstat;
725: assert(s < f->state_count);
726:
1.28 millert 727: patbeg = (const char *)p;
1.1 tholo 728: patlen = -1;
729: while (*p) {
730: q = p;
731: do {
732: if (f->out[s]) /* final state */
733: patlen = q-p;
1.15 millert 734: /* assert(*q < NCHARS); */
1.38 millert 735: n = u8_rune(&rune, q);
736: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 737: s = ns;
738: else
1.38 millert 739: s = cgoto(f, s, rune);
1.9 deraadt 740: if (s == 1) { /* no transition */
1.1 tholo 741: if (patlen > 0) {
1.28 millert 742: patbeg = (const char *) p;
1.1 tholo 743: return(1);
744: } else
745: goto nnextin; /* no nonempty match */
1.9 deraadt 746: }
1.38 millert 747: if (*q == 0)
748: break;
749: q += n;
750: } while (1);
751: q++;
1.1 tholo 752: if (f->out[s])
753: patlen = q-p-1; /* don't count $ */
754: if (patlen > 0 ) {
1.28 millert 755: patbeg = (const char *) p;
1.1 tholo 756: return(1);
757: }
758: nnextin:
759: s = 2;
760: p++;
761: }
762: return (0);
763: }
764:
1.43 ! millert 765:
! 766: #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
! 767:
! 768: // Read one rune at a time from the given FILE*. Return both
! 769: // the bytes and the actual rune.
! 770:
! 771: struct runedata {
! 772: int rune;
! 773: size_t len;
! 774: char bytes[6];
! 775: };
! 776:
! 777: struct runedata getrune(FILE *fp)
1.38 millert 778: {
1.43 ! millert 779: struct runedata result;
! 780: int c, i, next;
! 781:
! 782: memset(&result, 0, sizeof(result));
! 783:
! 784: c = getc(fp);
! 785: if (c == EOF)
! 786: return result; // result.rune == 0 --> EOF
! 787: else if (c < 128 || awk_mb_cur_max == 1) {
! 788: result.bytes[0] = c;
! 789: result.len = 1;
! 790: result.rune = c;
! 791:
! 792: return result;
! 793: }
! 794:
! 795: // need to get bytes and fill things in
! 796: result.bytes[0] = c;
! 797: result.len = 1;
! 798:
! 799: next = 1;
! 800: for (i = 1; i < MAX_UTF_BYTES; i++) {
! 801: c = getc(fp);
! 802: if (c == EOF)
1.38 millert 803: break;
1.43 ! millert 804: result.bytes[next++] = c;
! 805: result.len++;
! 806: }
! 807:
! 808: // put back any extra input bytes
! 809: int actual_len = u8_nextlen(result.bytes);
! 810: while (result.len > actual_len) {
! 811: ungetc(result.bytes[--result.len], fp);
1.38 millert 812: }
813:
1.43 ! millert 814: result.bytes[result.len] = '\0';
! 815: (void) u8_rune(& result.rune, (uschar *) result.bytes);
1.38 millert 816:
1.43 ! millert 817: return result;
1.38 millert 818: }
819:
1.26 millert 820:
821: /*
822: * NAME
823: * fnematch
824: *
825: * DESCRIPTION
826: * A stream-fed version of nematch which transfers characters to a
827: * null-terminated buffer. All characters up to and including the last
828: * character of the matching text or EOF are placed in the buffer. If
829: * a match is found, patbeg and patlen are set appropriately.
830: *
831: * RETURN VALUES
1.29 millert 832: * false No match found.
833: * true Match found.
1.26 millert 834: */
835:
1.29 millert 836: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 837: {
838: char *buf = *pbuf;
839: int bufsize = *pbufsize;
1.43 ! millert 840: int i, j, k, ns, s;
! 841: struct runedata r;
1.26 millert 842:
843: s = pfa->initstat;
844: patlen = 0;
845:
846: /*
847: * All indices relative to buf.
848: * i <= j <= k <= bufsize
849: *
1.43 ! millert 850: * i: origin of active substring (first byte of first character)
! 851: * j: current character (last byte of current character)
1.26 millert 852: * k: destination of next getc()
853: */
854: i = -1, k = 0;
855: do {
856: j = i++;
857: do {
1.43 ! millert 858: r = getrune(f);
! 859: if ((++j + r.len) >= k) {
! 860: if (k >= bufsize)
1.26 millert 861: if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
862: FATAL("stream '%.30s...' too long", buf);
1.38 millert 863: }
1.43 ! millert 864: memcpy(buf + k, r.bytes, r.len);
! 865: j += r.len - 1; // incremented next time around the loop
! 866: k += r.len;
1.26 millert 867:
1.43 ! millert 868: if ((ns = get_gototab(pfa, s, r.rune)) != 0)
1.26 millert 869: s = ns;
870: else
1.43 ! millert 871: s = cgoto(pfa, s, r.rune);
1.26 millert 872:
873: if (pfa->out[s]) { /* final state */
874: patlen = j - i + 1;
1.43 ! millert 875: if (r.rune == 0) /* don't count $ */
1.26 millert 876: patlen--;
877: }
878: } while (buf[j] && s != 1);
879: s = 2;
1.43 ! millert 880: if (r.len > 1)
! 881: i += r.len - 1; // i incremented around the loop
1.26 millert 882: } while (buf[i] && !patlen);
883:
884: /* adjbuf() may have relocated a resized buffer. Inform the world. */
885: *pbuf = buf;
886: *pbufsize = bufsize;
887:
888: if (patlen) {
889: patbeg = buf + i;
890: /*
891: * Under no circumstances is the last character fed to
892: * the automaton part of the match. It is EOF's nullbyte,
893: * or it sent the automaton into a state with no further
894: * transitions available (s==1), or both. Room for a
895: * terminating nullbyte is guaranteed.
896: *
897: * ungetc any chars after the end of matching text
898: * (except for EOF's nullbyte, if present) and null
899: * terminate the buffer.
900: */
1.43 ! millert 901: do {
! 902: int ii;
! 903: for (ii = r.len; ii > 0; ii--)
! 904: if (buf[--k] && ungetc(buf[k], f) == EOF)
! 905: FATAL("unable to ungetc '%c'", buf[k]);
! 906: } while (k > i + patlen);
1.30 millert 907: buf[k] = '\0';
1.29 millert 908: return true;
1.26 millert 909: }
910: else
1.29 millert 911: return false;
1.26 millert 912: }
913:
1.11 millert 914: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 915: { /* uses relex() to scan regular expression */
916: Node *np;
917:
1.32 millert 918: DPRINTF("reparse <%s>\n", p);
1.28 millert 919: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 920: rtok = relex();
1.11 millert 921: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 922: if (rtok == '\0') {
1.11 millert 923: /* FATAL("empty regular expression"); previous */
1.15 millert 924: return(op2(EMPTYRE, NIL, NIL));
925: }
1.1 tholo 926: np = regexp();
927: if (rtok != '\0')
1.8 millert 928: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 929: return(np);
930: }
931:
932: Node *regexp(void) /* top-level parse of reg expr */
933: {
934: return (alt(concat(primary())));
935: }
936:
937: Node *primary(void)
938: {
939: Node *np;
1.23 millert 940: int savelastatom;
1.1 tholo 941:
942: switch (rtok) {
943: case CHAR:
1.23 millert 944: lastatom = starttok;
1.7 millert 945: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 946: rtok = relex();
947: return (unary(np));
948: case ALL:
949: rtok = relex();
950: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 951: case EMPTYRE:
952: rtok = relex();
1.23 millert 953: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 954: case DOT:
1.23 millert 955: lastatom = starttok;
1.1 tholo 956: rtok = relex();
957: return (unary(op2(DOT, NIL, NIL)));
958: case CCL:
1.28 millert 959: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 960: lastatom = starttok;
1.1 tholo 961: rtok = relex();
962: return (unary(np));
963: case NCCL:
1.28 millert 964: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 965: lastatom = starttok;
1.1 tholo 966: rtok = relex();
967: return (unary(np));
968: case '^':
969: rtok = relex();
1.7 millert 970: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 971: case '$':
972: rtok = relex();
973: return (unary(op2(CHAR, NIL, NIL)));
974: case '(':
1.23 millert 975: lastatom = starttok;
976: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 977: rtok = relex();
978: if (rtok == ')') { /* special pleading for () */
979: rtok = relex();
1.42 millert 980: return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1 tholo 981: }
982: np = regexp();
983: if (rtok == ')') {
1.23 millert 984: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 985: rtok = relex();
986: return (unary(np));
987: }
988: else
1.8 millert 989: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 990: default:
1.8 millert 991: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 992: }
993: return 0; /*NOTREACHED*/
994: }
995:
996: Node *concat(Node *np)
997: {
998: switch (rtok) {
1.23 millert 999: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 1000: return (concat(op2(CAT, np, primary())));
1.23 millert 1001: case EMPTYRE:
1002: rtok = relex();
1.42 millert 1003: return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23 millert 1004: primary())));
1.1 tholo 1005: }
1006: return (np);
1007: }
1008:
1009: Node *alt(Node *np)
1010: {
1011: if (rtok == OR) {
1012: rtok = relex();
1013: return (alt(op2(OR, np, concat(primary()))));
1014: }
1015: return (np);
1016: }
1017:
1018: Node *unary(Node *np)
1019: {
1020: switch (rtok) {
1021: case STAR:
1022: rtok = relex();
1023: return (unary(op2(STAR, np, NIL)));
1024: case PLUS:
1025: rtok = relex();
1026: return (unary(op2(PLUS, np, NIL)));
1027: case QUEST:
1028: rtok = relex();
1029: return (unary(op2(QUEST, np, NIL)));
1.31 millert 1030: case ZERO:
1031: rtok = relex();
1032: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1033: default:
1034: return (np);
1035: }
1036: }
1037:
1.11 millert 1038: /*
1039: * Character class definitions conformant to the POSIX locale as
1040: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1041: * and operating character sets are both ASCII (ISO646) or supersets
1042: * thereof.
1043: *
1044: * Note that to avoid overflowing the temporary buffer used in
1045: * relex(), the expanded character class (prior to range expansion)
1046: * must be less than twice the size of their full name.
1047: */
1.12 millert 1048:
1049: /* Because isblank doesn't show up in any of the header files on any
1050: * system i use, it's defined here. if some other locale has a richer
1051: * definition of "blank", define HAS_ISBLANK and provide your own
1052: * version.
1053: * the parentheses here are an attempt to find a path through the maze
1054: * of macro definition and/or function and/or version provided. thanks
1055: * to nelson beebe for the suggestion; let's see if it works everywhere.
1056: */
1057:
1058: #ifndef HAS_ISBLANK
1059:
1.17 millert 1060: int (xisblank)(int c)
1.12 millert 1061: {
1062: return c==' ' || c=='\t';
1063: }
1064:
1065: #endif
1066:
1.31 millert 1067: static const struct charclass {
1.11 millert 1068: const char *cc_name;
1069: int cc_namelen;
1.12 millert 1070: int (*cc_func)(int);
1.11 millert 1071: } charclasses[] = {
1.12 millert 1072: { "alnum", 5, isalnum },
1073: { "alpha", 5, isalpha },
1.17 millert 1074: #ifndef HAS_ISBLANK
1.21 millert 1075: { "blank", 5, xisblank },
1.17 millert 1076: #else
1.12 millert 1077: { "blank", 5, isblank },
1.17 millert 1078: #endif
1.12 millert 1079: { "cntrl", 5, iscntrl },
1080: { "digit", 5, isdigit },
1081: { "graph", 5, isgraph },
1082: { "lower", 5, islower },
1083: { "print", 5, isprint },
1084: { "punct", 5, ispunct },
1085: { "space", 5, isspace },
1086: { "upper", 5, isupper },
1087: { "xdigit", 6, isxdigit },
1.11 millert 1088: { NULL, 0, NULL },
1089: };
1090:
1.23 millert 1091: #define REPEAT_SIMPLE 0
1092: #define REPEAT_PLUS_APPENDED 1
1093: #define REPEAT_WITH_Q 2
1094: #define REPEAT_ZERO 3
1095:
1096: static int
1097: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1098: int atomlen, int firstnum, int secondnum, int special_case)
1099: {
1100: int i, j;
1.26 millert 1101: uschar *buf = NULL;
1.23 millert 1102: int ret = 1;
1.31 millert 1103: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1104: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1105: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1106: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1107: int size = prefix_length + suffix_length;
1108:
1109: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1110: size += atomlen*(firstnum-1);
1111: }
1112:
1113: /* Adjust size of buffer for special cases */
1114: if (special_case == REPEAT_PLUS_APPENDED) {
1115: size++; /* for the final + */
1116: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1117: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1118: } else if (special_case == REPEAT_ZERO) {
1119: size += 2; /* just a null ERE: () */
1120: }
1.35 millert 1121: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1122: FATAL("out of space in reg expr %.10s..", lastre);
1123: memcpy(buf, basestr, prefix_length); /* copy prefix */
1124: j = prefix_length;
1125: if (special_case == REPEAT_ZERO) {
1126: j -= atomlen;
1127: buf[j++] = '(';
1128: buf[j++] = ')';
1129: }
1.30 millert 1130: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1131: memcpy(&buf[j], atom, atomlen);
1132: j += atomlen;
1133: }
1134: if (special_case == REPEAT_PLUS_APPENDED) {
1135: buf[j++] = '+';
1136: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1137: if (init_q)
1138: buf[j++] = '?';
1.30 millert 1139: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1140: memcpy(&buf[j], atom, atomlen);
1141: j += atomlen;
1142: buf[j++] = '?';
1143: }
1144: }
1145: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1146: j += suffix_length;
1147: buf[j] = '\0';
1.23 millert 1148: /* free old basestr */
1149: if (firstbasestr != basestr) {
1150: if (basestr)
1151: xfree(basestr);
1152: }
1153: basestr = buf;
1154: prestr = buf + prefix_length;
1155: if (special_case == REPEAT_ZERO) {
1156: prestr -= atomlen;
1157: ret++;
1158: }
1159: return ret;
1160: }
1161:
1162: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1163: int atomlen, int firstnum, int secondnum)
1164: {
1165: /*
1166: In general, the repetition specifier or "bound" is replaced here
1167: by an equivalent ERE string, repeating the immediately previous atom
1168: and appending ? and + as needed. Note that the first copy of the
1169: atom is left in place, except in the special_case of a zero-repeat
1170: (i.e., {0}).
1171: */
1172: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1173: if (firstnum < 2) {
1174: /* 0 or 1: should be handled before you get here */
1175: FATAL("internal error");
1176: } else {
1177: return replace_repeat(reptok, reptoklen, atom, atomlen,
1178: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1179: }
1180: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1181: if (firstnum == 0) { /* {0} or {0,0} */
1182: /* This case is unusual because the resulting
1183: replacement string might actually be SMALLER than
1184: the original ERE */
1185: return replace_repeat(reptok, reptoklen, atom, atomlen,
1186: firstnum, secondnum, REPEAT_ZERO);
1187: } else { /* (firstnum >= 1) */
1188: return replace_repeat(reptok, reptoklen, atom, atomlen,
1189: firstnum, secondnum, REPEAT_SIMPLE);
1190: }
1191: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1192: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1193: return replace_repeat(reptok, reptoklen, atom, atomlen,
1194: firstnum, secondnum, REPEAT_WITH_Q);
1195: } else { /* Error - shouldn't be here (n>m) */
1196: FATAL("internal error");
1197: }
1198: return 0;
1199: }
1.11 millert 1200:
1.38 millert 1201: extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1202:
1.1 tholo 1203: int relex(void) /* lexical analyzer for reparse */
1204: {
1.5 kstailey 1205: int c, n;
1.1 tholo 1206: int cflag;
1.25 millert 1207: static uschar *buf = NULL;
1.5 kstailey 1208: static int bufsz = 100;
1.10 millert 1209: uschar *bp;
1.31 millert 1210: const struct charclass *cc;
1.12 millert 1211: int i;
1.29 millert 1212: int num, m;
1213: bool commafound, digitfound;
1.23 millert 1214: const uschar *startreptok;
1.24 millert 1215: static int parens = 0;
1.23 millert 1216:
1217: rescan:
1218: starttok = prestr;
1.1 tholo 1219:
1.38 millert 1220: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1221: prestr += n;
1222: starttok = prestr;
1223: return CHAR;
1224: }
1225:
1.1 tholo 1226: switch (c = *prestr++) {
1227: case '|': return OR;
1228: case '*': return STAR;
1229: case '+': return PLUS;
1230: case '?': return QUEST;
1231: case '.': return DOT;
1232: case '\0': prestr--; return '\0';
1233: case '^':
1234: case '$':
1.24 millert 1235: return c;
1.1 tholo 1236: case '(':
1.24 millert 1237: parens++;
1238: return c;
1.1 tholo 1239: case ')':
1.24 millert 1240: if (parens) {
1241: parens--;
1242: return c;
1243: }
1244: /* unmatched close parenthesis; per POSIX, treat as literal */
1245: rlxval = c;
1246: return CHAR;
1.1 tholo 1247: case '\\':
1.17 millert 1248: rlxval = quoted(&prestr);
1.1 tholo 1249: return CHAR;
1250: default:
1251: rlxval = c;
1252: return CHAR;
1.25 millert 1253: case '[':
1.35 millert 1254: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1255: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1256: bp = buf;
1.1 tholo 1257: if (*prestr == '^') {
1258: cflag = 1;
1259: prestr++;
1260: }
1261: else
1262: cflag = 0;
1.38 millert 1263: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1264: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1265: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1266: for (; ; ) {
1.38 millert 1267: if ((n = u8_rune(&rlxval, prestr)) > 1) {
1268: for (i = 0; i < n; i++)
1269: *bp++ = *prestr++;
1270: continue;
1271: }
1.1 tholo 1272: if ((c = *prestr++) == '\\') {
1.5 kstailey 1273: *bp++ = '\\';
1.1 tholo 1274: if ((c = *prestr++) == '\0')
1.8 millert 1275: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1276: *bp++ = c;
1.10 millert 1277: /* } else if (c == '\n') { */
1278: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1279: } else if (c == '[' && *prestr == ':') {
1280: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1281: for (cc = charclasses; cc->cc_name; cc++)
1282: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1283: break;
1284: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1285: prestr[2 + cc->cc_namelen] == ']') {
1286: prestr += cc->cc_namelen + 3;
1.22 millert 1287: /*
1288: * BUG: We begin at 1, instead of 0, since we
1289: * would otherwise prematurely terminate the
1290: * string for classes like [[:cntrl:]]. This
1291: * means that we can't match the NUL character,
1292: * not without first adapting the entire
1293: * program to track each string's length.
1294: */
1.23 millert 1295: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1296: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1297: FATAL("out of space for reg expr %.10s...", lastre);
1298: if (cc->cc_func(i)) {
1.32 millert 1299: /* escape backslash */
1300: if (i == '\\') {
1301: *bp++ = '\\';
1302: n++;
1303: }
1304:
1.12 millert 1305: *bp++ = i;
1306: n++;
1307: }
1308: }
1.11 millert 1309: } else
1310: *bp++ = c;
1.23 millert 1311: } else if (c == '[' && *prestr == '.') {
1312: char collate_char;
1313: prestr++;
1314: collate_char = *prestr++;
1315: if (*prestr == '.' && prestr[1] == ']') {
1316: prestr += 2;
1317: /* Found it: map via locale TBD: for
1318: now, simply return this char. This
1319: is sufficient to pass conformance
1320: test awk.ex 156
1321: */
1322: if (*prestr == ']') {
1323: prestr++;
1324: rlxval = collate_char;
1325: return CHAR;
1326: }
1327: }
1328: } else if (c == '[' && *prestr == '=') {
1329: char equiv_char;
1330: prestr++;
1331: equiv_char = *prestr++;
1332: if (*prestr == '=' && prestr[1] == ']') {
1333: prestr += 2;
1334: /* Found it: map via locale TBD: for now
1335: simply return this char. This is
1336: sufficient to pass conformance test
1337: awk.ex 156
1338: */
1339: if (*prestr == ']') {
1340: prestr++;
1341: rlxval = equiv_char;
1342: return CHAR;
1343: }
1344: }
1.1 tholo 1345: } else if (c == '\0') {
1.8 millert 1346: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1347: } else if (bp == buf) { /* 1st char is special */
1348: *bp++ = c;
1.1 tholo 1349: } else if (c == ']') {
1.5 kstailey 1350: *bp++ = 0;
1.10 millert 1351: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1352: if (cflag == 0)
1353: return CCL;
1354: else
1355: return NCCL;
1356: } else
1.5 kstailey 1357: *bp++ = c;
1.1 tholo 1358: }
1.23 millert 1359: break;
1360: case '{':
1361: if (isdigit(*(prestr))) {
1362: num = 0; /* Process as a repetition */
1363: n = -1; m = -1;
1.29 millert 1364: commafound = false;
1365: digitfound = false;
1.23 millert 1366: startreptok = prestr-1;
1367: /* Remember start of previous atom here ? */
1368: } else { /* just a { char, not a repetition */
1369: rlxval = c;
1370: return CHAR;
1371: }
1372: for (; ; ) {
1373: if ((c = *prestr++) == '}') {
1374: if (commafound) {
1375: if (digitfound) { /* {n,m} */
1376: m = num;
1.30 millert 1377: if (m < n)
1.23 millert 1378: FATAL("illegal repetition expression: class %.20s",
1379: lastre);
1.30 millert 1380: if (n == 0 && m == 1) {
1.23 millert 1381: return QUEST;
1382: }
1383: } else { /* {n,} */
1.30 millert 1384: if (n == 0)
1385: return STAR;
1386: else if (n == 1)
1387: return PLUS;
1.23 millert 1388: }
1389: } else {
1390: if (digitfound) { /* {n} same as {n,n} */
1391: n = num;
1392: m = num;
1393: } else { /* {} */
1394: FATAL("illegal repetition expression: class %.20s",
1395: lastre);
1396: }
1397: }
1398: if (repeat(starttok, prestr-starttok, lastatom,
1399: startreptok - lastatom, n, m) > 0) {
1.30 millert 1400: if (n == 0 && m == 0) {
1.31 millert 1401: return ZERO;
1.23 millert 1402: }
1403: /* must rescan input for next token */
1404: goto rescan;
1405: }
1406: /* Failed to replace: eat up {...} characters
1407: and treat like just PLUS */
1408: return PLUS;
1409: } else if (c == '\0') {
1410: FATAL("nonterminated character class %.20s",
1411: lastre);
1412: } else if (isdigit(c)) {
1413: num = 10 * num + c - '0';
1.29 millert 1414: digitfound = true;
1.23 millert 1415: } else if (c == ',') {
1416: if (commafound)
1417: FATAL("illegal repetition expression: class %.20s",
1418: lastre);
1419: /* looking for {n,} or {n,m} */
1.29 millert 1420: commafound = true;
1.23 millert 1421: n = num;
1.29 millert 1422: digitfound = false; /* reset */
1.23 millert 1423: num = 0;
1424: } else {
1425: FATAL("illegal repetition expression: class %.20s",
1426: lastre);
1427: }
1428: }
1429: break;
1.1 tholo 1430: }
1431: }
1432:
1433: int cgoto(fa *f, int s, int c)
1434: {
1.27 millert 1435: int *p, *q;
1.1 tholo 1436: int i, j, k;
1437:
1.38 millert 1438: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1439: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1440: resizesetvec(__func__);
1.1 tholo 1441: }
1442: for (i = 0; i <= f->accept; i++)
1443: setvec[i] = 0;
1444: setcnt = 0;
1.27 millert 1445: resize_state(f, s);
1.1 tholo 1446: /* compute positions of gototab[s,c] into setvec */
1447: p = f->posns[s];
1448: for (i = 1; i <= *p; i++) {
1449: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1450: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1451: || (k == DOT && c != 0 && c != HAT)
1452: || (k == ALL && c != 0)
1.15 millert 1453: || (k == EMPTYRE && c != 0)
1.38 millert 1454: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1455: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1456: q = f->re[p[i]].lfollow;
1457: for (j = 1; j <= *q; j++) {
1458: if (q[j] >= maxsetvec) {
1.27 millert 1459: resizesetvec(__func__);
1.1 tholo 1460: }
1461: if (setvec[q[j]] == 0) {
1462: setcnt++;
1463: setvec[q[j]] = 1;
1464: }
1465: }
1466: }
1467: }
1468: }
1469: /* determine if setvec is a previous state */
1470: tmpset[0] = setcnt;
1471: j = 1;
1472: for (i = f->accept; i >= 0; i--)
1473: if (setvec[i]) {
1474: tmpset[j++] = i;
1475: }
1.27 millert 1476: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1477: /* tmpset == previous state? */
1478: for (i = 1; i <= f->curstat; i++) {
1479: p = f->posns[i];
1480: if ((k = tmpset[0]) != p[0])
1481: goto different;
1482: for (j = 1; j <= k; j++)
1483: if (tmpset[j] != p[j])
1484: goto different;
1485: /* setvec is state i */
1.30 millert 1486: if (c != HAT)
1.38 millert 1487: set_gototab(f, s, c, i);
1.1 tholo 1488: return i;
1489: different:;
1490: }
1491:
1492: /* add tmpset to current set of states */
1.27 millert 1493: ++(f->curstat);
1494: resize_state(f, f->curstat);
1.1 tholo 1495: for (i = 0; i < NCHARS; i++)
1.38 millert 1496: set_gototab(f, f->curstat, 0, 0);
1.1 tholo 1497: xfree(f->posns[f->curstat]);
1.27 millert 1498: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1499:
1500: f->posns[f->curstat] = p;
1.30 millert 1501: if (c != HAT)
1.38 millert 1502: set_gototab(f, s, c, f->curstat);
1.1 tholo 1503: for (i = 0; i <= setcnt; i++)
1504: p[i] = tmpset[i];
1505: if (setvec[f->accept])
1506: f->out[f->curstat] = 1;
1507: else
1508: f->out[f->curstat] = 0;
1509: return f->curstat;
1510: }
1511:
1512:
1513: void freefa(fa *f) /* free a finite automaton */
1514: {
1515: int i;
1516:
1517: if (f == NULL)
1518: return;
1.27 millert 1519: for (i = 0; i < f->state_count; i++)
1520: xfree(f->gototab[i])
1.1 tholo 1521: for (i = 0; i <= f->curstat; i++)
1522: xfree(f->posns[i]);
1523: for (i = 0; i <= f->accept; i++) {
1524: xfree(f->re[i].lfollow);
1525: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1526: xfree(f->re[i].lval.np);
1.1 tholo 1527: }
1528: xfree(f->restr);
1.27 millert 1529: xfree(f->out);
1530: xfree(f->posns);
1531: xfree(f->gototab);
1.1 tholo 1532: xfree(f);
1533: }