src/usr.bin/awk/b.c - annotate

Return to b.c CVS log
Up to [local] / src / usr.bin / awk
Annotation of src/usr.bin/awk/b.c, Revision 1.52

1.52    ! millert     1: /*     $OpenBSD: b.c,v 1.51 2024/04/25 18:33:53 millert Exp $  */
1.1       tholo       2: /****************************************************************
1.5       kstailey    3: Copyright (C) Lucent Technologies 1997
1.1       tholo       4: All Rights Reserved
                      5:
                      6: Permission to use, copy, modify, and distribute this software and
                      7: its documentation for any purpose and without fee is hereby
                      8: granted, provided that the above copyright notice appear in all
                      9: copies and that both that the copyright notice and this
                     10: permission notice and warranty disclaimer appear in supporting
1.5       kstailey   11: documentation, and that the name Lucent Technologies or any of
                     12: its entities not be used in advertising or publicity pertaining
                     13: to distribution of the software without specific, written prior
                     14: permission.
                     15:
                     16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
                     17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
                     18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
                     19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
                     21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
                     22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
                     23: THIS SOFTWARE.
1.1       tholo      24: ****************************************************************/
                     25:
1.15      millert    26: /* lasciate ogne speranza, voi ch'intrate. */
1.1       tholo      27:
                     28: #define        DEBUG
                     29:
                     30: #include <ctype.h>
1.23      millert    31: #include <limits.h>
1.1       tholo      32: #include <stdio.h>
                     33: #include <string.h>
                     34: #include <stdlib.h>
                     35: #include "awk.h"
1.34      millert    36: #include "awkgram.tab.h"
1.1       tholo      37:
                     38: #define MAXLIN 22
                     39:
1.7       millert    40: #define type(v)                (v)->nobj       /* badly overloaded here */
                     41: #define info(v)                (v)->ntype      /* badly overloaded here */
1.1       tholo      42: #define left(v)                (v)->narg[0]
                     43: #define right(v)       (v)->narg[1]
                     44: #define parent(v)      (v)->nnext
                     45:
                     46: #define LEAF   case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15      millert    47: #define ELEAF  case EMPTYRE:           /* empty string in regexp */
1.1       tholo      48: #define UNARY  case STAR: case PLUS: case QUEST:
                     49:
                     50: /* encoding in tree Nodes:
1.15      millert    51:        leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1       tholo      52:                left is index, right contains value or pointer to value
                     53:        unary (STAR, PLUS, QUEST): left is child, right is null
                     54:        binary (CAT, OR): left and right are children
                     55:        parent contains pointer to parent
                     56: */
                     57:
                     58:
                     59: int    *setvec;
                     60: int    *tmpset;
                     61: int    maxsetvec = 0;
                     62:
                     63: int    rtok;           /* next token in current re */
1.4       millert    64: int    rlxval;
1.28      millert    65: static const uschar    *rlxstr;
                     66: static const uschar    *prestr;        /* current position in current re */
                     67: static const uschar    *lastre;        /* origin of last re */
                     68: static const uschar    *lastatom;      /* origin of last Atom */
                     69: static const uschar    *starttok;
                     70: static const uschar    *basestr;       /* starts with original, replaced during
1.23      millert    71:                                   repetition processing */
1.28      millert    72: static const uschar    *firstbasestr;
1.1       tholo      73:
1.4       millert    74: static int setcnt;
                     75: static int poscnt;
1.1       tholo      76:
1.28      millert    77: const char     *patbeg;
1.1       tholo      78: int    patlen;
                     79:
1.27      millert    80: #define        NFA     128     /* cache this many dynamic fa's */
1.1       tholo      81: fa     *fatab[NFA];
                     82: int    nfatab  = 0;    /* entries in fatab */
                     83:
1.43      millert    84: extern int u8_nextlen(const char *s);
                     85:
1.38      millert    86:
                     87: /* utf-8 mechanism:
                     88:
                     89:    For most of Awk, utf-8 strings just "work", since they look like
                     90:    null-terminated sequences of 8-bit bytes.
                     91:
                     92:    Functions like length(), index(), and substr() have to operate
                     93:    in units of utf-8 characters.  The u8_* functions in run.c
                     94:    handle this.
                     95:
                     96:    Regular expressions are more complicated, since the basic
                     97:    mechanism of the goto table used 8-bit byte indices into the
                     98:    gototab entries to compute the next state.  Unicode is a lot
                     99:    bigger, so the gototab entries are now structs with a character
1.49      millert   100:    and a next state. These are sorted by code point and binary
                    101:    searched.
1.38      millert   102:
                    103:    Throughout the RE mechanism in b.c, utf-8 characters are
                    104:    converted to their utf-32 value.  This mostly shows up in
                    105:    cclenter, which expands character class ranges like a-z and now
                    106:    alpha-omega.  The size of a gototab array is still about 256.
                    107:    This should be dynamic, but for now things work ok for a single
                    108:    code page of Unicode, which is the most likely case.
                    109:
                    110:    The code changes are localized in run.c and b.c.  I have added a
                    111:    handful of functions to somewhat better hide the implementation,
                    112:    but a lot more could be done.
                    113:
                    114:  */
                    115:
1.49      millert   116: static int entry_cmp(const void *l, const void *r);
1.38      millert   117: static int get_gototab(fa*, int, int);
                    118: static int set_gototab(fa*, int, int, int);
1.49      millert   119: static void clear_gototab(fa*, int);
1.50      millert   120: extern int u8_rune(int *, const char *);
1.38      millert   121:
1.27      millert   122: static int *
                    123: intalloc(size_t n, const char *f)
                    124: {
1.35      millert   125:        int *p = (int *) calloc(n, sizeof(int));
1.27      millert   126:        if (p == NULL)
                    127:                overflo(f);
                    128:        return p;
                    129: }
                    130:
                    131: static void
                    132: allocsetvec(const char *f)
                    133: {
                    134:        maxsetvec = MAXLIN;
1.35      millert   135:        setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
                    136:        tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27      millert   137:        if (setvec == NULL || tmpset == NULL)
                    138:                overflo(f);
                    139: }
                    140:
                    141: static void
                    142: resizesetvec(const char *f)
                    143: {
1.35      millert   144:        setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
                    145:        tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27      millert   146:        if (setvec == NULL || tmpset == NULL)
                    147:                overflo(f);
                    148:        maxsetvec *= 4;
                    149: }
                    150:
                    151: static void
                    152: resize_state(fa *f, int state)
                    153: {
1.49      millert   154:        gtt *p;
1.35      millert   155:        uschar *p2;
                    156:        int **p3;
1.27      millert   157:        int i, new_count;
                    158:
                    159:        if (++state < f->state_count)
                    160:                return;
                    161:
                    162:        new_count = state + 10; /* needs to be tuned */
                    163:
1.49      millert   164:        p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
1.27      millert   165:        if (p == NULL)
                    166:                goto out;
                    167:        f->gototab = p;
                    168:
1.35      millert   169:        p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
                    170:        if (p2 == NULL)
1.27      millert   171:                goto out;
1.35      millert   172:        f->out = p2;
1.27      millert   173:
1.35      millert   174:        p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
                    175:        if (p3 == NULL)
1.27      millert   176:                goto out;
1.35      millert   177:        f->posns = p3;
1.27      millert   178:
                    179:        for (i = f->state_count; i < new_count; ++i) {
1.49      millert   180:                f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
                    181:                if (f->gototab[i].entries == NULL)
1.27      millert   182:                        goto out;
1.49      millert   183:                f->gototab[i].allocated = NCHARS;
                    184:                f->gototab[i].inuse = 0;
                    185:                f->out[i] = 0;
1.27      millert   186:                f->posns[i] = NULL;
                    187:        }
                    188:        f->state_count = new_count;
                    189:        return;
                    190: out:
                    191:        overflo(__func__);
                    192: }
                    193:
1.29      millert   194: fa *makedfa(const char *s, bool anchor)        /* returns dfa for reg expr s */
1.1       tholo     195: {
                    196:        int i, use, nuse;
                    197:        fa *pfa;
1.6       millert   198:        static int now = 1;
1.1       tholo     199:
1.25      millert   200:        if (setvec == NULL) {   /* first time through any RE */
1.27      millert   201:                allocsetvec(__func__);
1.1       tholo     202:        }
                    203:
1.29      millert   204:        if (compile_time != RUNNING)    /* a constant for sure */
1.1       tholo     205:                return mkdfa(s, anchor);
                    206:        for (i = 0; i < nfatab; i++)    /* is it there already? */
                    207:                if (fatab[i]->anchor == anchor
1.11      millert   208:                  && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6       millert   209:                        fatab[i]->use = now++;
1.1       tholo     210:                        return fatab[i];
1.10      millert   211:                }
1.1       tholo     212:        pfa = mkdfa(s, anchor);
                    213:        if (nfatab < NFA) {     /* room for another */
                    214:                fatab[nfatab] = pfa;
1.6       millert   215:                fatab[nfatab]->use = now++;
1.1       tholo     216:                nfatab++;
                    217:                return pfa;
                    218:        }
                    219:        use = fatab[0]->use;    /* replace least-recently used */
                    220:        nuse = 0;
                    221:        for (i = 1; i < nfatab; i++)
                    222:                if (fatab[i]->use < use) {
                    223:                        use = fatab[i]->use;
                    224:                        nuse = i;
                    225:                }
                    226:        freefa(fatab[nuse]);
                    227:        fatab[nuse] = pfa;
1.6       millert   228:        pfa->use = now++;
1.1       tholo     229:        return pfa;
                    230: }
                    231:
1.29      millert   232: fa *mkdfa(const char *s, bool anchor)  /* does the real work of making a dfa */
1.30      millert   233:                                /* anchor = true for anchored matches, else false */
1.1       tholo     234: {
                    235:        Node *p, *p1;
                    236:        fa *f;
                    237:
1.28      millert   238:        firstbasestr = (const uschar *) s;
1.23      millert   239:        basestr = firstbasestr;
1.1       tholo     240:        p = reparse(s);
                    241:        p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
                    242:                /* put ALL STAR in front of reg.  exp. */
                    243:        p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
                    244:                /* put FINAL after reg.  exp. */
                    245:
                    246:        poscnt = 0;
                    247:        penter(p1);     /* enter parent pointers and leaf indices */
1.35      millert   248:        if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27      millert   249:                overflo(__func__);
1.1       tholo     250:        f->accept = poscnt-1;   /* penter has computed number of positions in re */
                    251:        cfoll(f, p1);   /* set up follow sets */
                    252:        freetr(p1);
1.27      millert   253:        resize_state(f, 1);
                    254:        f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
                    255:        f->posns[1] = intalloc(1, __func__);
1.1       tholo     256:        *f->posns[1] = 0;
                    257:        f->initstat = makeinit(f, anchor);
                    258:        f->anchor = anchor;
1.10      millert   259:        f->restr = (uschar *) tostring(s);
1.23      millert   260:        if (firstbasestr != basestr) {
                    261:                if (basestr)
                    262:                        xfree(basestr);
                    263:        }
1.1       tholo     264:        return f;
                    265: }
                    266:
1.29      millert   267: int makeinit(fa *f, bool anchor)
1.1       tholo     268: {
                    269:        int i, k;
                    270:
                    271:        f->curstat = 2;
                    272:        f->out[2] = 0;
                    273:        k = *(f->re[0].lfollow);
1.25      millert   274:        xfree(f->posns[2]);
1.27      millert   275:        f->posns[2] = intalloc(k + 1,  __func__);
1.30      millert   276:        for (i = 0; i <= k; i++) {
1.1       tholo     277:                (f->posns[2])[i] = (f->re[0].lfollow)[i];
                    278:        }
                    279:        if ((f->posns[2])[1] == f->accept)
                    280:                f->out[2] = 1;
1.49      millert   281:        clear_gototab(f, 2);
1.1       tholo     282:        f->curstat = cgoto(f, 2, HAT);
                    283:        if (anchor) {
                    284:                *f->posns[2] = k-1;     /* leave out position 0 */
1.30      millert   285:                for (i = 0; i < k; i++) {
1.1       tholo     286:                        (f->posns[0])[i] = (f->posns[2])[i];
                    287:                }
                    288:
                    289:                f->out[0] = f->out[2];
                    290:                if (f->curstat != 2)
                    291:                        --(*f->posns[f->curstat]);
                    292:        }
                    293:        return f->curstat;
                    294: }
                    295:
                    296: void penter(Node *p)   /* set up parent pointers and leaf indices */
                    297: {
                    298:        switch (type(p)) {
1.15      millert   299:        ELEAF
1.1       tholo     300:        LEAF
1.7       millert   301:                info(p) = poscnt;
1.1       tholo     302:                poscnt++;
                    303:                break;
                    304:        UNARY
                    305:                penter(left(p));
                    306:                parent(left(p)) = p;
                    307:                break;
                    308:        case CAT:
                    309:        case OR:
                    310:                penter(left(p));
                    311:                penter(right(p));
                    312:                parent(left(p)) = p;
                    313:                parent(right(p)) = p;
                    314:                break;
1.31      millert   315:        case ZERO:
                    316:                break;
1.1       tholo     317:        default:        /* can't happen */
1.8       millert   318:                FATAL("can't happen: unknown type %d in penter", type(p));
1.1       tholo     319:                break;
                    320:        }
                    321: }
                    322:
                    323: void freetr(Node *p)   /* free parse tree */
                    324: {
                    325:        switch (type(p)) {
1.15      millert   326:        ELEAF
1.1       tholo     327:        LEAF
                    328:                xfree(p);
                    329:                break;
                    330:        UNARY
1.31      millert   331:        case ZERO:
1.1       tholo     332:                freetr(left(p));
                    333:                xfree(p);
                    334:                break;
                    335:        case CAT:
                    336:        case OR:
                    337:                freetr(left(p));
                    338:                freetr(right(p));
                    339:                xfree(p);
                    340:                break;
                    341:        default:        /* can't happen */
1.8       millert   342:                FATAL("can't happen: unknown type %d in freetr", type(p));
1.1       tholo     343:                break;
                    344:        }
                    345: }
                    346:
                    347: /* in the parsing of regular expressions, metacharacters like . have */
                    348: /* to be seen literally;  \056 is not a metacharacter. */
                    349:
1.38      millert   350: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2       millert   351: {                      /* only pick up one 8-bit byte (2 chars) */
1.28      millert   352:        const uschar *p;
1.1       tholo     353:        int n = 0;
1.2       millert   354:        int i;
1.1       tholo     355:
1.38      millert   356:        for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1       tholo     357:                if (isdigit(*p))
                    358:                        n = 16 * n + *p - '0';
                    359:                else if (*p >= 'a' && *p <= 'f')
                    360:                        n = 16 * n + *p - 'a' + 10;
                    361:                else if (*p >= 'A' && *p <= 'F')
                    362:                        n = 16 * n + *p - 'A' + 10;
                    363:        }
1.28      millert   364:        *pp = p;
1.1       tholo     365:        return n;
                    366: }
                    367:
1.2       millert   368: #define isoctdigit(c) ((c) >= '0' && (c) <= '7')       /* multiple use of arg */
1.1       tholo     369:
1.28      millert   370: int quoted(const uschar **pp)  /* pick up next thing after a \\ */
1.1       tholo     371:                        /* and increment *pp */
                    372: {
1.28      millert   373:        const uschar *p = *pp;
1.1       tholo     374:        int c;
                    375:
1.38      millert   376: /* BUG: should advance by utf-8 char even if makes no sense */
                    377:
                    378:        if ((c = *p++) == 't') {
1.1       tholo     379:                c = '\t';
1.38      millert   380:        } else if (c == 'n') {
1.1       tholo     381:                c = '\n';
1.38      millert   382:        } else if (c == 'f') {
1.1       tholo     383:                c = '\f';
1.38      millert   384:        } else if (c == 'r') {
1.1       tholo     385:                c = '\r';
1.38      millert   386:        } else if (c == 'b') {
1.1       tholo     387:                c = '\b';
1.38      millert   388:        } else if (c == 'v') {
1.25      millert   389:                c = '\v';
1.38      millert   390:        } else if (c == 'a') {
1.25      millert   391:                c = '\a';
1.38      millert   392:        } else if (c == '\\') {
1.1       tholo     393:                c = '\\';
1.38      millert   394:        } else if (c == 'x') {  /* 2 hex digits follow */
                    395:                c = hexstr(&p, 2);      /* this adds a null if number is invalid */
                    396:        } else if (c == 'u') {  /* unicode char number up to 8 hex digits */
                    397:                c = hexstr(&p, 8);
1.1       tholo     398:        } else if (isoctdigit(c)) {     /* \d \dd \ddd */
                    399:                int n = c - '0';
                    400:                if (isoctdigit(*p)) {
                    401:                        n = 8 * n + *p++ - '0';
                    402:                        if (isoctdigit(*p))
                    403:                                n = 8 * n + *p++ - '0';
                    404:                }
                    405:                c = n;
                    406:        } /* else */
                    407:                /* c = c; */
                    408:        *pp = p;
                    409:        return c;
                    410: }
                    411:
1.38      millert   412: int *cclenter(const char *argp)        /* add a character class */
1.1       tholo     413: {
                    414:        int i, c, c2;
1.38      millert   415:        int n;
                    416:        const uschar *p = (const uschar *) argp;
                    417:        int *bp, *retp;
                    418:        static int *buf = NULL;
1.5       kstailey  419:        static int bufsz = 100;
1.1       tholo     420:
1.38      millert   421:        if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8       millert   422:                FATAL("out of space for character class [%.10s...] 1", p);
1.5       kstailey  423:        bp = buf;
1.38      millert   424:        for (i = 0; *p != 0; ) {
1.50      millert   425:                n = u8_rune(&c, (const char *) p);
1.38      millert   426:                p += n;
1.1       tholo     427:                if (c == '\\') {
1.17      millert   428:                        c = quoted(&p);
1.5       kstailey  429:                } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1       tholo     430:                        if (*p != 0) {
1.5       kstailey  431:                                c = bp[-1];
1.38      millert   432:                                /* c2 = *p++; */
1.50      millert   433:                                n = u8_rune(&c2, (const char *) p);
1.38      millert   434:                                p += n;
1.1       tholo     435:                                if (c2 == '\\')
1.38      millert   436:                                        c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1       tholo     437:                                if (c > c2) {   /* empty; ignore */
1.5       kstailey  438:                                        bp--;
1.1       tholo     439:                                        i--;
                    440:                                        continue;
                    441:                                }
                    442:                                while (c < c2) {
1.38      millert   443:                                        if (i >= bufsz) {
1.45      millert   444:                                                buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38      millert   445:                                                if (buf == NULL)
                    446:                                                        FATAL("out of space for character class [%.10s...] 2", p);
                    447:                                                bufsz *= 2;
                    448:                                                bp = buf + i;
                    449:                                        }
1.5       kstailey  450:                                        *bp++ = ++c;
1.1       tholo     451:                                        i++;
                    452:                                }
                    453:                                continue;
                    454:                        }
                    455:                }
1.38      millert   456:                if (i >= bufsz) {
1.45      millert   457:                        buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38      millert   458:                        if (buf == NULL)
                    459:                                FATAL("out of space for character class [%.10s...] 2", p);
                    460:                        bufsz *= 2;
                    461:                        bp = buf + i;
                    462:                }
1.5       kstailey  463:                *bp++ = c;
1.1       tholo     464:                i++;
                    465:        }
1.5       kstailey  466:        *bp = 0;
1.38      millert   467:        /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
                    468:        /* xfree(op);  BUG: what are we freeing here? */
                    469:        retp = (int *) calloc(bp-buf+1, sizeof(int));
                    470:        for (i = 0; i < bp-buf+1; i++)
                    471:                retp[i] = buf[i];
                    472:        return retp;
1.1       tholo     473: }
                    474:
1.11      millert   475: void overflo(const char *s)
1.1       tholo     476: {
1.27      millert   477:        FATAL("regular expression too big: out of space in %.30s...", s);
1.1       tholo     478: }
                    479:
                    480: void cfoll(fa *f, Node *v)     /* enter follow set of each leaf of vertex v into lfollow[leaf] */
                    481: {
                    482:        int i;
1.4       millert   483:        int *p;
1.1       tholo     484:
                    485:        switch (type(v)) {
1.15      millert   486:        ELEAF
1.1       tholo     487:        LEAF
1.7       millert   488:                f->re[info(v)].ltype = type(v);
                    489:                f->re[info(v)].lval.np = right(v);
1.1       tholo     490:                while (f->accept >= maxsetvec) {        /* guessing here! */
1.27      millert   491:                        resizesetvec(__func__);
1.1       tholo     492:                }
                    493:                for (i = 0; i <= f->accept; i++)
                    494:                        setvec[i] = 0;
                    495:                setcnt = 0;
                    496:                follow(v);      /* computes setvec and setcnt */
1.27      millert   497:                p = intalloc(setcnt + 1, __func__);
1.7       millert   498:                f->re[info(v)].lfollow = p;
1.1       tholo     499:                *p = setcnt;
                    500:                for (i = f->accept; i >= 0; i--)
                    501:                        if (setvec[i] == 1)
                    502:                                *++p = i;
                    503:                break;
                    504:        UNARY
                    505:                cfoll(f,left(v));
                    506:                break;
                    507:        case CAT:
                    508:        case OR:
                    509:                cfoll(f,left(v));
                    510:                cfoll(f,right(v));
                    511:                break;
1.31      millert   512:        case ZERO:
                    513:                break;
1.1       tholo     514:        default:        /* can't happen */
1.8       millert   515:                FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1       tholo     516:        }
                    517: }
                    518:
                    519: int first(Node *p)     /* collects initially active leaves of p into setvec */
1.15      millert   520:                        /* returns 0 if p matches empty string */
1.1       tholo     521: {
1.4       millert   522:        int b, lp;
1.1       tholo     523:
                    524:        switch (type(p)) {
1.15      millert   525:        ELEAF
1.1       tholo     526:        LEAF
1.7       millert   527:                lp = info(p);   /* look for high-water mark of subscripts */
1.1       tholo     528:                while (setcnt >= maxsetvec || lp >= maxsetvec) {        /* guessing here! */
1.27      millert   529:                        resizesetvec(__func__);
1.1       tholo     530:                }
1.15      millert   531:                if (type(p) == EMPTYRE) {
                    532:                        setvec[lp] = 0;
                    533:                        return(0);
                    534:                }
1.1       tholo     535:                if (setvec[lp] != 1) {
                    536:                        setvec[lp] = 1;
                    537:                        setcnt++;
                    538:                }
1.41      millert   539:                if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1       tholo     540:                        return(0);              /* empty CCL */
1.30      millert   541:                return(1);
1.1       tholo     542:        case PLUS:
1.30      millert   543:                if (first(left(p)) == 0)
                    544:                        return(0);
1.1       tholo     545:                return(1);
                    546:        case STAR:
                    547:        case QUEST:
                    548:                first(left(p));
                    549:                return(0);
                    550:        case CAT:
                    551:                if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
                    552:                return(1);
                    553:        case OR:
                    554:                b = first(right(p));
                    555:                if (first(left(p)) == 0 || b == 0) return(0);
                    556:                return(1);
1.31      millert   557:        case ZERO:
                    558:                return 0;
1.1       tholo     559:        }
1.8       millert   560:        FATAL("can't happen: unknown type %d in first", type(p));       /* can't happen */
1.1       tholo     561:        return(-1);
                    562: }
                    563:
                    564: void follow(Node *v)   /* collects leaves that can follow v into setvec */
                    565: {
                    566:        Node *p;
                    567:
                    568:        if (type(v) == FINAL)
                    569:                return;
                    570:        p = parent(v);
                    571:        switch (type(p)) {
                    572:        case STAR:
                    573:        case PLUS:
                    574:                first(v);
                    575:                follow(p);
                    576:                return;
                    577:
                    578:        case OR:
                    579:        case QUEST:
                    580:                follow(p);
                    581:                return;
                    582:
                    583:        case CAT:
                    584:                if (v == left(p)) {     /* v is left child of p */
                    585:                        if (first(right(p)) == 0) {
                    586:                                follow(p);
                    587:                                return;
                    588:                        }
                    589:                } else          /* v is right child */
                    590:                        follow(p);
                    591:                return;
                    592:        }
                    593: }
                    594:
1.38      millert   595: int member(int c, int *sarg)   /* is c in s? */
1.1       tholo     596: {
1.38      millert   597:        int *s = (int *) sarg;
1.10      millert   598:
1.1       tholo     599:        while (*s)
                    600:                if (c == *s++)
                    601:                        return(1);
                    602:        return(0);
                    603: }
                    604:
1.49      millert   605: static void resize_gototab(fa *f, int state)
                    606: {
                    607:        size_t new_size = f->gototab[state].allocated * 2;
                    608:        gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
                    609:        if (p == NULL)
                    610:                overflo(__func__);
                    611:
                    612:        // need to initialized the new memory to zero
                    613:        size_t orig_size = f->gototab[state].allocated;         // 2nd half of new mem is this size
                    614:        memset(p + orig_size, 0, orig_size * sizeof(gtte));     // clean it out
                    615:
1.51      millert   616:        f->gototab[state].allocated = new_size;                 // update gototab info
1.49      millert   617:        f->gototab[state].entries = p;
                    618: }
                    619:
1.51      millert   620: static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
1.38      millert   621: {
1.49      millert   622:        gtte key;
                    623:        gtte *item;
                    624:
                    625:        key.ch = ch;
                    626:        key.state = 0;  /* irrelevant */
1.50      millert   627:        item = (gtte *) bsearch(& key, f->gototab[state].entries,
1.49      millert   628:                        f->gototab[state].inuse, sizeof(gtte),
                    629:                        entry_cmp);
                    630:
                    631:        if (item == NULL)
                    632:                return 0;
                    633:        else
                    634:                return item->state;
1.38      millert   635: }
                    636:
1.49      millert   637: static int entry_cmp(const void *l, const void *r)
1.44      millert   638: {
1.49      millert   639:        const gtte *left, *right;
                    640:
                    641:        left = (const gtte *) l;
                    642:        right = (const gtte *) r;
                    643:
                    644:        return left->ch - right->ch;
1.44      millert   645: }
                    646:
1.51      millert   647: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
1.38      millert   648: {
1.49      millert   649:        if (f->gototab[state].inuse == 0) {
                    650:                f->gototab[state].entries[0].ch = ch;
                    651:                f->gototab[state].entries[0].state = val;
                    652:                f->gototab[state].inuse++;
                    653:                return val;
1.52    ! millert   654:        } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
1.49      millert   655:                // not seen yet, insert and return
                    656:                gtt *tab = & f->gototab[state];
                    657:                if (tab->inuse + 1 >= tab->allocated)
                    658:                        resize_gototab(f, state);
                    659:
1.51      millert   660:                f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
                    661:                f->gototab[state].entries[f->gototab[state].inuse].state = val;
1.49      millert   662:                f->gototab[state].inuse++;
                    663:                return val;
                    664:        } else {
                    665:                // maybe we have it, maybe we don't
                    666:                gtte key;
                    667:                gtte *item;
                    668:
                    669:                key.ch = ch;
                    670:                key.state = 0;  /* irrelevant */
1.50      millert   671:                item = (gtte *) bsearch(& key, f->gototab[state].entries,
1.49      millert   672:                                f->gototab[state].inuse, sizeof(gtte),
                    673:                                entry_cmp);
                    674:
                    675:                if (item != NULL) {
                    676:                        // we have it, update state and return
                    677:                        item->state = val;
                    678:                        return item->state;
                    679:                }
                    680:                // otherwise, fall through to insert and reallocate.
                    681:        }
                    682:
                    683:        gtt *tab = & f->gototab[state];
                    684:        if (tab->inuse + 1 >= tab->allocated)
                    685:                resize_gototab(f, state);
                    686:        f->gototab[state].entries[tab->inuse].ch = ch;
                    687:        f->gototab[state].entries[tab->inuse].state = val;
1.51      millert   688:        ++tab->inuse;
1.49      millert   689:
                    690:        qsort(f->gototab[state].entries,
                    691:                f->gototab[state].inuse, sizeof(gtte), entry_cmp);
                    692:
1.38      millert   693:        return val; /* not used anywhere at the moment */
                    694: }
                    695:
1.49      millert   696: static void clear_gototab(fa *f, int state)
                    697: {
                    698:        memset(f->gototab[state].entries, 0,
                    699:                f->gototab[state].allocated * sizeof(gtte));
                    700:        f->gototab[state].inuse = 0;
                    701: }
                    702:
1.11      millert   703: int match(fa *f, const char *p0)       /* shortest match ? */
1.1       tholo     704: {
                    705:        int s, ns;
1.38      millert   706:        int n;
                    707:        int rune;
1.28      millert   708:        const uschar *p = (const uschar *) p0;
1.1       tholo     709:
1.38      millert   710:        /* return pmatch(f, p0); does it matter whether longest or shortest? */
                    711:
1.27      millert   712:        s = f->initstat;
                    713:        assert (s < f->state_count);
                    714:
1.1       tholo     715:        if (f->out[s])
                    716:                return(1);
                    717:        do {
1.15      millert   718:                /* assert(*p < NCHARS); */
1.50      millert   719:                n = u8_rune(&rune, (const char *) p);
1.38      millert   720:                if ((ns = get_gototab(f, s, rune)) != 0)
1.1       tholo     721:                        s = ns;
                    722:                else
1.38      millert   723:                        s = cgoto(f, s, rune);
1.1       tholo     724:                if (f->out[s])
                    725:                        return(1);
1.38      millert   726:                if (*p == 0)
                    727:                        break;
                    728:                p += n;
                    729:        } while (1);  /* was *p++ != 0 */
1.1       tholo     730:        return(0);
                    731: }
                    732:
1.11      millert   733: int pmatch(fa *f, const char *p0)      /* longest match, for sub */
1.1       tholo     734: {
                    735:        int s, ns;
1.38      millert   736:        int n;
                    737:        int rune;
1.28      millert   738:        const uschar *p = (const uschar *) p0;
                    739:        const uschar *q;
1.1       tholo     740:
1.27      millert   741:        s = f->initstat;
                    742:        assert(s < f->state_count);
                    743:
1.28      millert   744:        patbeg = (const char *)p;
1.1       tholo     745:        patlen = -1;
                    746:        do {
                    747:                q = p;
                    748:                do {
                    749:                        if (f->out[s])          /* final state */
                    750:                                patlen = q-p;
1.15      millert   751:                        /* assert(*q < NCHARS); */
1.50      millert   752:                        n = u8_rune(&rune, (const char *) q);
1.38      millert   753:                        if ((ns = get_gototab(f, s, rune)) != 0)
1.1       tholo     754:                                s = ns;
                    755:                        else
1.38      millert   756:                                s = cgoto(f, s, rune);
1.27      millert   757:
                    758:                        assert(s < f->state_count);
                    759:
1.9       deraadt   760:                        if (s == 1) {   /* no transition */
1.1       tholo     761:                                if (patlen >= 0) {
1.28      millert   762:                                        patbeg = (const char *) p;
1.1       tholo     763:                                        return(1);
                    764:                                }
                    765:                                else
                    766:                                        goto nextin;    /* no match */
1.9       deraadt   767:                        }
1.38      millert   768:                        if (*q == 0)
                    769:                                break;
                    770:                        q += n;
                    771:                } while (1);
                    772:                q++;  /* was *q++ */
1.1       tholo     773:                if (f->out[s])
                    774:                        patlen = q-p-1; /* don't count $ */
                    775:                if (patlen >= 0) {
1.28      millert   776:                        patbeg = (const char *) p;
1.1       tholo     777:                        return(1);
                    778:                }
                    779:        nextin:
                    780:                s = 2;
1.38      millert   781:                if (*p == 0)
                    782:                        break;
1.50      millert   783:                n = u8_rune(&rune, (const char *) p);
1.38      millert   784:                p += n;
                    785:        } while (1); /* was *p++ */
1.1       tholo     786:        return (0);
                    787: }
                    788:
1.11      millert   789: int nematch(fa *f, const char *p0)     /* non-empty match, for sub */
1.1       tholo     790: {
                    791:        int s, ns;
1.38      millert   792:        int n;
                    793:        int rune;
1.28      millert   794:        const uschar *p = (const uschar *) p0;
                    795:        const uschar *q;
1.1       tholo     796:
1.27      millert   797:        s = f->initstat;
                    798:        assert(s < f->state_count);
                    799:
1.28      millert   800:        patbeg = (const char *)p;
1.1       tholo     801:        patlen = -1;
                    802:        while (*p) {
                    803:                q = p;
                    804:                do {
                    805:                        if (f->out[s])          /* final state */
                    806:                                patlen = q-p;
1.15      millert   807:                        /* assert(*q < NCHARS); */
1.50      millert   808:                        n = u8_rune(&rune, (const char *) q);
1.38      millert   809:                        if ((ns = get_gototab(f, s, rune)) != 0)
1.1       tholo     810:                                s = ns;
                    811:                        else
1.38      millert   812:                                s = cgoto(f, s, rune);
1.9       deraadt   813:                        if (s == 1) {   /* no transition */
1.1       tholo     814:                                if (patlen > 0) {
1.28      millert   815:                                        patbeg = (const char *) p;
1.1       tholo     816:                                        return(1);
                    817:                                } else
                    818:                                        goto nnextin;   /* no nonempty match */
1.9       deraadt   819:                        }
1.38      millert   820:                        if (*q == 0)
                    821:                                break;
                    822:                        q += n;
                    823:                } while (1);
                    824:                q++;
1.1       tholo     825:                if (f->out[s])
                    826:                        patlen = q-p-1; /* don't count $ */
                    827:                if (patlen > 0 ) {
1.28      millert   828:                        patbeg = (const char *) p;
1.1       tholo     829:                        return(1);
                    830:                }
                    831:        nnextin:
                    832:                s = 2;
                    833:                p++;
                    834:        }
                    835:        return (0);
                    836: }
                    837:
1.43      millert   838:
1.26      millert   839: /*
                    840:  * NAME
                    841:  *     fnematch
                    842:  *
                    843:  * DESCRIPTION
                    844:  *     A stream-fed version of nematch which transfers characters to a
                    845:  *     null-terminated buffer. All characters up to and including the last
                    846:  *     character of the matching text or EOF are placed in the buffer. If
                    847:  *     a match is found, patbeg and patlen are set appropriately.
                    848:  *
                    849:  * RETURN VALUES
1.29      millert   850:  *     false    No match found.
                    851:  *     true     Match found.
1.26      millert   852:  */
                    853:
1.29      millert   854: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26      millert   855: {
1.48      millert   856:        char *i, *j, *k, *buf = *pbuf;
1.26      millert   857:        int bufsize = *pbufsize;
1.48      millert   858:        int c, n, ns, s;
1.26      millert   859:
                    860:        s = pfa->initstat;
                    861:        patlen = 0;
                    862:
                    863:        /*
1.48      millert   864:         * buf <= i <= j <= k <= buf+bufsize
1.26      millert   865:         *
1.48      millert   866:         * i: origin of active substring
                    867:         * j: current character
                    868:         * k: destination of the next getc
1.26      millert   869:         */
1.48      millert   870:
                    871:        i = j = k = buf;
                    872:
                    873:        do {
                    874:                /*
1.51      millert   875:                 * Call u8_rune with at least awk_mb_cur_max ahead in
1.48      millert   876:                 * the buffer until EOF interferes.
                    877:                 */
1.52    ! millert   878:                if (k - j < (int)awk_mb_cur_max) {
1.51      millert   879:                        if (k + awk_mb_cur_max > buf + bufsize) {
                    880:                                char *obuf = buf;
1.48      millert   881:                                adjbuf(&buf, &bufsize,
1.51      millert   882:                                    bufsize + awk_mb_cur_max,
1.48      millert   883:                                    quantum, 0, "fnematch");
1.51      millert   884:
                    885:                                /* buf resized, maybe moved. update pointers */
                    886:                                *pbufsize = bufsize;
                    887:                                if (obuf != buf) {
                    888:                                        i = buf + (i - obuf);
                    889:                                        j = buf + (j - obuf);
                    890:                                        k = buf + (k - obuf);
                    891:                                        *pbuf = buf;
                    892:                                        if (patlen)
                    893:                                                patbeg = buf + (patbeg - obuf);
                    894:                                }
1.46      millert   895:                        }
1.51      millert   896:                        for (n = awk_mb_cur_max ; n > 0; n--) {
1.48      millert   897:                                *k++ = (c = getc(f)) != EOF ? c : 0;
                    898:                                if (c == EOF) {
                    899:                                        if (ferror(f))
                    900:                                                FATAL("fnematch: getc error");
                    901:                                        break;
                    902:                                }
1.38      millert   903:                        }
1.48      millert   904:                }
1.26      millert   905:
1.50      millert   906:                j += u8_rune(&c, j);
1.48      millert   907:
                    908:                if ((ns = get_gototab(pfa, s, c)) != 0)
                    909:                        s = ns;
                    910:                else
                    911:                        s = cgoto(pfa, s, c);
1.26      millert   912:
1.48      millert   913:                if (pfa->out[s]) {      /* final state */
                    914:                        patbeg = i;
                    915:                        patlen = j - i;
                    916:                        if (c == 0)     /* don't count $ */
                    917:                                patlen--;
                    918:                }
                    919:
                    920:                if (c && s != 1)
                    921:                        continue;  /* origin i still viable, next j */
                    922:                if (patlen)
                    923:                        break;     /* best match found */
                    924:
                    925:                /* no match at origin i, next i and start over */
1.50      millert   926:                i += u8_rune(&c, i);
1.48      millert   927:                if (c == 0)
                    928:                        break;    /* no match */
                    929:                j = i;
1.26      millert   930:                s = 2;
1.48      millert   931:        } while (1);
1.26      millert   932:
                    933:        if (patlen) {
                    934:                /*
                    935:                 * Under no circumstances is the last character fed to
                    936:                 * the automaton part of the match. It is EOF's nullbyte,
                    937:                 * or it sent the automaton into a state with no further
                    938:                 * transitions available (s==1), or both. Room for a
                    939:                 * terminating nullbyte is guaranteed.
                    940:                 *
                    941:                 * ungetc any chars after the end of matching text
                    942:                 * (except for EOF's nullbyte, if present) and null
                    943:                 * terminate the buffer.
                    944:                 */
1.48      millert   945:                do
                    946:                        if (*--k && ungetc(*k, f) == EOF)
                    947:                                FATAL("unable to ungetc '%c'", *k);
                    948:                while (k > patbeg + patlen);
                    949:                *k = '\0';
1.29      millert   950:                return true;
1.26      millert   951:        }
                    952:        else
1.29      millert   953:                return false;
1.26      millert   954: }
                    955:
1.11      millert   956: Node *reparse(const char *p)   /* parses regular expression pointed to by p */
1.1       tholo     957: {                      /* uses relex() to scan regular expression */
                    958:        Node *np;
                    959:
1.32      millert   960:        DPRINTF("reparse <%s>\n", p);
1.28      millert   961:        lastre = prestr = (const uschar *) p;   /* prestr points to string to be parsed */
1.1       tholo     962:        rtok = relex();
1.11      millert   963:        /* GNU compatibility: an empty regexp matches anything */
1.15      millert   964:        if (rtok == '\0') {
1.11      millert   965:                /* FATAL("empty regular expression"); previous */
1.15      millert   966:                return(op2(EMPTYRE, NIL, NIL));
                    967:        }
1.1       tholo     968:        np = regexp();
                    969:        if (rtok != '\0')
1.8       millert   970:                FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1       tholo     971:        return(np);
                    972: }
                    973:
                    974: Node *regexp(void)     /* top-level parse of reg expr */
                    975: {
                    976:        return (alt(concat(primary())));
                    977: }
                    978:
                    979: Node *primary(void)
                    980: {
                    981:        Node *np;
1.23      millert   982:        int savelastatom;
1.1       tholo     983:
                    984:        switch (rtok) {
                    985:        case CHAR:
1.23      millert   986:                lastatom = starttok;
1.7       millert   987:                np = op2(CHAR, NIL, itonp(rlxval));
1.1       tholo     988:                rtok = relex();
                    989:                return (unary(np));
                    990:        case ALL:
                    991:                rtok = relex();
                    992:                return (unary(op2(ALL, NIL, NIL)));
1.15      millert   993:        case EMPTYRE:
                    994:                rtok = relex();
1.23      millert   995:                return (unary(op2(EMPTYRE, NIL, NIL)));
1.1       tholo     996:        case DOT:
1.23      millert   997:                lastatom = starttok;
1.1       tholo     998:                rtok = relex();
                    999:                return (unary(op2(DOT, NIL, NIL)));
                   1000:        case CCL:
1.28      millert  1001:                np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23      millert  1002:                lastatom = starttok;
1.1       tholo    1003:                rtok = relex();
                   1004:                return (unary(np));
                   1005:        case NCCL:
1.28      millert  1006:                np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23      millert  1007:                lastatom = starttok;
1.1       tholo    1008:                rtok = relex();
                   1009:                return (unary(np));
                   1010:        case '^':
                   1011:                rtok = relex();
1.7       millert  1012:                return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1       tholo    1013:        case '$':
                   1014:                rtok = relex();
                   1015:                return (unary(op2(CHAR, NIL, NIL)));
                   1016:        case '(':
1.23      millert  1017:                lastatom = starttok;
                   1018:                savelastatom = starttok - basestr; /* Retain over recursion */
1.1       tholo    1019:                rtok = relex();
                   1020:                if (rtok == ')') {      /* special pleading for () */
                   1021:                        rtok = relex();
1.42      millert  1022:                        return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1       tholo    1023:                }
                   1024:                np = regexp();
                   1025:                if (rtok == ')') {
1.23      millert  1026:                        lastatom = basestr + savelastatom; /* Restore */
1.1       tholo    1027:                        rtok = relex();
                   1028:                        return (unary(np));
                   1029:                }
                   1030:                else
1.8       millert  1031:                        FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1       tholo    1032:        default:
1.8       millert  1033:                FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1       tholo    1034:        }
                   1035:        return 0;       /*NOTREACHED*/
                   1036: }
                   1037:
                   1038: Node *concat(Node *np)
                   1039: {
                   1040:        switch (rtok) {
1.23      millert  1041:        case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1       tholo    1042:                return (concat(op2(CAT, np, primary())));
1.23      millert  1043:        case EMPTYRE:
                   1044:                rtok = relex();
1.42      millert  1045:                return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23      millert  1046:                                primary())));
1.1       tholo    1047:        }
                   1048:        return (np);
                   1049: }
                   1050:
                   1051: Node *alt(Node *np)
                   1052: {
                   1053:        if (rtok == OR) {
                   1054:                rtok = relex();
                   1055:                return (alt(op2(OR, np, concat(primary()))));
                   1056:        }
                   1057:        return (np);
                   1058: }
                   1059:
                   1060: Node *unary(Node *np)
                   1061: {
                   1062:        switch (rtok) {
                   1063:        case STAR:
                   1064:                rtok = relex();
                   1065:                return (unary(op2(STAR, np, NIL)));
                   1066:        case PLUS:
                   1067:                rtok = relex();
                   1068:                return (unary(op2(PLUS, np, NIL)));
                   1069:        case QUEST:
                   1070:                rtok = relex();
                   1071:                return (unary(op2(QUEST, np, NIL)));
1.31      millert  1072:        case ZERO:
                   1073:                rtok = relex();
                   1074:                return (unary(op2(ZERO, np, NIL)));
1.1       tholo    1075:        default:
                   1076:                return (np);
                   1077:        }
                   1078: }
                   1079:
1.11      millert  1080: /*
                   1081:  * Character class definitions conformant to the POSIX locale as
                   1082:  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
                   1083:  * and operating character sets are both ASCII (ISO646) or supersets
                   1084:  * thereof.
                   1085:  *
                   1086:  * Note that to avoid overflowing the temporary buffer used in
                   1087:  * relex(), the expanded character class (prior to range expansion)
                   1088:  * must be less than twice the size of their full name.
                   1089:  */
1.12      millert  1090:
                   1091: /* Because isblank doesn't show up in any of the header files on any
                   1092:  * system i use, it's defined here.  if some other locale has a richer
                   1093:  * definition of "blank", define HAS_ISBLANK and provide your own
                   1094:  * version.
                   1095:  * the parentheses here are an attempt to find a path through the maze
                   1096:  * of macro definition and/or function and/or version provided.  thanks
                   1097:  * to nelson beebe for the suggestion; let's see if it works everywhere.
                   1098:  */
                   1099:
                   1100: #ifndef HAS_ISBLANK
                   1101:
1.17      millert  1102: int (xisblank)(int c)
1.12      millert  1103: {
                   1104:        return c==' ' || c=='\t';
                   1105: }
                   1106:
                   1107: #endif
                   1108:
1.31      millert  1109: static const struct charclass {
1.11      millert  1110:        const char *cc_name;
                   1111:        int cc_namelen;
1.12      millert  1112:        int (*cc_func)(int);
1.11      millert  1113: } charclasses[] = {
1.12      millert  1114:        { "alnum",      5,      isalnum },
                   1115:        { "alpha",      5,      isalpha },
1.17      millert  1116: #ifndef HAS_ISBLANK
1.21      millert  1117:        { "blank",      5,      xisblank },
1.17      millert  1118: #else
1.12      millert  1119:        { "blank",      5,      isblank },
1.17      millert  1120: #endif
1.12      millert  1121:        { "cntrl",      5,      iscntrl },
                   1122:        { "digit",      5,      isdigit },
                   1123:        { "graph",      5,      isgraph },
                   1124:        { "lower",      5,      islower },
                   1125:        { "print",      5,      isprint },
                   1126:        { "punct",      5,      ispunct },
                   1127:        { "space",      5,      isspace },
                   1128:        { "upper",      5,      isupper },
                   1129:        { "xdigit",     6,      isxdigit },
1.11      millert  1130:        { NULL,         0,      NULL },
                   1131: };
                   1132:
1.23      millert  1133: #define REPEAT_SIMPLE          0
                   1134: #define REPEAT_PLUS_APPENDED   1
                   1135: #define REPEAT_WITH_Q          2
                   1136: #define REPEAT_ZERO            3
                   1137:
                   1138: static int
                   1139: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
                   1140:               int atomlen, int firstnum, int secondnum, int special_case)
                   1141: {
                   1142:        int i, j;
1.26      millert  1143:        uschar *buf = NULL;
1.23      millert  1144:        int ret = 1;
1.31      millert  1145:        int init_q = (firstnum == 0);           /* first added char will be ? */
1.23      millert  1146:        int n_q_reps = secondnum-firstnum;      /* m>n, so reduce until {1,m-n} left  */
                   1147:        int prefix_length = reptok - basestr;   /* prefix includes first rep    */
1.28      millert  1148:        int suffix_length = strlen((const char *) reptok) - reptoklen;  /* string after rep specifier   */
1.23      millert  1149:        int size = prefix_length +  suffix_length;
                   1150:
                   1151:        if (firstnum > 1) {     /* add room for reps 2 through firstnum */
                   1152:                size += atomlen*(firstnum-1);
                   1153:        }
                   1154:
                   1155:        /* Adjust size of buffer for special cases */
                   1156:        if (special_case == REPEAT_PLUS_APPENDED) {
                   1157:                size++;         /* for the final + */
                   1158:        } else if (special_case == REPEAT_WITH_Q) {
1.36      millert  1159:                size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23      millert  1160:        } else if (special_case == REPEAT_ZERO) {
                   1161:                size += 2;      /* just a null ERE: () */
                   1162:        }
1.35      millert  1163:        if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23      millert  1164:                FATAL("out of space in reg expr %.10s..", lastre);
                   1165:        memcpy(buf, basestr, prefix_length);    /* copy prefix  */
                   1166:        j = prefix_length;
                   1167:        if (special_case == REPEAT_ZERO) {
                   1168:                j -= atomlen;
                   1169:                buf[j++] = '(';
                   1170:                buf[j++] = ')';
                   1171:        }
1.30      millert  1172:        for (i = 1; i < firstnum; i++) {        /* copy x reps  */
1.23      millert  1173:                memcpy(&buf[j], atom, atomlen);
                   1174:                j += atomlen;
                   1175:        }
                   1176:        if (special_case == REPEAT_PLUS_APPENDED) {
                   1177:                buf[j++] = '+';
                   1178:        } else if (special_case == REPEAT_WITH_Q) {
1.29      millert  1179:                if (init_q)
                   1180:                        buf[j++] = '?';
1.30      millert  1181:                for (i = init_q; i < n_q_reps; i++) {   /* copy x? reps */
1.23      millert  1182:                        memcpy(&buf[j], atom, atomlen);
                   1183:                        j += atomlen;
                   1184:                        buf[j++] = '?';
                   1185:                }
                   1186:        }
                   1187:        memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36      millert  1188:        j += suffix_length;
                   1189:        buf[j] = '\0';
1.23      millert  1190:        /* free old basestr */
                   1191:        if (firstbasestr != basestr) {
                   1192:                if (basestr)
                   1193:                        xfree(basestr);
                   1194:        }
                   1195:        basestr = buf;
                   1196:        prestr  = buf + prefix_length;
                   1197:        if (special_case == REPEAT_ZERO) {
                   1198:                prestr  -= atomlen;
                   1199:                ret++;
                   1200:        }
                   1201:        return ret;
                   1202: }
                   1203:
                   1204: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
                   1205:                  int atomlen, int firstnum, int secondnum)
                   1206: {
                   1207:        /*
                   1208:           In general, the repetition specifier or "bound" is replaced here
                   1209:           by an equivalent ERE string, repeating the immediately previous atom
                   1210:           and appending ? and + as needed. Note that the first copy of the
                   1211:           atom is left in place, except in the special_case of a zero-repeat
                   1212:           (i.e., {0}).
                   1213:         */
                   1214:        if (secondnum < 0) {    /* means {n,} -> repeat n-1 times followed by PLUS */
                   1215:                if (firstnum < 2) {
                   1216:                        /* 0 or 1: should be handled before you get here */
                   1217:                        FATAL("internal error");
                   1218:                } else {
                   1219:                        return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1220:                                firstnum, secondnum, REPEAT_PLUS_APPENDED);
                   1221:                }
                   1222:        } else if (firstnum == secondnum) {     /* {n} or {n,n} -> simply repeat n-1 times */
                   1223:                if (firstnum == 0) {    /* {0} or {0,0} */
                   1224:                        /* This case is unusual because the resulting
                   1225:                           replacement string might actually be SMALLER than
                   1226:                           the original ERE */
                   1227:                        return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1228:                                        firstnum, secondnum, REPEAT_ZERO);
                   1229:                } else {                /* (firstnum >= 1) */
                   1230:                        return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1231:                                        firstnum, secondnum, REPEAT_SIMPLE);
                   1232:                }
                   1233:        } else if (firstnum < secondnum) {      /* {n,m} -> repeat n-1 times then alternate  */
                   1234:                /*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?   */
                   1235:                return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1236:                                        firstnum, secondnum, REPEAT_WITH_Q);
                   1237:        } else {        /* Error - shouldn't be here (n>m) */
                   1238:                FATAL("internal error");
                   1239:        }
                   1240:        return 0;
                   1241: }
1.11      millert  1242:
1.1       tholo    1243: int relex(void)                /* lexical analyzer for reparse */
                   1244: {
1.5       kstailey 1245:        int c, n;
1.1       tholo    1246:        int cflag;
1.25      millert  1247:        static uschar *buf = NULL;
1.5       kstailey 1248:        static int bufsz = 100;
1.10      millert  1249:        uschar *bp;
1.31      millert  1250:        const struct charclass *cc;
1.12      millert  1251:        int i;
1.29      millert  1252:        int num, m;
                   1253:        bool commafound, digitfound;
1.23      millert  1254:        const uschar *startreptok;
1.24      millert  1255:        static int parens = 0;
1.23      millert  1256:
                   1257: rescan:
                   1258:        starttok = prestr;
1.1       tholo    1259:
1.50      millert  1260:        if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1.38      millert  1261:                prestr += n;
                   1262:                starttok = prestr;
                   1263:                return CHAR;
                   1264:        }
                   1265:
1.1       tholo    1266:        switch (c = *prestr++) {
                   1267:        case '|': return OR;
                   1268:        case '*': return STAR;
                   1269:        case '+': return PLUS;
                   1270:        case '?': return QUEST;
                   1271:        case '.': return DOT;
                   1272:        case '\0': prestr--; return '\0';
                   1273:        case '^':
                   1274:        case '$':
1.24      millert  1275:                return c;
1.1       tholo    1276:        case '(':
1.24      millert  1277:                parens++;
                   1278:                return c;
1.1       tholo    1279:        case ')':
1.24      millert  1280:                if (parens) {
                   1281:                        parens--;
                   1282:                        return c;
                   1283:                }
                   1284:                /* unmatched close parenthesis; per POSIX, treat as literal */
                   1285:                rlxval = c;
                   1286:                return CHAR;
1.1       tholo    1287:        case '\\':
1.17      millert  1288:                rlxval = quoted(&prestr);
1.1       tholo    1289:                return CHAR;
                   1290:        default:
                   1291:                rlxval = c;
                   1292:                return CHAR;
1.25      millert  1293:        case '[':
1.35      millert  1294:                if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8       millert  1295:                        FATAL("out of space in reg expr %.10s..", lastre);
1.5       kstailey 1296:                bp = buf;
1.1       tholo    1297:                if (*prestr == '^') {
                   1298:                        cflag = 1;
                   1299:                        prestr++;
                   1300:                }
                   1301:                else
                   1302:                        cflag = 0;
1.38      millert  1303:                n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1.15      millert  1304:                if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8       millert  1305:                        FATAL("out of space for reg expr %.10s...", lastre);
1.1       tholo    1306:                for (; ; ) {
1.50      millert  1307:                        if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1.38      millert  1308:                                for (i = 0; i < n; i++)
                   1309:                                        *bp++ = *prestr++;
                   1310:                                continue;
                   1311:                        }
1.1       tholo    1312:                        if ((c = *prestr++) == '\\') {
1.5       kstailey 1313:                                *bp++ = '\\';
1.1       tholo    1314:                                if ((c = *prestr++) == '\0')
1.8       millert  1315:                                        FATAL("nonterminated character class %.20s...", lastre);
1.5       kstailey 1316:                                *bp++ = c;
1.10      millert  1317:                        /* } else if (c == '\n') { */
                   1318:                        /*      FATAL("newline in character class %.20s...", lastre); */
1.11      millert  1319:                        } else if (c == '[' && *prestr == ':') {
                   1320:                                /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
                   1321:                                for (cc = charclasses; cc->cc_name; cc++)
                   1322:                                        if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
                   1323:                                                break;
                   1324:                                if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
                   1325:                                    prestr[2 + cc->cc_namelen] == ']') {
                   1326:                                        prestr += cc->cc_namelen + 3;
1.22      millert  1327:                                        /*
                   1328:                                         * BUG: We begin at 1, instead of 0, since we
                   1329:                                         * would otherwise prematurely terminate the
                   1330:                                         * string for classes like [[:cntrl:]]. This
                   1331:                                         * means that we can't match the NUL character,
                   1332:                                         * not without first adapting the entire
                   1333:                                         * program to track each string's length.
                   1334:                                         */
1.23      millert  1335:                                        for (i = 1; i <= UCHAR_MAX; i++) {
1.37      millert  1336:                                                if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12      millert  1337:                                                    FATAL("out of space for reg expr %.10s...", lastre);
                   1338:                                                if (cc->cc_func(i)) {
1.32      millert  1339:                                                        /* escape backslash */
                   1340:                                                        if (i == '\\') {
                   1341:                                                                *bp++ = '\\';
                   1342:                                                                n++;
                   1343:                                                        }
                   1344:
1.12      millert  1345:                                                        *bp++ = i;
                   1346:                                                        n++;
                   1347:                                                }
                   1348:                                        }
1.11      millert  1349:                                } else
                   1350:                                        *bp++ = c;
1.23      millert  1351:                        } else if (c == '[' && *prestr == '.') {
                   1352:                                char collate_char;
                   1353:                                prestr++;
                   1354:                                collate_char = *prestr++;
                   1355:                                if (*prestr == '.' && prestr[1] == ']') {
                   1356:                                        prestr += 2;
                   1357:                                        /* Found it: map via locale TBD: for
                   1358:                                           now, simply return this char.  This
                   1359:                                           is sufficient to pass conformance
                   1360:                                           test awk.ex 156
                   1361:                                         */
                   1362:                                        if (*prestr == ']') {
                   1363:                                                prestr++;
                   1364:                                                rlxval = collate_char;
                   1365:                                                return CHAR;
                   1366:                                        }
                   1367:                                }
                   1368:                        } else if (c == '[' && *prestr == '=') {
                   1369:                                char equiv_char;
                   1370:                                prestr++;
                   1371:                                equiv_char = *prestr++;
                   1372:                                if (*prestr == '=' && prestr[1] == ']') {
                   1373:                                        prestr += 2;
                   1374:                                        /* Found it: map via locale TBD: for now
                   1375:                                           simply return this char. This is
                   1376:                                           sufficient to pass conformance test
                   1377:                                           awk.ex 156
                   1378:                                         */
                   1379:                                        if (*prestr == ']') {
                   1380:                                                prestr++;
                   1381:                                                rlxval = equiv_char;
                   1382:                                                return CHAR;
                   1383:                                        }
                   1384:                                }
1.1       tholo    1385:                        } else if (c == '\0') {
1.8       millert  1386:                                FATAL("nonterminated character class %.20s", lastre);
1.5       kstailey 1387:                        } else if (bp == buf) { /* 1st char is special */
                   1388:                                *bp++ = c;
1.1       tholo    1389:                        } else if (c == ']') {
1.5       kstailey 1390:                                *bp++ = 0;
1.10      millert  1391:                                rlxstr = (uschar *) tostring((char *) buf);
1.1       tholo    1392:                                if (cflag == 0)
                   1393:                                        return CCL;
                   1394:                                else
                   1395:                                        return NCCL;
                   1396:                        } else
1.5       kstailey 1397:                                *bp++ = c;
1.1       tholo    1398:                }
1.23      millert  1399:                break;
                   1400:        case '{':
                   1401:                if (isdigit(*(prestr))) {
                   1402:                        num = 0;        /* Process as a repetition */
                   1403:                        n = -1; m = -1;
1.29      millert  1404:                        commafound = false;
                   1405:                        digitfound = false;
1.23      millert  1406:                        startreptok = prestr-1;
                   1407:                        /* Remember start of previous atom here ? */
                   1408:                } else {                /* just a { char, not a repetition */
                   1409:                        rlxval = c;
                   1410:                        return CHAR;
                   1411:                 }
                   1412:                for (; ; ) {
                   1413:                        if ((c = *prestr++) == '}') {
                   1414:                                if (commafound) {
                   1415:                                        if (digitfound) { /* {n,m} */
                   1416:                                                m = num;
1.30      millert  1417:                                                if (m < n)
1.23      millert  1418:                                                        FATAL("illegal repetition expression: class %.20s",
                   1419:                                                                lastre);
1.30      millert  1420:                                                if (n == 0 && m == 1) {
1.23      millert  1421:                                                        return QUEST;
                   1422:                                                }
                   1423:                                        } else {        /* {n,} */
1.30      millert  1424:                                                if (n == 0)
                   1425:                                                        return STAR;
                   1426:                                                else if (n == 1)
                   1427:                                                        return PLUS;
1.23      millert  1428:                                        }
                   1429:                                } else {
                   1430:                                        if (digitfound) { /* {n} same as {n,n} */
                   1431:                                                n = num;
                   1432:                                                m = num;
                   1433:                                        } else {        /* {} */
                   1434:                                                FATAL("illegal repetition expression: class %.20s",
                   1435:                                                        lastre);
                   1436:                                        }
                   1437:                                }
                   1438:                                if (repeat(starttok, prestr-starttok, lastatom,
                   1439:                                           startreptok - lastatom, n, m) > 0) {
1.30      millert  1440:                                        if (n == 0 && m == 0) {
1.31      millert  1441:                                                return ZERO;
1.23      millert  1442:                                        }
                   1443:                                        /* must rescan input for next token */
                   1444:                                        goto rescan;
                   1445:                                }
                   1446:                                /* Failed to replace: eat up {...} characters
                   1447:                                   and treat like just PLUS */
                   1448:                                return PLUS;
                   1449:                        } else if (c == '\0') {
                   1450:                                FATAL("nonterminated character class %.20s",
                   1451:                                        lastre);
                   1452:                        } else if (isdigit(c)) {
                   1453:                                num = 10 * num + c - '0';
1.29      millert  1454:                                digitfound = true;
1.23      millert  1455:                        } else if (c == ',') {
                   1456:                                if (commafound)
                   1457:                                        FATAL("illegal repetition expression: class %.20s",
                   1458:                                                lastre);
                   1459:                                /* looking for {n,} or {n,m} */
1.29      millert  1460:                                commafound = true;
1.23      millert  1461:                                n = num;
1.29      millert  1462:                                digitfound = false; /* reset */
1.23      millert  1463:                                num = 0;
                   1464:                        } else {
                   1465:                                FATAL("illegal repetition expression: class %.20s",
                   1466:                                        lastre);
                   1467:                        }
                   1468:                }
                   1469:                break;
1.1       tholo    1470:        }
                   1471: }
                   1472:
                   1473: int cgoto(fa *f, int s, int c)
                   1474: {
1.27      millert  1475:        int *p, *q;
1.1       tholo    1476:        int i, j, k;
                   1477:
1.38      millert  1478:        /* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1.1       tholo    1479:        while (f->accept >= maxsetvec) {        /* guessing here! */
1.27      millert  1480:                resizesetvec(__func__);
1.1       tholo    1481:        }
                   1482:        for (i = 0; i <= f->accept; i++)
                   1483:                setvec[i] = 0;
                   1484:        setcnt = 0;
1.27      millert  1485:        resize_state(f, s);
1.1       tholo    1486:        /* compute positions of gototab[s,c] into setvec */
                   1487:        p = f->posns[s];
                   1488:        for (i = 1; i <= *p; i++) {
                   1489:                if ((k = f->re[p[i]].ltype) != FINAL) {
1.7       millert  1490:                        if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1       tholo    1491:                         || (k == DOT && c != 0 && c != HAT)
                   1492:                         || (k == ALL && c != 0)
1.15      millert  1493:                         || (k == EMPTYRE && c != 0)
1.38      millert  1494:                         || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
                   1495:                         || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1       tholo    1496:                                q = f->re[p[i]].lfollow;
                   1497:                                for (j = 1; j <= *q; j++) {
                   1498:                                        if (q[j] >= maxsetvec) {
1.27      millert  1499:                                                resizesetvec(__func__);
1.1       tholo    1500:                                        }
                   1501:                                        if (setvec[q[j]] == 0) {
                   1502:                                                setcnt++;
                   1503:                                                setvec[q[j]] = 1;
                   1504:                                        }
                   1505:                                }
                   1506:                        }
                   1507:                }
                   1508:        }
                   1509:        /* determine if setvec is a previous state */
                   1510:        tmpset[0] = setcnt;
                   1511:        j = 1;
                   1512:        for (i = f->accept; i >= 0; i--)
                   1513:                if (setvec[i]) {
                   1514:                        tmpset[j++] = i;
                   1515:                }
1.27      millert  1516:        resize_state(f, f->curstat > s ? f->curstat : s);
1.1       tholo    1517:        /* tmpset == previous state? */
                   1518:        for (i = 1; i <= f->curstat; i++) {
                   1519:                p = f->posns[i];
                   1520:                if ((k = tmpset[0]) != p[0])
                   1521:                        goto different;
                   1522:                for (j = 1; j <= k; j++)
                   1523:                        if (tmpset[j] != p[j])
                   1524:                                goto different;
                   1525:                /* setvec is state i */
1.30      millert  1526:                if (c != HAT)
1.38      millert  1527:                        set_gototab(f, s, c, i);
1.1       tholo    1528:                return i;
                   1529:          different:;
                   1530:        }
                   1531:
                   1532:        /* add tmpset to current set of states */
1.27      millert  1533:        ++(f->curstat);
                   1534:        resize_state(f, f->curstat);
1.49      millert  1535:        clear_gototab(f, f->curstat);
1.1       tholo    1536:        xfree(f->posns[f->curstat]);
1.27      millert  1537:        p = intalloc(setcnt + 1, __func__);
1.1       tholo    1538:
                   1539:        f->posns[f->curstat] = p;
1.30      millert  1540:        if (c != HAT)
1.38      millert  1541:                set_gototab(f, s, c, f->curstat);
1.1       tholo    1542:        for (i = 0; i <= setcnt; i++)
                   1543:                p[i] = tmpset[i];
                   1544:        if (setvec[f->accept])
                   1545:                f->out[f->curstat] = 1;
                   1546:        else
                   1547:                f->out[f->curstat] = 0;
                   1548:        return f->curstat;
                   1549: }
                   1550:
                   1551:
                   1552: void freefa(fa *f)     /* free a finite automaton */
                   1553: {
                   1554:        int i;
                   1555:
                   1556:        if (f == NULL)
                   1557:                return;
1.27      millert  1558:        for (i = 0; i < f->state_count; i++)
1.49      millert  1559:                xfree(f->gototab[i].entries);
                   1560:        xfree(f->gototab);
1.1       tholo    1561:        for (i = 0; i <= f->curstat; i++)
                   1562:                xfree(f->posns[i]);
                   1563:        for (i = 0; i <= f->accept; i++) {
                   1564:                xfree(f->re[i].lfollow);
                   1565:                if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30      millert  1566:                        xfree(f->re[i].lval.np);
1.1       tholo    1567:        }
                   1568:        xfree(f->restr);
1.27      millert  1569:        xfree(f->out);
                   1570:        xfree(f->posns);
                   1571:        xfree(f->gototab);
1.1       tholo    1572:        xfree(f);
                   1573: }