src/usr.bin/awk/b.c - annotate

Return to b.c CVS log
Up to [local] / src / usr.bin / awk
Annotation of src/usr.bin/awk/b.c, Revision 1.32

1.32    ! millert     1: /*     $OpenBSD: b.c,v 1.31 2020/06/10 21:05:02 millert Exp $  */
1.1       tholo       2: /****************************************************************
1.5       kstailey    3: Copyright (C) Lucent Technologies 1997
1.1       tholo       4: All Rights Reserved
                      5:
                      6: Permission to use, copy, modify, and distribute this software and
                      7: its documentation for any purpose and without fee is hereby
                      8: granted, provided that the above copyright notice appear in all
                      9: copies and that both that the copyright notice and this
                     10: permission notice and warranty disclaimer appear in supporting
1.5       kstailey   11: documentation, and that the name Lucent Technologies or any of
                     12: its entities not be used in advertising or publicity pertaining
                     13: to distribution of the software without specific, written prior
                     14: permission.
                     15:
                     16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
                     17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
                     18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
                     19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
                     21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
                     22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
                     23: THIS SOFTWARE.
1.1       tholo      24: ****************************************************************/
                     25:
1.15      millert    26: /* lasciate ogne speranza, voi ch'intrate. */
1.1       tholo      27:
                     28: #define        DEBUG
                     29:
                     30: #include <ctype.h>
1.23      millert    31: #include <limits.h>
1.1       tholo      32: #include <stdio.h>
                     33: #include <string.h>
                     34: #include <stdlib.h>
                     35: #include "awk.h"
1.5       kstailey   36: #include "ytab.h"
1.1       tholo      37:
                     38: #define MAXLIN 22
                     39:
1.7       millert    40: #define type(v)                (v)->nobj       /* badly overloaded here */
                     41: #define info(v)                (v)->ntype      /* badly overloaded here */
1.1       tholo      42: #define left(v)                (v)->narg[0]
                     43: #define right(v)       (v)->narg[1]
                     44: #define parent(v)      (v)->nnext
                     45:
                     46: #define LEAF   case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15      millert    47: #define ELEAF  case EMPTYRE:           /* empty string in regexp */
1.1       tholo      48: #define UNARY  case STAR: case PLUS: case QUEST:
                     49:
                     50: /* encoding in tree Nodes:
1.15      millert    51:        leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1       tholo      52:                left is index, right contains value or pointer to value
                     53:        unary (STAR, PLUS, QUEST): left is child, right is null
                     54:        binary (CAT, OR): left and right are children
                     55:        parent contains pointer to parent
                     56: */
                     57:
                     58:
                     59: int    *setvec;
                     60: int    *tmpset;
                     61: int    maxsetvec = 0;
                     62:
                     63: int    rtok;           /* next token in current re */
1.4       millert    64: int    rlxval;
1.28      millert    65: static const uschar    *rlxstr;
                     66: static const uschar    *prestr;        /* current position in current re */
                     67: static const uschar    *lastre;        /* origin of last re */
                     68: static const uschar    *lastatom;      /* origin of last Atom */
                     69: static const uschar    *starttok;
                     70: static const uschar    *basestr;       /* starts with original, replaced during
1.23      millert    71:                                   repetition processing */
1.28      millert    72: static const uschar    *firstbasestr;
1.1       tholo      73:
1.4       millert    74: static int setcnt;
                     75: static int poscnt;
1.1       tholo      76:
1.28      millert    77: const char     *patbeg;
1.1       tholo      78: int    patlen;
                     79:
1.27      millert    80: #define        NFA     128     /* cache this many dynamic fa's */
1.1       tholo      81: fa     *fatab[NFA];
                     82: int    nfatab  = 0;    /* entries in fatab */
                     83:
1.27      millert    84: static int *
                     85: intalloc(size_t n, const char *f)
                     86: {
                     87:        void *p = calloc(n, sizeof(int));
                     88:        if (p == NULL)
                     89:                overflo(f);
                     90:        return p;
                     91: }
                     92:
                     93: static void
                     94: allocsetvec(const char *f)
                     95: {
                     96:        maxsetvec = MAXLIN;
                     97:        setvec = reallocarray(setvec, maxsetvec, sizeof(*setvec));
                     98:        tmpset = reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
                     99:        if (setvec == NULL || tmpset == NULL)
                    100:                overflo(f);
                    101: }
                    102:
                    103: static void
                    104: resizesetvec(const char *f)
                    105: {
                    106:        setvec = reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
                    107:        tmpset = reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
                    108:        if (setvec == NULL || tmpset == NULL)
                    109:                overflo(f);
                    110:        maxsetvec *= 4;
                    111: }
                    112:
                    113: static void
                    114: resize_state(fa *f, int state)
                    115: {
                    116:        void *p;
                    117:        int i, new_count;
                    118:
                    119:        if (++state < f->state_count)
                    120:                return;
                    121:
                    122:        new_count = state + 10; /* needs to be tuned */
                    123:
                    124:        p = reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
                    125:        if (p == NULL)
                    126:                goto out;
                    127:        f->gototab = p;
                    128:
                    129:        p = reallocarray(f->out, new_count, sizeof(f->out[0]));
                    130:        if (p == NULL)
                    131:                goto out;
                    132:        f->out = p;
                    133:
                    134:        p = reallocarray(f->posns, new_count, sizeof(f->posns[0]));
                    135:        if (p == NULL)
                    136:                goto out;
                    137:        f->posns = p;
                    138:
                    139:        for (i = f->state_count; i < new_count; ++i) {
                    140:                f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
                    141:                if (f->gototab[i] == NULL)
                    142:                        goto out;
                    143:                f->out[i]  = 0;
                    144:                f->posns[i] = NULL;
                    145:        }
                    146:        f->state_count = new_count;
                    147:        return;
                    148: out:
                    149:        overflo(__func__);
                    150: }
                    151:
1.29      millert   152: fa *makedfa(const char *s, bool anchor)        /* returns dfa for reg expr s */
1.1       tholo     153: {
                    154:        int i, use, nuse;
                    155:        fa *pfa;
1.6       millert   156:        static int now = 1;
1.1       tholo     157:
1.25      millert   158:        if (setvec == NULL) {   /* first time through any RE */
1.27      millert   159:                allocsetvec(__func__);
1.1       tholo     160:        }
                    161:
1.29      millert   162:        if (compile_time != RUNNING)    /* a constant for sure */
1.1       tholo     163:                return mkdfa(s, anchor);
                    164:        for (i = 0; i < nfatab; i++)    /* is it there already? */
                    165:                if (fatab[i]->anchor == anchor
1.11      millert   166:                  && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6       millert   167:                        fatab[i]->use = now++;
1.1       tholo     168:                        return fatab[i];
1.10      millert   169:                }
1.1       tholo     170:        pfa = mkdfa(s, anchor);
                    171:        if (nfatab < NFA) {     /* room for another */
                    172:                fatab[nfatab] = pfa;
1.6       millert   173:                fatab[nfatab]->use = now++;
1.1       tholo     174:                nfatab++;
                    175:                return pfa;
                    176:        }
                    177:        use = fatab[0]->use;    /* replace least-recently used */
                    178:        nuse = 0;
                    179:        for (i = 1; i < nfatab; i++)
                    180:                if (fatab[i]->use < use) {
                    181:                        use = fatab[i]->use;
                    182:                        nuse = i;
                    183:                }
                    184:        freefa(fatab[nuse]);
                    185:        fatab[nuse] = pfa;
1.6       millert   186:        pfa->use = now++;
1.1       tholo     187:        return pfa;
                    188: }
                    189:
1.29      millert   190: fa *mkdfa(const char *s, bool anchor)  /* does the real work of making a dfa */
1.30      millert   191:                                /* anchor = true for anchored matches, else false */
1.1       tholo     192: {
                    193:        Node *p, *p1;
                    194:        fa *f;
                    195:
1.28      millert   196:        firstbasestr = (const uschar *) s;
1.23      millert   197:        basestr = firstbasestr;
1.1       tholo     198:        p = reparse(s);
                    199:        p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
                    200:                /* put ALL STAR in front of reg.  exp. */
                    201:        p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
                    202:                /* put FINAL after reg.  exp. */
                    203:
                    204:        poscnt = 0;
                    205:        penter(p1);     /* enter parent pointers and leaf indices */
1.28      millert   206:        if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27      millert   207:                overflo(__func__);
1.1       tholo     208:        f->accept = poscnt-1;   /* penter has computed number of positions in re */
                    209:        cfoll(f, p1);   /* set up follow sets */
                    210:        freetr(p1);
1.27      millert   211:        resize_state(f, 1);
                    212:        f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
                    213:        f->posns[1] = intalloc(1, __func__);
1.1       tholo     214:        *f->posns[1] = 0;
                    215:        f->initstat = makeinit(f, anchor);
                    216:        f->anchor = anchor;
1.10      millert   217:        f->restr = (uschar *) tostring(s);
1.23      millert   218:        if (firstbasestr != basestr) {
                    219:                if (basestr)
                    220:                        xfree(basestr);
                    221:        }
1.1       tholo     222:        return f;
                    223: }
                    224:
1.29      millert   225: int makeinit(fa *f, bool anchor)
1.1       tholo     226: {
                    227:        int i, k;
                    228:
                    229:        f->curstat = 2;
                    230:        f->out[2] = 0;
                    231:        k = *(f->re[0].lfollow);
1.25      millert   232:        xfree(f->posns[2]);
1.27      millert   233:        f->posns[2] = intalloc(k + 1,  __func__);
1.30      millert   234:        for (i = 0; i <= k; i++) {
1.1       tholo     235:                (f->posns[2])[i] = (f->re[0].lfollow)[i];
                    236:        }
                    237:        if ((f->posns[2])[1] == f->accept)
                    238:                f->out[2] = 1;
1.30      millert   239:        for (i = 0; i < NCHARS; i++)
1.1       tholo     240:                f->gototab[2][i] = 0;
                    241:        f->curstat = cgoto(f, 2, HAT);
                    242:        if (anchor) {
                    243:                *f->posns[2] = k-1;     /* leave out position 0 */
1.30      millert   244:                for (i = 0; i < k; i++) {
1.1       tholo     245:                        (f->posns[0])[i] = (f->posns[2])[i];
                    246:                }
                    247:
                    248:                f->out[0] = f->out[2];
                    249:                if (f->curstat != 2)
                    250:                        --(*f->posns[f->curstat]);
                    251:        }
                    252:        return f->curstat;
                    253: }
                    254:
                    255: void penter(Node *p)   /* set up parent pointers and leaf indices */
                    256: {
                    257:        switch (type(p)) {
1.15      millert   258:        ELEAF
1.1       tholo     259:        LEAF
1.7       millert   260:                info(p) = poscnt;
1.1       tholo     261:                poscnt++;
                    262:                break;
                    263:        UNARY
                    264:                penter(left(p));
                    265:                parent(left(p)) = p;
                    266:                break;
                    267:        case CAT:
                    268:        case OR:
                    269:                penter(left(p));
                    270:                penter(right(p));
                    271:                parent(left(p)) = p;
                    272:                parent(right(p)) = p;
                    273:                break;
1.31      millert   274:        case ZERO:
                    275:                break;
1.1       tholo     276:        default:        /* can't happen */
1.8       millert   277:                FATAL("can't happen: unknown type %d in penter", type(p));
1.1       tholo     278:                break;
                    279:        }
                    280: }
                    281:
                    282: void freetr(Node *p)   /* free parse tree */
                    283: {
                    284:        switch (type(p)) {
1.15      millert   285:        ELEAF
1.1       tholo     286:        LEAF
                    287:                xfree(p);
                    288:                break;
                    289:        UNARY
1.31      millert   290:        case ZERO:
1.1       tholo     291:                freetr(left(p));
                    292:                xfree(p);
                    293:                break;
                    294:        case CAT:
                    295:        case OR:
                    296:                freetr(left(p));
                    297:                freetr(right(p));
                    298:                xfree(p);
                    299:                break;
                    300:        default:        /* can't happen */
1.8       millert   301:                FATAL("can't happen: unknown type %d in freetr", type(p));
1.1       tholo     302:                break;
                    303:        }
                    304: }
                    305:
                    306: /* in the parsing of regular expressions, metacharacters like . have */
                    307: /* to be seen literally;  \056 is not a metacharacter. */
                    308:
1.28      millert   309: int hexstr(const uschar **pp)  /* find and eval hex string at pp, return new p */
1.2       millert   310: {                      /* only pick up one 8-bit byte (2 chars) */
1.28      millert   311:        const uschar *p;
1.1       tholo     312:        int n = 0;
1.2       millert   313:        int i;
1.1       tholo     314:
1.28      millert   315:        for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1       tholo     316:                if (isdigit(*p))
                    317:                        n = 16 * n + *p - '0';
                    318:                else if (*p >= 'a' && *p <= 'f')
                    319:                        n = 16 * n + *p - 'a' + 10;
                    320:                else if (*p >= 'A' && *p <= 'F')
                    321:                        n = 16 * n + *p - 'A' + 10;
                    322:        }
1.28      millert   323:        *pp = p;
1.1       tholo     324:        return n;
                    325: }
                    326:
1.2       millert   327: #define isoctdigit(c) ((c) >= '0' && (c) <= '7')       /* multiple use of arg */
1.1       tholo     328:
1.28      millert   329: int quoted(const uschar **pp)  /* pick up next thing after a \\ */
1.1       tholo     330:                        /* and increment *pp */
                    331: {
1.28      millert   332:        const uschar *p = *pp;
1.1       tholo     333:        int c;
                    334:
                    335:        if ((c = *p++) == 't')
                    336:                c = '\t';
                    337:        else if (c == 'n')
                    338:                c = '\n';
                    339:        else if (c == 'f')
                    340:                c = '\f';
                    341:        else if (c == 'r')
                    342:                c = '\r';
                    343:        else if (c == 'b')
                    344:                c = '\b';
1.25      millert   345:        else if (c == 'v')
                    346:                c = '\v';
1.20      millert   347:        else if (c == 'a')
1.25      millert   348:                c = '\a';
1.1       tholo     349:        else if (c == '\\')
                    350:                c = '\\';
                    351:        else if (c == 'x') {    /* hexadecimal goo follows */
                    352:                c = hexstr(&p); /* this adds a null if number is invalid */
                    353:        } else if (isoctdigit(c)) {     /* \d \dd \ddd */
                    354:                int n = c - '0';
                    355:                if (isoctdigit(*p)) {
                    356:                        n = 8 * n + *p++ - '0';
                    357:                        if (isoctdigit(*p))
                    358:                                n = 8 * n + *p++ - '0';
                    359:                }
                    360:                c = n;
                    361:        } /* else */
                    362:                /* c = c; */
                    363:        *pp = p;
                    364:        return c;
                    365: }
                    366:
1.11      millert   367: char *cclenter(const char *argp)       /* add a character class */
1.1       tholo     368: {
                    369:        int i, c, c2;
1.28      millert   370:        const uschar *op, *p = (const uschar *) argp;
                    371:        uschar *bp;
1.25      millert   372:        static uschar *buf = NULL;
1.5       kstailey  373:        static int bufsz = 100;
1.1       tholo     374:
                    375:        op = p;
1.27      millert   376:        if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.8       millert   377:                FATAL("out of space for character class [%.10s...] 1", p);
1.5       kstailey  378:        bp = buf;
                    379:        for (i = 0; (c = *p++) != 0; ) {
1.1       tholo     380:                if (c == '\\') {
1.17      millert   381:                        c = quoted(&p);
1.5       kstailey  382:                } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1       tholo     383:                        if (*p != 0) {
1.5       kstailey  384:                                c = bp[-1];
1.1       tholo     385:                                c2 = *p++;
                    386:                                if (c2 == '\\')
1.17      millert   387:                                        c2 = quoted(&p);
1.1       tholo     388:                                if (c > c2) {   /* empty; ignore */
1.5       kstailey  389:                                        bp--;
1.1       tholo     390:                                        i--;
                    391:                                        continue;
                    392:                                }
                    393:                                while (c < c2) {
1.15      millert   394:                                        if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
1.8       millert   395:                                                FATAL("out of space for character class [%.10s...] 2", p);
1.5       kstailey  396:                                        *bp++ = ++c;
1.1       tholo     397:                                        i++;
                    398:                                }
                    399:                                continue;
                    400:                        }
                    401:                }
1.15      millert   402:                if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
1.8       millert   403:                        FATAL("out of space for character class [%.10s...] 3", p);
1.5       kstailey  404:                *bp++ = c;
1.1       tholo     405:                i++;
                    406:        }
1.5       kstailey  407:        *bp = 0;
1.32    ! millert   408:        DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
1.1       tholo     409:        xfree(op);
1.10      millert   410:        return (char *) tostring((char *) buf);
1.1       tholo     411: }
                    412:
1.11      millert   413: void overflo(const char *s)
1.1       tholo     414: {
1.27      millert   415:        FATAL("regular expression too big: out of space in %.30s...", s);
1.1       tholo     416: }
                    417:
                    418: void cfoll(fa *f, Node *v)     /* enter follow set of each leaf of vertex v into lfollow[leaf] */
                    419: {
                    420:        int i;
1.4       millert   421:        int *p;
1.1       tholo     422:
                    423:        switch (type(v)) {
1.15      millert   424:        ELEAF
1.1       tholo     425:        LEAF
1.7       millert   426:                f->re[info(v)].ltype = type(v);
                    427:                f->re[info(v)].lval.np = right(v);
1.1       tholo     428:                while (f->accept >= maxsetvec) {        /* guessing here! */
1.27      millert   429:                        resizesetvec(__func__);
1.1       tholo     430:                }
                    431:                for (i = 0; i <= f->accept; i++)
                    432:                        setvec[i] = 0;
                    433:                setcnt = 0;
                    434:                follow(v);      /* computes setvec and setcnt */
1.27      millert   435:                p = intalloc(setcnt + 1, __func__);
1.7       millert   436:                f->re[info(v)].lfollow = p;
1.1       tholo     437:                *p = setcnt;
                    438:                for (i = f->accept; i >= 0; i--)
                    439:                        if (setvec[i] == 1)
                    440:                                *++p = i;
                    441:                break;
                    442:        UNARY
                    443:                cfoll(f,left(v));
                    444:                break;
                    445:        case CAT:
                    446:        case OR:
                    447:                cfoll(f,left(v));
                    448:                cfoll(f,right(v));
                    449:                break;
1.31      millert   450:        case ZERO:
                    451:                break;
1.1       tholo     452:        default:        /* can't happen */
1.8       millert   453:                FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1       tholo     454:        }
                    455: }
                    456:
                    457: int first(Node *p)     /* collects initially active leaves of p into setvec */
1.15      millert   458:                        /* returns 0 if p matches empty string */
1.1       tholo     459: {
1.4       millert   460:        int b, lp;
1.1       tholo     461:
                    462:        switch (type(p)) {
1.15      millert   463:        ELEAF
1.1       tholo     464:        LEAF
1.7       millert   465:                lp = info(p);   /* look for high-water mark of subscripts */
1.1       tholo     466:                while (setcnt >= maxsetvec || lp >= maxsetvec) {        /* guessing here! */
1.27      millert   467:                        resizesetvec(__func__);
1.1       tholo     468:                }
1.15      millert   469:                if (type(p) == EMPTYRE) {
                    470:                        setvec[lp] = 0;
                    471:                        return(0);
                    472:                }
1.1       tholo     473:                if (setvec[lp] != 1) {
                    474:                        setvec[lp] = 1;
                    475:                        setcnt++;
                    476:                }
                    477:                if (type(p) == CCL && (*(char *) right(p)) == '\0')
                    478:                        return(0);              /* empty CCL */
1.30      millert   479:                return(1);
1.1       tholo     480:        case PLUS:
1.30      millert   481:                if (first(left(p)) == 0)
                    482:                        return(0);
1.1       tholo     483:                return(1);
                    484:        case STAR:
                    485:        case QUEST:
                    486:                first(left(p));
                    487:                return(0);
                    488:        case CAT:
                    489:                if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
                    490:                return(1);
                    491:        case OR:
                    492:                b = first(right(p));
                    493:                if (first(left(p)) == 0 || b == 0) return(0);
                    494:                return(1);
1.31      millert   495:        case ZERO:
                    496:                return 0;
1.1       tholo     497:        }
1.8       millert   498:        FATAL("can't happen: unknown type %d in first", type(p));       /* can't happen */
1.1       tholo     499:        return(-1);
                    500: }
                    501:
                    502: void follow(Node *v)   /* collects leaves that can follow v into setvec */
                    503: {
                    504:        Node *p;
                    505:
                    506:        if (type(v) == FINAL)
                    507:                return;
                    508:        p = parent(v);
                    509:        switch (type(p)) {
                    510:        case STAR:
                    511:        case PLUS:
                    512:                first(v);
                    513:                follow(p);
                    514:                return;
                    515:
                    516:        case OR:
                    517:        case QUEST:
                    518:                follow(p);
                    519:                return;
                    520:
                    521:        case CAT:
                    522:                if (v == left(p)) {     /* v is left child of p */
                    523:                        if (first(right(p)) == 0) {
                    524:                                follow(p);
                    525:                                return;
                    526:                        }
                    527:                } else          /* v is right child */
                    528:                        follow(p);
                    529:                return;
                    530:        }
                    531: }
                    532:
1.11      millert   533: int member(int c, const char *sarg)    /* is c in s? */
1.1       tholo     534: {
1.28      millert   535:        const uschar *s = (const uschar *) sarg;
1.10      millert   536:
1.1       tholo     537:        while (*s)
                    538:                if (c == *s++)
                    539:                        return(1);
                    540:        return(0);
                    541: }
                    542:
1.11      millert   543: int match(fa *f, const char *p0)       /* shortest match ? */
1.1       tholo     544: {
                    545:        int s, ns;
1.28      millert   546:        const uschar *p = (const uschar *) p0;
1.1       tholo     547:
1.27      millert   548:        s = f->initstat;
                    549:        assert (s < f->state_count);
                    550:
1.1       tholo     551:        if (f->out[s])
                    552:                return(1);
                    553:        do {
1.15      millert   554:                /* assert(*p < NCHARS); */
1.1       tholo     555:                if ((ns = f->gototab[s][*p]) != 0)
                    556:                        s = ns;
                    557:                else
                    558:                        s = cgoto(f, s, *p);
                    559:                if (f->out[s])
                    560:                        return(1);
                    561:        } while (*p++ != 0);
                    562:        return(0);
                    563: }
                    564:
1.11      millert   565: int pmatch(fa *f, const char *p0)      /* longest match, for sub */
1.1       tholo     566: {
                    567:        int s, ns;
1.28      millert   568:        const uschar *p = (const uschar *) p0;
                    569:        const uschar *q;
1.1       tholo     570:
1.27      millert   571:        s = f->initstat;
                    572:        assert(s < f->state_count);
                    573:
1.28      millert   574:        patbeg = (const char *)p;
1.1       tholo     575:        patlen = -1;
                    576:        do {
                    577:                q = p;
                    578:                do {
                    579:                        if (f->out[s])          /* final state */
                    580:                                patlen = q-p;
1.15      millert   581:                        /* assert(*q < NCHARS); */
1.1       tholo     582:                        if ((ns = f->gototab[s][*q]) != 0)
                    583:                                s = ns;
                    584:                        else
                    585:                                s = cgoto(f, s, *q);
1.27      millert   586:
                    587:                        assert(s < f->state_count);
                    588:
1.9       deraadt   589:                        if (s == 1) {   /* no transition */
1.1       tholo     590:                                if (patlen >= 0) {
1.28      millert   591:                                        patbeg = (const char *) p;
1.1       tholo     592:                                        return(1);
                    593:                                }
                    594:                                else
                    595:                                        goto nextin;    /* no match */
1.9       deraadt   596:                        }
1.1       tholo     597:                } while (*q++ != 0);
                    598:                if (f->out[s])
                    599:                        patlen = q-p-1; /* don't count $ */
                    600:                if (patlen >= 0) {
1.28      millert   601:                        patbeg = (const char *) p;
1.1       tholo     602:                        return(1);
                    603:                }
                    604:        nextin:
                    605:                s = 2;
1.27      millert   606:        } while (*p++);
1.1       tholo     607:        return (0);
                    608: }
                    609:
1.11      millert   610: int nematch(fa *f, const char *p0)     /* non-empty match, for sub */
1.1       tholo     611: {
                    612:        int s, ns;
1.28      millert   613:        const uschar *p = (const uschar *) p0;
                    614:        const uschar *q;
1.1       tholo     615:
1.27      millert   616:        s = f->initstat;
                    617:        assert(s < f->state_count);
                    618:
1.28      millert   619:        patbeg = (const char *)p;
1.1       tholo     620:        patlen = -1;
                    621:        while (*p) {
                    622:                q = p;
                    623:                do {
                    624:                        if (f->out[s])          /* final state */
                    625:                                patlen = q-p;
1.15      millert   626:                        /* assert(*q < NCHARS); */
1.1       tholo     627:                        if ((ns = f->gototab[s][*q]) != 0)
                    628:                                s = ns;
                    629:                        else
                    630:                                s = cgoto(f, s, *q);
1.9       deraadt   631:                        if (s == 1) {   /* no transition */
1.1       tholo     632:                                if (patlen > 0) {
1.28      millert   633:                                        patbeg = (const char *) p;
1.1       tholo     634:                                        return(1);
                    635:                                } else
                    636:                                        goto nnextin;   /* no nonempty match */
1.9       deraadt   637:                        }
1.1       tholo     638:                } while (*q++ != 0);
                    639:                if (f->out[s])
                    640:                        patlen = q-p-1; /* don't count $ */
                    641:                if (patlen > 0 ) {
1.28      millert   642:                        patbeg = (const char *) p;
1.1       tholo     643:                        return(1);
                    644:                }
                    645:        nnextin:
                    646:                s = 2;
                    647:                p++;
                    648:        }
                    649:        return (0);
                    650: }
                    651:
1.26      millert   652:
                    653: /*
                    654:  * NAME
                    655:  *     fnematch
                    656:  *
                    657:  * DESCRIPTION
                    658:  *     A stream-fed version of nematch which transfers characters to a
                    659:  *     null-terminated buffer. All characters up to and including the last
                    660:  *     character of the matching text or EOF are placed in the buffer. If
                    661:  *     a match is found, patbeg and patlen are set appropriately.
                    662:  *
                    663:  * RETURN VALUES
1.29      millert   664:  *     false    No match found.
                    665:  *     true     Match found.
1.26      millert   666:  */
                    667:
1.29      millert   668: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26      millert   669: {
                    670:        char *buf = *pbuf;
                    671:        int bufsize = *pbufsize;
                    672:        int c, i, j, k, ns, s;
                    673:
                    674:        s = pfa->initstat;
                    675:        patlen = 0;
                    676:
                    677:        /*
                    678:         * All indices relative to buf.
                    679:         * i <= j <= k <= bufsize
                    680:         *
                    681:         * i: origin of active substring
                    682:         * j: current character
                    683:         * k: destination of next getc()
                    684:         */
                    685:        i = -1, k = 0;
                    686:         do {
                    687:                j = i++;
                    688:                do {
                    689:                        if (++j == k) {
                    690:                                if (k == bufsize)
                    691:                                        if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
                    692:                                                FATAL("stream '%.30s...' too long", buf);
                    693:                                buf[k++] = (c = getc(f)) != EOF ? c : 0;
                    694:                        }
                    695:                        c = buf[j];
                    696:                        /* assert(c < NCHARS); */
                    697:
                    698:                        if ((ns = pfa->gototab[s][c]) != 0)
                    699:                                s = ns;
                    700:                        else
                    701:                                s = cgoto(pfa, s, c);
                    702:
                    703:                        if (pfa->out[s]) {      /* final state */
                    704:                                patlen = j - i + 1;
                    705:                                if (c == 0)     /* don't count $ */
                    706:                                        patlen--;
                    707:                        }
                    708:                } while (buf[j] && s != 1);
                    709:                s = 2;
                    710:        } while (buf[i] && !patlen);
                    711:
                    712:        /* adjbuf() may have relocated a resized buffer. Inform the world. */
                    713:        *pbuf = buf;
                    714:        *pbufsize = bufsize;
                    715:
                    716:        if (patlen) {
                    717:                patbeg = buf + i;
                    718:                /*
                    719:                 * Under no circumstances is the last character fed to
                    720:                 * the automaton part of the match. It is EOF's nullbyte,
                    721:                 * or it sent the automaton into a state with no further
                    722:                 * transitions available (s==1), or both. Room for a
                    723:                 * terminating nullbyte is guaranteed.
                    724:                 *
                    725:                 * ungetc any chars after the end of matching text
                    726:                 * (except for EOF's nullbyte, if present) and null
                    727:                 * terminate the buffer.
                    728:                 */
                    729:                do
                    730:                        if (buf[--k] && ungetc(buf[k], f) == EOF)
                    731:                                FATAL("unable to ungetc '%c'", buf[k]);
                    732:                while (k > i + patlen);
1.30      millert   733:                buf[k] = '\0';
1.29      millert   734:                return true;
1.26      millert   735:        }
                    736:        else
1.29      millert   737:                return false;
1.26      millert   738: }
                    739:
1.11      millert   740: Node *reparse(const char *p)   /* parses regular expression pointed to by p */
1.1       tholo     741: {                      /* uses relex() to scan regular expression */
                    742:        Node *np;
                    743:
1.32    ! millert   744:        DPRINTF("reparse <%s>\n", p);
1.28      millert   745:        lastre = prestr = (const uschar *) p;   /* prestr points to string to be parsed */
1.1       tholo     746:        rtok = relex();
1.11      millert   747:        /* GNU compatibility: an empty regexp matches anything */
1.15      millert   748:        if (rtok == '\0') {
1.11      millert   749:                /* FATAL("empty regular expression"); previous */
1.15      millert   750:                return(op2(EMPTYRE, NIL, NIL));
                    751:        }
1.1       tholo     752:        np = regexp();
                    753:        if (rtok != '\0')
1.8       millert   754:                FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1       tholo     755:        return(np);
                    756: }
                    757:
                    758: Node *regexp(void)     /* top-level parse of reg expr */
                    759: {
                    760:        return (alt(concat(primary())));
                    761: }
                    762:
                    763: Node *primary(void)
                    764: {
                    765:        Node *np;
1.23      millert   766:        int savelastatom;
1.1       tholo     767:
                    768:        switch (rtok) {
                    769:        case CHAR:
1.23      millert   770:                lastatom = starttok;
1.7       millert   771:                np = op2(CHAR, NIL, itonp(rlxval));
1.1       tholo     772:                rtok = relex();
                    773:                return (unary(np));
                    774:        case ALL:
                    775:                rtok = relex();
                    776:                return (unary(op2(ALL, NIL, NIL)));
1.15      millert   777:        case EMPTYRE:
                    778:                rtok = relex();
1.23      millert   779:                return (unary(op2(EMPTYRE, NIL, NIL)));
1.1       tholo     780:        case DOT:
1.23      millert   781:                lastatom = starttok;
1.1       tholo     782:                rtok = relex();
                    783:                return (unary(op2(DOT, NIL, NIL)));
                    784:        case CCL:
1.28      millert   785:                np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23      millert   786:                lastatom = starttok;
1.1       tholo     787:                rtok = relex();
                    788:                return (unary(np));
                    789:        case NCCL:
1.28      millert   790:                np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23      millert   791:                lastatom = starttok;
1.1       tholo     792:                rtok = relex();
                    793:                return (unary(np));
                    794:        case '^':
                    795:                rtok = relex();
1.7       millert   796:                return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1       tholo     797:        case '$':
                    798:                rtok = relex();
                    799:                return (unary(op2(CHAR, NIL, NIL)));
                    800:        case '(':
1.23      millert   801:                lastatom = starttok;
                    802:                savelastatom = starttok - basestr; /* Retain over recursion */
1.1       tholo     803:                rtok = relex();
                    804:                if (rtok == ')') {      /* special pleading for () */
                    805:                        rtok = relex();
                    806:                        return unary(op2(CCL, NIL, (Node *) tostring("")));
                    807:                }
                    808:                np = regexp();
                    809:                if (rtok == ')') {
1.23      millert   810:                        lastatom = basestr + savelastatom; /* Restore */
1.1       tholo     811:                        rtok = relex();
                    812:                        return (unary(np));
                    813:                }
                    814:                else
1.8       millert   815:                        FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1       tholo     816:        default:
1.8       millert   817:                FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1       tholo     818:        }
                    819:        return 0;       /*NOTREACHED*/
                    820: }
                    821:
                    822: Node *concat(Node *np)
                    823: {
                    824:        switch (rtok) {
1.23      millert   825:        case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1       tholo     826:                return (concat(op2(CAT, np, primary())));
1.23      millert   827:        case EMPTYRE:
                    828:                rtok = relex();
                    829:                return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
                    830:                                primary())));
1.1       tholo     831:        }
                    832:        return (np);
                    833: }
                    834:
                    835: Node *alt(Node *np)
                    836: {
                    837:        if (rtok == OR) {
                    838:                rtok = relex();
                    839:                return (alt(op2(OR, np, concat(primary()))));
                    840:        }
                    841:        return (np);
                    842: }
                    843:
                    844: Node *unary(Node *np)
                    845: {
                    846:        switch (rtok) {
                    847:        case STAR:
                    848:                rtok = relex();
                    849:                return (unary(op2(STAR, np, NIL)));
                    850:        case PLUS:
                    851:                rtok = relex();
                    852:                return (unary(op2(PLUS, np, NIL)));
                    853:        case QUEST:
                    854:                rtok = relex();
                    855:                return (unary(op2(QUEST, np, NIL)));
1.31      millert   856:        case ZERO:
                    857:                rtok = relex();
                    858:                return (unary(op2(ZERO, np, NIL)));
1.1       tholo     859:        default:
                    860:                return (np);
                    861:        }
                    862: }
                    863:
1.11      millert   864: /*
                    865:  * Character class definitions conformant to the POSIX locale as
                    866:  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
                    867:  * and operating character sets are both ASCII (ISO646) or supersets
                    868:  * thereof.
                    869:  *
                    870:  * Note that to avoid overflowing the temporary buffer used in
                    871:  * relex(), the expanded character class (prior to range expansion)
                    872:  * must be less than twice the size of their full name.
                    873:  */
1.12      millert   874:
                    875: /* Because isblank doesn't show up in any of the header files on any
                    876:  * system i use, it's defined here.  if some other locale has a richer
                    877:  * definition of "blank", define HAS_ISBLANK and provide your own
                    878:  * version.
                    879:  * the parentheses here are an attempt to find a path through the maze
                    880:  * of macro definition and/or function and/or version provided.  thanks
                    881:  * to nelson beebe for the suggestion; let's see if it works everywhere.
                    882:  */
                    883:
                    884: #ifndef HAS_ISBLANK
                    885:
1.17      millert   886: int (xisblank)(int c)
1.12      millert   887: {
                    888:        return c==' ' || c=='\t';
                    889: }
                    890:
                    891: #endif
                    892:
1.31      millert   893: static const struct charclass {
1.11      millert   894:        const char *cc_name;
                    895:        int cc_namelen;
1.12      millert   896:        int (*cc_func)(int);
1.11      millert   897: } charclasses[] = {
1.12      millert   898:        { "alnum",      5,      isalnum },
                    899:        { "alpha",      5,      isalpha },
1.17      millert   900: #ifndef HAS_ISBLANK
1.21      millert   901:        { "blank",      5,      xisblank },
1.17      millert   902: #else
1.12      millert   903:        { "blank",      5,      isblank },
1.17      millert   904: #endif
1.12      millert   905:        { "cntrl",      5,      iscntrl },
                    906:        { "digit",      5,      isdigit },
                    907:        { "graph",      5,      isgraph },
                    908:        { "lower",      5,      islower },
                    909:        { "print",      5,      isprint },
                    910:        { "punct",      5,      ispunct },
                    911:        { "space",      5,      isspace },
                    912:        { "upper",      5,      isupper },
                    913:        { "xdigit",     6,      isxdigit },
1.11      millert   914:        { NULL,         0,      NULL },
                    915: };
                    916:
1.23      millert   917: #define REPEAT_SIMPLE          0
                    918: #define REPEAT_PLUS_APPENDED   1
                    919: #define REPEAT_WITH_Q          2
                    920: #define REPEAT_ZERO            3
                    921:
                    922: static int
                    923: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
                    924:               int atomlen, int firstnum, int secondnum, int special_case)
                    925: {
                    926:        int i, j;
1.26      millert   927:        uschar *buf = NULL;
1.23      millert   928:        int ret = 1;
1.31      millert   929:        int init_q = (firstnum == 0);           /* first added char will be ? */
1.23      millert   930:        int n_q_reps = secondnum-firstnum;      /* m>n, so reduce until {1,m-n} left  */
                    931:        int prefix_length = reptok - basestr;   /* prefix includes first rep    */
1.28      millert   932:        int suffix_length = strlen((const char *) reptok) - reptoklen;  /* string after rep specifier   */
1.23      millert   933:        int size = prefix_length +  suffix_length;
                    934:
                    935:        if (firstnum > 1) {     /* add room for reps 2 through firstnum */
                    936:                size += atomlen*(firstnum-1);
                    937:        }
                    938:
                    939:        /* Adjust size of buffer for special cases */
                    940:        if (special_case == REPEAT_PLUS_APPENDED) {
                    941:                size++;         /* for the final + */
                    942:        } else if (special_case == REPEAT_WITH_Q) {
                    943:                size += init_q + (atomlen+1)* n_q_reps;
                    944:        } else if (special_case == REPEAT_ZERO) {
                    945:                size += 2;      /* just a null ERE: () */
                    946:        }
1.27      millert   947:        if ((buf = malloc(size + 1)) == NULL)
1.23      millert   948:                FATAL("out of space in reg expr %.10s..", lastre);
                    949:        memcpy(buf, basestr, prefix_length);    /* copy prefix  */
                    950:        j = prefix_length;
                    951:        if (special_case == REPEAT_ZERO) {
                    952:                j -= atomlen;
                    953:                buf[j++] = '(';
                    954:                buf[j++] = ')';
                    955:        }
1.30      millert   956:        for (i = 1; i < firstnum; i++) {        /* copy x reps  */
1.23      millert   957:                memcpy(&buf[j], atom, atomlen);
                    958:                j += atomlen;
                    959:        }
                    960:        if (special_case == REPEAT_PLUS_APPENDED) {
                    961:                buf[j++] = '+';
                    962:        } else if (special_case == REPEAT_WITH_Q) {
1.29      millert   963:                if (init_q)
                    964:                        buf[j++] = '?';
1.30      millert   965:                for (i = init_q; i < n_q_reps; i++) {   /* copy x? reps */
1.23      millert   966:                        memcpy(&buf[j], atom, atomlen);
                    967:                        j += atomlen;
                    968:                        buf[j++] = '?';
                    969:                }
                    970:        }
                    971:        memcpy(&buf[j], reptok+reptoklen, suffix_length);
                    972:        if (special_case == REPEAT_ZERO) {
                    973:                buf[j+suffix_length] = '\0';
                    974:        } else {
                    975:                buf[size] = '\0';
                    976:        }
                    977:        /* free old basestr */
                    978:        if (firstbasestr != basestr) {
                    979:                if (basestr)
                    980:                        xfree(basestr);
                    981:        }
                    982:        basestr = buf;
                    983:        prestr  = buf + prefix_length;
                    984:        if (special_case == REPEAT_ZERO) {
                    985:                prestr  -= atomlen;
                    986:                ret++;
                    987:        }
                    988:        return ret;
                    989: }
                    990:
                    991: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
                    992:                  int atomlen, int firstnum, int secondnum)
                    993: {
                    994:        /*
                    995:           In general, the repetition specifier or "bound" is replaced here
                    996:           by an equivalent ERE string, repeating the immediately previous atom
                    997:           and appending ? and + as needed. Note that the first copy of the
                    998:           atom is left in place, except in the special_case of a zero-repeat
                    999:           (i.e., {0}).
                   1000:         */
                   1001:        if (secondnum < 0) {    /* means {n,} -> repeat n-1 times followed by PLUS */
                   1002:                if (firstnum < 2) {
                   1003:                        /* 0 or 1: should be handled before you get here */
                   1004:                        FATAL("internal error");
                   1005:                } else {
                   1006:                        return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1007:                                firstnum, secondnum, REPEAT_PLUS_APPENDED);
                   1008:                }
                   1009:        } else if (firstnum == secondnum) {     /* {n} or {n,n} -> simply repeat n-1 times */
                   1010:                if (firstnum == 0) {    /* {0} or {0,0} */
                   1011:                        /* This case is unusual because the resulting
                   1012:                           replacement string might actually be SMALLER than
                   1013:                           the original ERE */
                   1014:                        return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1015:                                        firstnum, secondnum, REPEAT_ZERO);
                   1016:                } else {                /* (firstnum >= 1) */
                   1017:                        return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1018:                                        firstnum, secondnum, REPEAT_SIMPLE);
                   1019:                }
                   1020:        } else if (firstnum < secondnum) {      /* {n,m} -> repeat n-1 times then alternate  */
                   1021:                /*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?   */
                   1022:                return replace_repeat(reptok, reptoklen, atom, atomlen,
                   1023:                                        firstnum, secondnum, REPEAT_WITH_Q);
                   1024:        } else {        /* Error - shouldn't be here (n>m) */
                   1025:                FATAL("internal error");
                   1026:        }
                   1027:        return 0;
                   1028: }
1.11      millert  1029:
1.1       tholo    1030: int relex(void)                /* lexical analyzer for reparse */
                   1031: {
1.5       kstailey 1032:        int c, n;
1.1       tholo    1033:        int cflag;
1.25      millert  1034:        static uschar *buf = NULL;
1.5       kstailey 1035:        static int bufsz = 100;
1.10      millert  1036:        uschar *bp;
1.31      millert  1037:        const struct charclass *cc;
1.12      millert  1038:        int i;
1.29      millert  1039:        int num, m;
                   1040:        bool commafound, digitfound;
1.23      millert  1041:        const uschar *startreptok;
1.24      millert  1042:        static int parens = 0;
1.23      millert  1043:
                   1044: rescan:
                   1045:        starttok = prestr;
1.1       tholo    1046:
                   1047:        switch (c = *prestr++) {
                   1048:        case '|': return OR;
                   1049:        case '*': return STAR;
                   1050:        case '+': return PLUS;
                   1051:        case '?': return QUEST;
                   1052:        case '.': return DOT;
                   1053:        case '\0': prestr--; return '\0';
                   1054:        case '^':
                   1055:        case '$':
1.24      millert  1056:                return c;
1.1       tholo    1057:        case '(':
1.24      millert  1058:                parens++;
                   1059:                return c;
1.1       tholo    1060:        case ')':
1.24      millert  1061:                if (parens) {
                   1062:                        parens--;
                   1063:                        return c;
                   1064:                }
                   1065:                /* unmatched close parenthesis; per POSIX, treat as literal */
                   1066:                rlxval = c;
                   1067:                return CHAR;
1.1       tholo    1068:        case '\\':
1.17      millert  1069:                rlxval = quoted(&prestr);
1.1       tholo    1070:                return CHAR;
                   1071:        default:
                   1072:                rlxval = c;
                   1073:                return CHAR;
1.25      millert  1074:        case '[':
1.27      millert  1075:                if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.8       millert  1076:                        FATAL("out of space in reg expr %.10s..", lastre);
1.5       kstailey 1077:                bp = buf;
1.1       tholo    1078:                if (*prestr == '^') {
                   1079:                        cflag = 1;
                   1080:                        prestr++;
                   1081:                }
                   1082:                else
                   1083:                        cflag = 0;
1.11      millert  1084:                n = 2 * strlen((const char *) prestr)+1;
1.15      millert  1085:                if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8       millert  1086:                        FATAL("out of space for reg expr %.10s...", lastre);
1.1       tholo    1087:                for (; ; ) {
                   1088:                        if ((c = *prestr++) == '\\') {
1.5       kstailey 1089:                                *bp++ = '\\';
1.1       tholo    1090:                                if ((c = *prestr++) == '\0')
1.8       millert  1091:                                        FATAL("nonterminated character class %.20s...", lastre);
1.5       kstailey 1092:                                *bp++ = c;
1.10      millert  1093:                        /* } else if (c == '\n') { */
                   1094:                        /*      FATAL("newline in character class %.20s...", lastre); */
1.11      millert  1095:                        } else if (c == '[' && *prestr == ':') {
                   1096:                                /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
                   1097:                                for (cc = charclasses; cc->cc_name; cc++)
                   1098:                                        if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
                   1099:                                                break;
                   1100:                                if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
                   1101:                                    prestr[2 + cc->cc_namelen] == ']') {
                   1102:                                        prestr += cc->cc_namelen + 3;
1.22      millert  1103:                                        /*
                   1104:                                         * BUG: We begin at 1, instead of 0, since we
                   1105:                                         * would otherwise prematurely terminate the
                   1106:                                         * string for classes like [[:cntrl:]]. This
                   1107:                                         * means that we can't match the NUL character,
                   1108:                                         * not without first adapting the entire
                   1109:                                         * program to track each string's length.
                   1110:                                         */
1.23      millert  1111:                                        for (i = 1; i <= UCHAR_MAX; i++) {
1.15      millert  1112:                                                if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1.12      millert  1113:                                                    FATAL("out of space for reg expr %.10s...", lastre);
                   1114:                                                if (cc->cc_func(i)) {
1.32    ! millert  1115:                                                        /* escape backslash */
        !          1116:                                                        if (i == '\\') {
        !          1117:                                                                *bp++ = '\\';
        !          1118:                                                                n++;
        !          1119:                                                        }
        !          1120:
1.12      millert  1121:                                                        *bp++ = i;
                   1122:                                                        n++;
                   1123:                                                }
                   1124:                                        }
1.11      millert  1125:                                } else
                   1126:                                        *bp++ = c;
1.23      millert  1127:                        } else if (c == '[' && *prestr == '.') {
                   1128:                                char collate_char;
                   1129:                                prestr++;
                   1130:                                collate_char = *prestr++;
                   1131:                                if (*prestr == '.' && prestr[1] == ']') {
                   1132:                                        prestr += 2;
                   1133:                                        /* Found it: map via locale TBD: for
                   1134:                                           now, simply return this char.  This
                   1135:                                           is sufficient to pass conformance
                   1136:                                           test awk.ex 156
                   1137:                                         */
                   1138:                                        if (*prestr == ']') {
                   1139:                                                prestr++;
                   1140:                                                rlxval = collate_char;
                   1141:                                                return CHAR;
                   1142:                                        }
                   1143:                                }
                   1144:                        } else if (c == '[' && *prestr == '=') {
                   1145:                                char equiv_char;
                   1146:                                prestr++;
                   1147:                                equiv_char = *prestr++;
                   1148:                                if (*prestr == '=' && prestr[1] == ']') {
                   1149:                                        prestr += 2;
                   1150:                                        /* Found it: map via locale TBD: for now
                   1151:                                           simply return this char. This is
                   1152:                                           sufficient to pass conformance test
                   1153:                                           awk.ex 156
                   1154:                                         */
                   1155:                                        if (*prestr == ']') {
                   1156:                                                prestr++;
                   1157:                                                rlxval = equiv_char;
                   1158:                                                return CHAR;
                   1159:                                        }
                   1160:                                }
1.1       tholo    1161:                        } else if (c == '\0') {
1.8       millert  1162:                                FATAL("nonterminated character class %.20s", lastre);
1.5       kstailey 1163:                        } else if (bp == buf) { /* 1st char is special */
                   1164:                                *bp++ = c;
1.1       tholo    1165:                        } else if (c == ']') {
1.5       kstailey 1166:                                *bp++ = 0;
1.10      millert  1167:                                rlxstr = (uschar *) tostring((char *) buf);
1.1       tholo    1168:                                if (cflag == 0)
                   1169:                                        return CCL;
                   1170:                                else
                   1171:                                        return NCCL;
                   1172:                        } else
1.5       kstailey 1173:                                *bp++ = c;
1.1       tholo    1174:                }
1.23      millert  1175:                break;
                   1176:        case '{':
                   1177:                if (isdigit(*(prestr))) {
                   1178:                        num = 0;        /* Process as a repetition */
                   1179:                        n = -1; m = -1;
1.29      millert  1180:                        commafound = false;
                   1181:                        digitfound = false;
1.23      millert  1182:                        startreptok = prestr-1;
                   1183:                        /* Remember start of previous atom here ? */
                   1184:                } else {                /* just a { char, not a repetition */
                   1185:                        rlxval = c;
                   1186:                        return CHAR;
                   1187:                 }
                   1188:                for (; ; ) {
                   1189:                        if ((c = *prestr++) == '}') {
                   1190:                                if (commafound) {
                   1191:                                        if (digitfound) { /* {n,m} */
                   1192:                                                m = num;
1.30      millert  1193:                                                if (m < n)
1.23      millert  1194:                                                        FATAL("illegal repetition expression: class %.20s",
                   1195:                                                                lastre);
1.30      millert  1196:                                                if (n == 0 && m == 1) {
1.23      millert  1197:                                                        return QUEST;
                   1198:                                                }
                   1199:                                        } else {        /* {n,} */
1.30      millert  1200:                                                if (n == 0)
                   1201:                                                        return STAR;
                   1202:                                                else if (n == 1)
                   1203:                                                        return PLUS;
1.23      millert  1204:                                        }
                   1205:                                } else {
                   1206:                                        if (digitfound) { /* {n} same as {n,n} */
                   1207:                                                n = num;
                   1208:                                                m = num;
                   1209:                                        } else {        /* {} */
                   1210:                                                FATAL("illegal repetition expression: class %.20s",
                   1211:                                                        lastre);
                   1212:                                        }
                   1213:                                }
                   1214:                                if (repeat(starttok, prestr-starttok, lastatom,
                   1215:                                           startreptok - lastatom, n, m) > 0) {
1.30      millert  1216:                                        if (n == 0 && m == 0) {
1.31      millert  1217:                                                return ZERO;
1.23      millert  1218:                                        }
                   1219:                                        /* must rescan input for next token */
                   1220:                                        goto rescan;
                   1221:                                }
                   1222:                                /* Failed to replace: eat up {...} characters
                   1223:                                   and treat like just PLUS */
                   1224:                                return PLUS;
                   1225:                        } else if (c == '\0') {
                   1226:                                FATAL("nonterminated character class %.20s",
                   1227:                                        lastre);
                   1228:                        } else if (isdigit(c)) {
                   1229:                                num = 10 * num + c - '0';
1.29      millert  1230:                                digitfound = true;
1.23      millert  1231:                        } else if (c == ',') {
                   1232:                                if (commafound)
                   1233:                                        FATAL("illegal repetition expression: class %.20s",
                   1234:                                                lastre);
                   1235:                                /* looking for {n,} or {n,m} */
1.29      millert  1236:                                commafound = true;
1.23      millert  1237:                                n = num;
1.29      millert  1238:                                digitfound = false; /* reset */
1.23      millert  1239:                                num = 0;
                   1240:                        } else {
                   1241:                                FATAL("illegal repetition expression: class %.20s",
                   1242:                                        lastre);
                   1243:                        }
                   1244:                }
                   1245:                break;
1.1       tholo    1246:        }
                   1247: }
                   1248:
                   1249: int cgoto(fa *f, int s, int c)
                   1250: {
1.27      millert  1251:        int *p, *q;
1.1       tholo    1252:        int i, j, k;
                   1253:
1.12      millert  1254:        assert(c == HAT || c < NCHARS);
1.1       tholo    1255:        while (f->accept >= maxsetvec) {        /* guessing here! */
1.27      millert  1256:                resizesetvec(__func__);
1.1       tholo    1257:        }
                   1258:        for (i = 0; i <= f->accept; i++)
                   1259:                setvec[i] = 0;
                   1260:        setcnt = 0;
1.27      millert  1261:        resize_state(f, s);
1.1       tholo    1262:        /* compute positions of gototab[s,c] into setvec */
                   1263:        p = f->posns[s];
                   1264:        for (i = 1; i <= *p; i++) {
                   1265:                if ((k = f->re[p[i]].ltype) != FINAL) {
1.7       millert  1266:                        if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1       tholo    1267:                         || (k == DOT && c != 0 && c != HAT)
                   1268:                         || (k == ALL && c != 0)
1.15      millert  1269:                         || (k == EMPTYRE && c != 0)
1.10      millert  1270:                         || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
                   1271:                         || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1       tholo    1272:                                q = f->re[p[i]].lfollow;
                   1273:                                for (j = 1; j <= *q; j++) {
                   1274:                                        if (q[j] >= maxsetvec) {
1.27      millert  1275:                                                resizesetvec(__func__);
1.1       tholo    1276:                                        }
                   1277:                                        if (setvec[q[j]] == 0) {
                   1278:                                                setcnt++;
                   1279:                                                setvec[q[j]] = 1;
                   1280:                                        }
                   1281:                                }
                   1282:                        }
                   1283:                }
                   1284:        }
                   1285:        /* determine if setvec is a previous state */
                   1286:        tmpset[0] = setcnt;
                   1287:        j = 1;
                   1288:        for (i = f->accept; i >= 0; i--)
                   1289:                if (setvec[i]) {
                   1290:                        tmpset[j++] = i;
                   1291:                }
1.27      millert  1292:        resize_state(f, f->curstat > s ? f->curstat : s);
1.1       tholo    1293:        /* tmpset == previous state? */
                   1294:        for (i = 1; i <= f->curstat; i++) {
                   1295:                p = f->posns[i];
                   1296:                if ((k = tmpset[0]) != p[0])
                   1297:                        goto different;
                   1298:                for (j = 1; j <= k; j++)
                   1299:                        if (tmpset[j] != p[j])
                   1300:                                goto different;
                   1301:                /* setvec is state i */
1.30      millert  1302:                if (c != HAT)
                   1303:                        f->gototab[s][c] = i;
1.1       tholo    1304:                return i;
                   1305:          different:;
                   1306:        }
                   1307:
                   1308:        /* add tmpset to current set of states */
1.27      millert  1309:        ++(f->curstat);
                   1310:        resize_state(f, f->curstat);
1.1       tholo    1311:        for (i = 0; i < NCHARS; i++)
                   1312:                f->gototab[f->curstat][i] = 0;
                   1313:        xfree(f->posns[f->curstat]);
1.27      millert  1314:        p = intalloc(setcnt + 1, __func__);
1.1       tholo    1315:
                   1316:        f->posns[f->curstat] = p;
1.30      millert  1317:        if (c != HAT)
                   1318:                f->gototab[s][c] = f->curstat;
1.1       tholo    1319:        for (i = 0; i <= setcnt; i++)
                   1320:                p[i] = tmpset[i];
                   1321:        if (setvec[f->accept])
                   1322:                f->out[f->curstat] = 1;
                   1323:        else
                   1324:                f->out[f->curstat] = 0;
                   1325:        return f->curstat;
                   1326: }
                   1327:
                   1328:
                   1329: void freefa(fa *f)     /* free a finite automaton */
                   1330: {
                   1331:        int i;
                   1332:
                   1333:        if (f == NULL)
                   1334:                return;
1.27      millert  1335:        for (i = 0; i < f->state_count; i++)
                   1336:                xfree(f->gototab[i])
1.1       tholo    1337:        for (i = 0; i <= f->curstat; i++)
                   1338:                xfree(f->posns[i]);
                   1339:        for (i = 0; i <= f->accept; i++) {
                   1340:                xfree(f->re[i].lfollow);
                   1341:                if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30      millert  1342:                        xfree(f->re[i].lval.np);
1.1       tholo    1343:        }
                   1344:        xfree(f->restr);
1.27      millert  1345:        xfree(f->out);
                   1346:        xfree(f->posns);
                   1347:        xfree(f->gototab);
1.1       tholo    1348:        xfree(f);
                   1349: }