=================================================================== RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/awk/b.c,v retrieving revision 1.22 retrieving revision 1.23 diff -c -r1.22 -r1.23 *** src/usr.bin/awk/b.c 2020/06/10 21:01:32 1.22 --- src/usr.bin/awk/b.c 2020/06/10 21:01:50 1.23 *************** *** 1,4 **** ! /* $OpenBSD: b.c,v 1.22 2020/06/10 21:01:32 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved --- 1,4 ---- ! /* $OpenBSD: b.c,v 1.23 2020/06/10 21:01:50 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved *************** *** 28,33 **** --- 28,34 ---- #define DEBUG #include <ctype.h> + #include <limits.h> #include <stdio.h> #include <string.h> #include <stdlib.h> *************** *** 66,71 **** --- 67,77 ---- static uschar *rlxstr; static uschar *prestr; /* current position in current re */ static uschar *lastre; /* origin of last re */ + static uschar *lastatom; /* origin of last Atom */ + static uschar *starttok; + static uschar *basestr; /* starts with original, replaced during + repetition processing */ + static uschar *firstbasestr; static int setcnt; static int poscnt; *************** *** 125,130 **** --- 131,138 ---- Node *p, *p1; fa *f; + firstbasestr = (uschar *) s; + basestr = firstbasestr; p = reparse(s); p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); /* put ALL STAR in front of reg. exp. 
*/ *************** *** 146,151 **** --- 154,163 ---- f->initstat = makeinit(f, anchor); f->anchor = anchor; f->restr = (uschar *) tostring(s); + if (firstbasestr != basestr) { + if (basestr) + xfree(basestr); + } return f; } *************** *** 637,645 **** --- 649,659 ---- Node *primary(void) { Node *np; + int savelastatom; switch (rtok) { case CHAR: + lastatom = starttok; np = op2(CHAR, NIL, itonp(rlxval)); rtok = relex(); return (unary(np)); *************** *** 648,663 **** return (unary(op2(ALL, NIL, NIL))); case EMPTYRE: rtok = relex(); ! return (unary(op2(ALL, NIL, NIL))); case DOT: rtok = relex(); return (unary(op2(DOT, NIL, NIL))); case CCL: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); rtok = relex(); return (unary(np)); case NCCL: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); rtok = relex(); return (unary(np)); case '^': --- 662,680 ---- return (unary(op2(ALL, NIL, NIL))); case EMPTYRE: rtok = relex(); ! return (unary(op2(EMPTYRE, NIL, NIL))); case DOT: + lastatom = starttok; rtok = relex(); return (unary(op2(DOT, NIL, NIL))); case CCL: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); + lastatom = starttok; rtok = relex(); return (unary(np)); case NCCL: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); + lastatom = starttok; rtok = relex(); return (unary(np)); case '^': *************** *** 667,672 **** --- 684,691 ---- rtok = relex(); return (unary(op2(CHAR, NIL, NIL))); case '(': + lastatom = starttok; + savelastatom = starttok - basestr; /* Retain over recursion */ rtok = relex(); if (rtok == ')') { /* special pleading for () */ rtok = relex(); *************** *** 674,679 **** --- 693,699 ---- } np = regexp(); if (rtok == ')') { + lastatom = basestr + savelastatom; /* Restore */ rtok = relex(); return (unary(np)); } *************** *** 688,695 **** Node *concat(Node *np) { switch (rtok) { ! 
case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': return (concat(op2(CAT, np, primary()))); } return (np); } --- 708,719 ---- Node *concat(Node *np) { switch (rtok) { ! case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': return (concat(op2(CAT, np, primary()))); + case EMPTYRE: + rtok = relex(); + return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), + primary()))); } return (np); } *************** *** 773,779 **** --- 797,914 ---- { NULL, 0, NULL }, }; + #define REPEAT_SIMPLE 0 + #define REPEAT_PLUS_APPENDED 1 + #define REPEAT_WITH_Q 2 + #define REPEAT_ZERO 3 + static int + replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, + int atomlen, int firstnum, int secondnum, int special_case) + { + int i, j; + uschar *buf = 0; + int ret = 1; + int init_q = (firstnum==0); /* first added char will be ? */ + int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ + int prefix_length = reptok - basestr; /* prefix includes first rep */ + int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */ + int size = prefix_length + suffix_length; + + if (firstnum > 1) { /* add room for reps 2 through firstnum */ + size += atomlen*(firstnum-1); + } + + /* Adjust size of buffer for special cases */ + if (special_case == REPEAT_PLUS_APPENDED) { + size++; /* for the final + */ + } else if (special_case == REPEAT_WITH_Q) { + size += init_q + (atomlen+1)* n_q_reps; + } else if (special_case == REPEAT_ZERO) { + size += 2; /* just a null ERE: () */ + } + if ((buf = (uschar *) malloc(size+1)) == NULL) + FATAL("out of space in reg expr %.10s..", lastre); + memcpy(buf, basestr, prefix_length); /* copy prefix */ + j = prefix_length; + if (special_case == REPEAT_ZERO) { + j -= atomlen; + buf[j++] = '('; + buf[j++] = ')'; + } + for (i=1; i < firstnum; i++) { /* copy x reps */ + memcpy(&buf[j], atom, atomlen); + j += atomlen; + } + if (special_case == 
REPEAT_PLUS_APPENDED) { + buf[j++] = '+'; + } else if (special_case == REPEAT_WITH_Q) { + if (init_q) buf[j++] = '?'; + for (i=0; i < n_q_reps; i++) { /* copy x? reps */ + memcpy(&buf[j], atom, atomlen); + j += atomlen; + buf[j++] = '?'; + } + } + memcpy(&buf[j], reptok+reptoklen, suffix_length); + if (special_case == REPEAT_ZERO) { + buf[j+suffix_length] = '\0'; + } else { + buf[size] = '\0'; + } + /* free old basestr */ + if (firstbasestr != basestr) { + if (basestr) + xfree(basestr); + } + basestr = buf; + prestr = buf + prefix_length; + if (special_case == REPEAT_ZERO) { + prestr -= atomlen; + ret++; + } + return ret; + } + + static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, + int atomlen, int firstnum, int secondnum) + { + /* + In general, the repetition specifier or "bound" is replaced here + by an equivalent ERE string, repeating the immediately previous atom + and appending ? and + as needed. Note that the first copy of the + atom is left in place, except in the special_case of a zero-repeat + (i.e., {0}). + */ + if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ + if (firstnum < 2) { + /* 0 or 1: should be handled before you get here */ + FATAL("internal error"); + } else { + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_PLUS_APPENDED); + } + } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ + if (firstnum == 0) { /* {0} or {0,0} */ + /* This case is unusual because the resulting + replacement string might actually be SMALLER than + the original ERE */ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_ZERO); + } else { /* (firstnum >= 1) */ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_SIMPLE); + } + } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ + /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? 
*/ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_WITH_Q); + } else { /* Error - shouldn't be here (n>m) */ + FATAL("internal error"); + } + return 0; + } + int relex(void) /* lexical analyzer for reparse */ { int c, n; *************** *** 783,789 **** --- 918,929 ---- uschar *bp; struct charclass *cc; int i; + int num, m, commafound, digitfound; + const uschar *startreptok; + rescan: + starttok = prestr; + switch (c = *prestr++) { case '|': return OR; case '*': return STAR; *************** *** 839,845 **** * not without first adapting the entire * program to track each string's length. */ ! for (i = 1; i < NCHARS; i++) { if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) FATAL("out of space for reg expr %.10s...", lastre); if (cc->cc_func(i)) { --- 979,985 ---- * not without first adapting the entire * program to track each string's length. */ ! for (i = 1; i <= UCHAR_MAX; i++) { if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) FATAL("out of space for reg expr %.10s...", lastre); if (cc->cc_func(i)) { *************** *** 849,854 **** --- 989,1028 ---- } } else *bp++ = c; + } else if (c == '[' && *prestr == '.') { + char collate_char; + prestr++; + collate_char = *prestr++; + if (*prestr == '.' && prestr[1] == ']') { + prestr += 2; + /* Found it: map via locale TBD: for + now, simply return this char. This + is sufficient to pass conformance + test awk.ex 156 + */ + if (*prestr == ']') { + prestr++; + rlxval = collate_char; + return CHAR; + } + } + } else if (c == '[' && *prestr == '=') { + char equiv_char; + prestr++; + equiv_char = *prestr++; + if (*prestr == '=' && prestr[1] == ']') { + prestr += 2; + /* Found it: map via locale TBD: for now + simply return this char. 
This is + sufficient to pass conformance test + awk.ex 156 + */ + if (*prestr == ']') { + prestr++; + rlxval = equiv_char; + return CHAR; + } + } } else if (c == '\0') { FATAL("nonterminated character class %.20s", lastre); } else if (bp == buf) { /* 1st char is special */ *************** *** 863,868 **** --- 1037,1111 ---- } else *bp++ = c; } + break; + case '{': + if (isdigit(*(prestr))) { + num = 0; /* Process as a repetition */ + n = -1; m = -1; + commafound = 0; + digitfound = 0; + startreptok = prestr-1; + /* Remember start of previous atom here ? */ + } else { /* just a { char, not a repetition */ + rlxval = c; + return CHAR; + } + for (; ; ) { + if ((c = *prestr++) == '}') { + if (commafound) { + if (digitfound) { /* {n,m} */ + m = num; + if (m < n) + FATAL("illegal repetition expression: class %.20s", + lastre); + if ((n==0) && (m==1)) { + return QUEST; + } + } else { /* {n,} */ + if (n==0) return STAR; + if (n==1) return PLUS; + } + } else { + if (digitfound) { /* {n} same as {n,n} */ + n = num; + m = num; + } else { /* {} */ + FATAL("illegal repetition expression: class %.20s", + lastre); + } + } + if (repeat(starttok, prestr-starttok, lastatom, + startreptok - lastatom, n, m) > 0) { + if ((n==0) && (m==0)) { + return EMPTYRE; + } + /* must rescan input for next token */ + goto rescan; + } + /* Failed to replace: eat up {...} characters + and treat like just PLUS */ + return PLUS; + } else if (c == '\0') { + FATAL("nonterminated character class %.20s", + lastre); + } else if (isdigit(c)) { + num = 10 * num + c - '0'; + digitfound = 1; + } else if (c == ',') { + if (commafound) + FATAL("illegal repetition expression: class %.20s", + lastre); + /* looking for {n,} or {n,m} */ + commafound = 1; + n = num; + digitfound = 0; /* reset */ + num = 0; + } else { + FATAL("illegal repetition expression: class %.20s", + lastre); + } + } + break; } }