===================================================================
RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/awk/b.c,v
retrieving revision 1.22
retrieving revision 1.23
diff -c -r1.22 -r1.23
*** src/usr.bin/awk/b.c	2020/06/10 21:01:32	1.22
--- src/usr.bin/awk/b.c	2020/06/10 21:01:50	1.23
***************
*** 1,4 ****
! /*	$OpenBSD: b.c,v 1.22 2020/06/10 21:01:32 millert Exp $	*/
  /****************************************************************
  Copyright (C) Lucent Technologies 1997
  All Rights Reserved
--- 1,4 ----
! /*	$OpenBSD: b.c,v 1.23 2020/06/10 21:01:50 millert Exp $	*/
  /****************************************************************
  Copyright (C) Lucent Technologies 1997
  All Rights Reserved
***************
*** 28,33 ****
--- 28,34 ----
  #define	DEBUG
  
  #include <ctype.h>
+ #include <limits.h>
  #include <stdio.h>
  #include <string.h>
  #include <stdlib.h>
***************
*** 66,71 ****
--- 67,77 ----
  static uschar	*rlxstr;
  static uschar	*prestr;	/* current position in current re */
  static uschar	*lastre;	/* origin of last re */
+ static uschar	*lastatom;	/* origin of last Atom */
+ static uschar	*starttok;
+ static uschar 	*basestr;	/* starts with original, replaced during
+ 				   repetition processing */
+ static uschar 	*firstbasestr;
  
  static	int setcnt;
  static	int poscnt;
***************
*** 125,130 ****
--- 131,138 ----
  	Node *p, *p1;
  	fa *f;
  
+ 	firstbasestr = (uschar *) s;
+ 	basestr = firstbasestr;
  	p = reparse(s);
  	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
  		/* put ALL STAR in front of reg.  exp. */
***************
*** 146,151 ****
--- 154,163 ----
  	f->initstat = makeinit(f, anchor);
  	f->anchor = anchor;
  	f->restr = (uschar *) tostring(s);
+ 	if (firstbasestr != basestr) {
+ 		if (basestr)
+ 			xfree(basestr);
+ 	}
  	return f;
  }
  
***************
*** 637,645 ****
--- 649,659 ----
  Node *primary(void)
  {
  	Node *np;
+ 	int savelastatom;
  
  	switch (rtok) {
  	case CHAR:
+ 		lastatom = starttok;
  		np = op2(CHAR, NIL, itonp(rlxval));
  		rtok = relex();
  		return (unary(np));
***************
*** 648,663 ****
  		return (unary(op2(ALL, NIL, NIL)));
  	case EMPTYRE:
  		rtok = relex();
! 		return (unary(op2(ALL, NIL, NIL)));
  	case DOT:
  		rtok = relex();
  		return (unary(op2(DOT, NIL, NIL)));
  	case CCL:
  		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
  		rtok = relex();
  		return (unary(np));
  	case NCCL:
  		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
  		rtok = relex();
  		return (unary(np));
  	case '^':
--- 662,680 ----
  		return (unary(op2(ALL, NIL, NIL)));
  	case EMPTYRE:
  		rtok = relex();
! 		return (unary(op2(EMPTYRE, NIL, NIL)));
  	case DOT:
+ 		lastatom = starttok;
  		rtok = relex();
  		return (unary(op2(DOT, NIL, NIL)));
  	case CCL:
  		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
+ 		lastatom = starttok;
  		rtok = relex();
  		return (unary(np));
  	case NCCL:
  		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
+ 		lastatom = starttok;
  		rtok = relex();
  		return (unary(np));
  	case '^':
***************
*** 667,672 ****
--- 684,691 ----
  		rtok = relex();
  		return (unary(op2(CHAR, NIL, NIL)));
  	case '(':
+ 		lastatom = starttok;
+ 		savelastatom = starttok - basestr; /* Retain over recursion */
  		rtok = relex();
  		if (rtok == ')') {	/* special pleading for () */
  			rtok = relex();
***************
*** 674,679 ****
--- 693,699 ----
  		}
  		np = regexp();
  		if (rtok == ')') {
+ 			lastatom = basestr + savelastatom; /* Restore */
  			rtok = relex();
  			return (unary(np));
  		}
***************
*** 688,695 ****
  Node *concat(Node *np)
  {
  	switch (rtok) {
! 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
  		return (concat(op2(CAT, np, primary())));
  	}
  	return (np);
  }
--- 708,719 ----
  Node *concat(Node *np)
  {
  	switch (rtok) {
! 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
  		return (concat(op2(CAT, np, primary())));
+ 	case EMPTYRE:	/* NOTE(review): np is not used on this path -- confirm intended */
+ 		rtok = relex();	/* step past the EMPTYRE token before parsing the next primary */
+ 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+ 				primary())));
  	}
  	return (np);
  }
***************
*** 773,779 ****
--- 797,914 ----
  	{ NULL,		0,	NULL },
  };
  
+ #define REPEAT_SIMPLE		0
+ #define REPEAT_PLUS_APPENDED	1
+ #define REPEAT_WITH_Q		2
+ #define REPEAT_ZERO		3
  
+ static int	/* rewrites basestr with the bound expanded; returns 1, or 2 for REPEAT_ZERO */
+ replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+ 	       int atomlen, int firstnum, int secondnum, int special_case)
+ {
+ 	int i, j;
+ 	uschar *buf = 0;
+ 	int ret = 1;
+ 	int init_q = (firstnum==0);		/* first added char will be ? */
+ 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
+ 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
+ 	int suffix_length = strlen((char *) reptok) - reptoklen;	/* string after rep specifier	*/
+ 	int size = prefix_length +  suffix_length;
+ 
+ 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
+ 		size += atomlen*(firstnum-1);
+ 	}
+ 
+ 	/* Adjust size of buffer for special cases; must match what is written below */
+ 	if (special_case == REPEAT_PLUS_APPENDED) {
+ 		size++;		/* for the final + */
+ 	} else if (special_case == REPEAT_WITH_Q) {
+ 		size += init_q + (atomlen+1)* (n_q_reps-init_q);	/* in-place atom takes the leading ? */
+ 	} else if (special_case == REPEAT_ZERO) {
+ 		size += 2;	/* just a null ERE: () */
+ 	}
+ 	if ((buf = (uschar *) malloc(size+1)) == NULL)
+ 		FATAL("out of space in reg expr %.10s..", lastre);
+ 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
+ 	j = prefix_length;
+ 	if (special_case == REPEAT_ZERO) {
+ 		j -= atomlen;	/* overwrite the atom itself with () */
+ 		buf[j++] = '(';
+ 		buf[j++] = ')';
+ 	}
+ 	for (i=1; i < firstnum; i++) {		/* copy x reps 	*/
+ 		memcpy(&buf[j], atom, atomlen);
+ 		j += atomlen;
+ 	}
+ 	if (special_case == REPEAT_PLUS_APPENDED) {
+ 		buf[j++] = '+';
+ 	} else if (special_case == REPEAT_WITH_Q) {
+ 		if (init_q) buf[j++] = '?';
+ 		for (i=init_q; i < n_q_reps; i++) {	/* copy x? reps; a leading x? already counts as one */
+ 			memcpy(&buf[j], atom, atomlen);
+ 			j += atomlen;
+ 			buf[j++] = '?';
+ 		}
+ 	}
+ 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
+ 	if (special_case == REPEAT_ZERO) {
+ 		buf[j+suffix_length] = '\0';
+ 	} else {
+ 		buf[size] = '\0';
+ 	}
+ 	/* free old basestr unless it is still the caller's original string */
+ 	if (firstbasestr != basestr) {
+ 		if (basestr)
+ 			xfree(basestr);
+ 	}
+ 	basestr = buf;
+ 	prestr  = buf + prefix_length;
+ 	if (special_case == REPEAT_ZERO) {
+ 		prestr  -= atomlen;
+ 		ret++;
+ 	}
+ 	return ret;
+ }
+ 
+ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+ 		  int atomlen, int firstnum, int secondnum)
+ {
+ 	/*
+ 	   Classify a repetition specifier ("bound") and hand it to
+ 	   replace_repeat(), which rewrites the ERE string: the immediately
+ 	   previous atom is repeated and ? or + appended as needed. The first
+ 	   copy of the atom is left in place, except in the special_case of a
+ 	   zero-repeat (i.e., {0}). secondnum < 0 denotes an open bound {n,}.
+ 	 */
+ 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
+ 		if (firstnum < 2) {
+ 			/* 0 or 1: should be handled before you get here */
+ 			FATAL("internal error");
+ 		} else {
+ 			return replace_repeat(reptok, reptoklen, atom, atomlen,
+ 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
+ 		}
+ 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
+ 		if (firstnum == 0) {	/* {0} or {0,0} */
+ 			/* This case is unusual because the resulting
+ 			   replacement string might actually be SMALLER than
+ 			   the original ERE */
+ 			return replace_repeat(reptok, reptoklen, atom, atomlen,
+ 					firstnum, secondnum, REPEAT_ZERO);
+ 		} else {		/* (firstnum >= 1) */
+ 			return replace_repeat(reptok, reptoklen, atom, atomlen,
+ 					firstnum, secondnum, REPEAT_SIMPLE);
+ 		}
+ 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
+ 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
+ 		return replace_repeat(reptok, reptoklen, atom, atomlen,
+ 					firstnum, secondnum, REPEAT_WITH_Q);
+ 	} else {	/* Error - shouldn't be here (n>m) */
+ 		FATAL("internal error");
+ 	}
+ 	return 0;	/* reached only after one of the FATAL branches above */
+ }
+ 
  int relex(void)		/* lexical analyzer for reparse */
  {
  	int c, n;
***************
*** 783,789 ****
--- 918,929 ----
  	uschar *bp;
  	struct charclass *cc;
  	int i;
+ 	int num, m, commafound, digitfound;
+ 	const uschar *startreptok;
  
+ rescan:
+ 	starttok = prestr;
+ 
  	switch (c = *prestr++) {
  	case '|': return OR;
  	case '*': return STAR;
***************
*** 839,845 ****
  					 * not without first adapting the entire
  					 * program to track each string's length.
  					 */
! 					for (i = 1; i < NCHARS; i++) {
  						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
  						    FATAL("out of space for reg expr %.10s...", lastre);
  						if (cc->cc_func(i)) {
--- 979,985 ----
  					 * not without first adapting the entire
  					 * program to track each string's length.
  					 */
! 					for (i = 1; i <= UCHAR_MAX; i++) {
  						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
  						    FATAL("out of space for reg expr %.10s...", lastre);
  						if (cc->cc_func(i)) {
***************
*** 849,854 ****
--- 989,1028 ----
  					}
  				} else
  					*bp++ = c;
+ 			} else if (c == '[' && *prestr == '.') {
+ 				char collate_char;
+ 				prestr++;
+ 				collate_char = *prestr++;
+ 				if (*prestr == '.' && prestr[1] == ']') {
+ 					prestr += 2;
+ 					/* Found it: map via locale TBD: for
+ 					   now, simply return this char.  This
+ 					   is sufficient to pass conformance
+ 					   test awk.ex 156
+ 					 */
+ 					if (*prestr == ']') {
+ 						prestr++;
+ 						rlxval = collate_char;
+ 						return CHAR;
+ 					}
+ 				}
+ 			} else if (c == '[' && *prestr == '=') {
+ 				char equiv_char;
+ 				prestr++;
+ 				equiv_char = *prestr++;
+ 				if (*prestr == '=' && prestr[1] == ']') {
+ 					prestr += 2;
+ 					/* Found it: map via locale TBD: for now
+ 					   simply return this char. This is
+ 					   sufficient to pass conformance test
+ 					   awk.ex 156
+ 					 */
+ 					if (*prestr == ']') {
+ 						prestr++;
+ 						rlxval = equiv_char;
+ 						return CHAR;
+ 					}
+ 				}
  			} else if (c == '\0') {
  				FATAL("nonterminated character class %.20s", lastre);
  			} else if (bp == buf) {	/* 1st char is special */
***************
*** 863,868 ****
--- 1037,1111 ----
  			} else
  				*bp++ = c;
  		}
+ 		break;
+ 	case '{':
+ 		if (isdigit(*(prestr))) {
+ 			num = 0;	/* Process as a repetition */
+ 			n = -1; m = -1;
+ 			commafound = 0;
+ 			digitfound = 0;
+ 			startreptok = prestr-1;
+ 			/* Remember start of previous atom here ? */
+ 		} else {        	/* just a { char, not a repetition */
+ 			rlxval = c;
+ 			return CHAR;
+                 }
+ 		for (; ; ) {
+ 			if ((c = *prestr++) == '}') {
+ 				if (commafound) {
+ 					if (digitfound) { /* {n,m} */
+ 						m = num;
+ 						if (m<n)
+ 							FATAL("illegal repetition expression: class %.20s",
+ 								lastre);
+ 						if ((n==0) && (m==1)) {
+ 							return QUEST;
+ 						}
+ 					} else {	/* {n,} */
+ 						if (n==0) return STAR;
+ 						if (n==1) return PLUS;
+ 					}
+ 				} else {
+ 					if (digitfound) { /* {n} same as {n,n} */
+ 						n = num;
+ 						m = num;
+ 					} else {	/* {} */
+ 						FATAL("illegal repetition expression: class %.20s",
+ 							lastre);
+ 					}
+ 				}
+ 				if (repeat(starttok, prestr-starttok, lastatom,
+ 					   startreptok - lastatom, n, m) > 0) {
+ 					if ((n==0) && (m==0)) {
+ 						return EMPTYRE;
+ 					}
+ 					/* must rescan input for next token */
+ 					goto rescan;
+ 				}
+ 				/* Failed to replace: eat up {...} characters
+ 				   and treat like just PLUS */
+ 				return PLUS;
+ 			} else if (c == '\0') {
+ 				FATAL("nonterminated character class %.20s",
+ 					lastre);
+ 			} else if (isdigit(c)) {
+ 				num = 10 * num + c - '0';
+ 				digitfound = 1;
+ 			} else if (c == ',') {
+ 				if (commafound)
+ 					FATAL("illegal repetition expression: class %.20s",
+ 						lastre);
+ 				/* looking for {n,} or {n,m} */
+ 				commafound = 1;
+ 				n = num;
+ 				digitfound = 0; /* reset */
+ 				num = 0;
+ 			} else {
+ 				FATAL("illegal repetition expression: class %.20s",
+ 					lastre);
+ 			}
+ 		}
+ 		break;
  	}
  }