Annotation of src/usr.bin/less/regexp.c, Revision 1.8
1.1 etheisen 1: /*
2: * regcomp and regexec -- regsub and regerror are elsewhere
3: *
4: * Copyright (c) 1986 by University of Toronto.
5: * Written by Henry Spencer. Not derived from licensed software.
6: *
7: * Permission is granted to anyone to use this software for any
8: * purpose on any computer system, and to redistribute it freely,
9: * subject to the following restrictions:
10: *
11: * 1. The author is not responsible for the consequences of use of
12: * this software, no matter how awful, even if they arise
13: * from defects in it.
14: *
15: * 2. The origin of this software must not be misrepresented, either
16: * by explicit claim or by omission.
17: *
18: * 3. Altered versions must be plainly marked as such, and must not
19: * be misrepresented as being the original software.
20: *
21: * Beware that some of this code is subtly aware of the way operator
22: * precedence is structured in regular expressions. Serious changes in
23: * regular-expression syntax might require a total rethink.
24: *
25: * *** NOTE: this code has been altered slightly for use in Tcl. ***
26: * Slightly modified by David MacKenzie to undo most of the changes for TCL.
1.7 millert 27: * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
1.1 etheisen 28: */
29:
1.7 millert 30: #include "less.h"
31: #if HAVE_STDIO_H
1.1 etheisen 32: #include <stdio.h>
1.7 millert 33: #endif
34: #if HAVE_STDLIB_H
35: #include <stdlib.h>
36: #endif
37: #if HAVE_STRING_H
38: #include <string.h>
39: #endif
1.1 etheisen 40: #include "regexp.h"
41:
42: /*
43: * The "internal use only" fields in regexp.h are present to pass info from
44: * compile to execute that permits the execute phase to run lots faster on
45: * simple cases. They are:
46: *
47: * regstart char that must begin a match; '\0' if none obvious
48: * reganch is the match anchored (at beginning-of-line only)?
49: * regmust string (pointer into program) that match must include, or NULL
50: * regmlen length of regmust string
51: *
52: * Regstart and reganch permit very fast decisions on suitable starting points
53: * for a match, cutting down the work a lot. Regmust permits fast rejection
54: * of lines that cannot possibly match. The regmust tests are costly enough
55: * that regcomp() supplies a regmust only if the r.e. contains something
56: * potentially expensive (at present, the only such thing detected is * or +
57: * at the start of the r.e., which can involve a lot of backup). Regmlen is
58: * supplied because the test in regexec() needs it and regcomp() is
59: * computing it anyway.
60: */
61:
62: /*
63: * Structure for regexp "program". This is essentially a linear encoding
64: * of a nondeterministic finite-state machine (aka syntax charts or
65: * "railroad normal form" in parsing technology). Each node is an opcode
66: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67: * all nodes except BRANCH implement concatenation; a "next" pointer with
68: * a BRANCH on both ends of it is connecting two alternatives. (Here we
69: * have one of the subtle syntax dependencies: an individual BRANCH (as
70: * opposed to a collection of them) is never concatenated with anything
71: * because of operator precedence.) The operand of some types of node is
72: * a literal string; for others, it is a node leading into a sub-FSM. In
73: * particular, the operand of a BRANCH node is the first node of the branch.
74: * (NB this is *not* a tree structure: the tail of the branch connects
75: * to the thing following the set of BRANCHes.) The opcodes are:
76: */
77:
78: /* definition number opnd? meaning */
1.7 millert 79: #undef EOL
1.1 etheisen 80: #define END 0 /* no End of program. */
81: #define BOL 1 /* no Match "" at beginning of line. */
82: #define EOL 2 /* no Match "" at end of line. */
83: #define ANY 3 /* no Match any one character. */
84: #define ANYOF 4 /* str Match any character in this string. */
85: #define ANYBUT 5 /* str Match any character not in this string. */
86: #define BRANCH 6 /* node Match this alternative, or the next... */
87: #define BACK 7 /* no Match "", "next" ptr points backward. */
88: #define EXACTLY 8 /* str Match this string. */
89: #define NOTHING 9 /* no Match empty string. */
90: #define STAR 10 /* node Match this (simple) thing 0 or more times. */
91: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
92: #define OPEN 20 /* no Mark this point in input as start of #n. */
93: /* OPEN+1 is number 1, etc. */
94: #define CLOSE 30 /* no Analogous to OPEN. */
95:
96: /*
97: * Opcode notes:
98: *
99: * BRANCH The set of branches constituting a single choice are hooked
100: * together with their "next" pointers, since precedence prevents
101: * anything being concatenated to any individual branch. The
102: * "next" pointer of the last BRANCH in a choice points to the
103: * thing following the whole choice. This is also where the
104: * final "next" pointer of each individual branch points; each
105: * branch starts with the operand node of a BRANCH node.
106: *
107: * BACK Normal "next" pointers all implicitly point forward; BACK
108: * exists to make loop structures possible.
109: *
110: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
111: * BRANCH structures using BACK. Simple cases (one character
112: * per match) are implemented with STAR and PLUS for speed
113: * and to minimize recursive plunges.
114: *
115: * OPEN,CLOSE ...are numbered at compile time.
116: */
117:
118: /*
119: * A node is one char of opcode followed by two chars of "next" pointer.
120: * "Next" pointers are stored as two 8-bit pieces, high order first. The
121: * value is a positive offset from the opcode of the node containing it.
122: * An operand, if any, simply follows the node. (Note that much of the
123: * code generation knows about this implicit relationship.)
124: *
125: * Using two bytes for the "next" pointer is vast overkill for most things,
126: * but allows patterns to get big without disasters.
127: */
128: #define OP(p) (*(p))
129: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130: #define OPERAND(p) ((p) + 3)
131:
132: /*
133: * See regmagic.h for one further detail of program structure.
134: */
135:
136:
137: /*
138: * Utility definitions.
139: */
140: #ifndef CHARBITS
141: #define UCHARAT(p) ((int)*(unsigned char *)(p))
142: #else
143: #define UCHARAT(p) ((int)*(p)&CHARBITS)
144: #endif
145:
146: #define FAIL(m) { regerror(m); return(NULL); }
147: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
148: #define META "^$.[()|?+*\\"
149:
150: /*
151: * Flags to be passed up and down.
152: */
153: #define HASWIDTH 01 /* Known never to match null string. */
154: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
155: #define SPSTART 04 /* Starts with * or +. */
156: #define WORST 0 /* Worst case. */
157:
158: /*
159: * Global work variables for regcomp().
160: */
161: static char *regparse; /* Input-scan pointer. */
162: static int regnpar; /* () count. */
163: static char regdummy;
164: static char *regcode; /* Code-emit pointer; ®dummy = don't. */
165: static long regsize; /* Code size. */
166:
167: /*
168: * The first byte of the regexp internal "program" is actually this magic
169: * number; the start node begins in the second byte.
170: */
171: #define MAGIC 0234
172:
173:
174: /*
175: * Forward declarations for regcomp()'s friends.
176: */
177: #ifndef STATIC
178: #define STATIC static
179: #endif
180: STATIC char *reg();
181: STATIC char *regbranch();
182: STATIC char *regpiece();
183: STATIC char *regatom();
184: STATIC char *regnode();
185: STATIC char *regnext();
186: STATIC void regc();
187: STATIC void reginsert();
188: STATIC void regtail();
189: STATIC void regoptail();
190: #ifdef STRCSPN
191: STATIC int strcspn();
192: #endif
193:
194: /*
195: - regcomp - compile a regular expression into internal code
196: *
197: * We can't allocate space until we know how big the compiled form will be,
198: * but we can't compile it (and thus know how big it is) until we've got a
199: * place to put the code. So we cheat: we compile it twice, once with code
200: * generation turned off and size counting turned on, and once "for real".
201: * This also means that we don't allocate space until we are sure that the
202: * thing really will compile successfully, and we never have to move the
203: * code and thus invalidate pointers into it. (Note that it has to be in
204: * one piece because free() must be able to free it all.)
205: *
206: * Beware that the optimization-preparation code in here knows about some
207: * of the structure of the compiled regexp.
208: */
209: regexp *
210: regcomp(exp)
211: char *exp;
212: {
1.7 millert 213: register regexp *r;
214: register char *scan;
215: register char *longest;
216: register int len;
1.1 etheisen 217: int flags;
218:
219: if (exp == NULL)
220: FAIL("NULL argument");
221:
222: /* First pass: determine size, legality. */
223: regparse = exp;
224: regnpar = 1;
225: regsize = 0L;
226: regcode = ®dummy;
227: regc(MAGIC);
228: if (reg(0, &flags) == NULL)
229: return(NULL);
230:
231: /* Small enough for pointer-storage convention? */
232: if (regsize >= 32767L) /* Probably could be 65535L. */
233: FAIL("regexp too big");
234:
235: /* Allocate space. */
236: r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
237: if (r == NULL)
238: FAIL("out of space");
239:
240: /* Second pass: emit code. */
241: regparse = exp;
242: regnpar = 1;
243: regcode = r->program;
244: regc(MAGIC);
1.8 ! jsg 245: if (reg(0, &flags) == NULL) {
! 246: free(r);
1.1 etheisen 247: return(NULL);
1.8 ! jsg 248: }
1.1 etheisen 249:
250: /* Dig out information for optimizations. */
251: r->regstart = '\0'; /* Worst-case defaults. */
252: r->reganch = 0;
253: r->regmust = NULL;
254: r->regmlen = 0;
255: scan = r->program+1; /* First BRANCH. */
256: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
257: scan = OPERAND(scan);
258:
259: /* Starting-point info. */
260: if (OP(scan) == EXACTLY)
261: r->regstart = *OPERAND(scan);
262: else if (OP(scan) == BOL)
263: r->reganch++;
264:
265: /*
266: * If there's something expensive in the r.e., find the
267: * longest literal string that must appear and make it the
268: * regmust. Resolve ties in favor of later strings, since
269: * the regstart check works with the beginning of the r.e.
270: * and avoiding duplication strengthens checking. Not a
271: * strong reason, but sufficient in the absence of others.
272: */
273: if (flags&SPSTART) {
274: longest = NULL;
275: len = 0;
276: for (; scan != NULL; scan = regnext(scan))
277: if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
278: longest = OPERAND(scan);
279: len = strlen(OPERAND(scan));
280: }
281: r->regmust = longest;
282: r->regmlen = len;
283: }
284: }
285:
286: return(r);
287: }
288:
289: /*
290: - reg - regular expression, i.e. main body or parenthesized thing
291: *
292: * Caller must absorb opening parenthesis.
293: *
294: * Combining parenthesis handling with the base level of regular expression
295: * is a trifle forced, but the need to tie the tails of the branches to what
296: * follows makes it hard to avoid.
297: */
298: static char *
299: reg(paren, flagp)
300: int paren; /* Parenthesized? */
301: int *flagp;
302: {
1.7 millert 303: register char *ret;
304: register char *br;
305: register char *ender;
306: register int parno = 0;
1.1 etheisen 307: int flags;
308:
309: *flagp = HASWIDTH; /* Tentatively. */
310:
311: /* Make an OPEN node, if parenthesized. */
312: if (paren) {
313: if (regnpar >= NSUBEXP)
314: FAIL("too many ()");
315: parno = regnpar;
316: regnpar++;
317: ret = regnode(OPEN+parno);
318: } else
319: ret = NULL;
320:
321: /* Pick up the branches, linking them together. */
322: br = regbranch(&flags);
323: if (br == NULL)
324: return(NULL);
325: if (ret != NULL)
326: regtail(ret, br); /* OPEN -> first. */
327: else
328: ret = br;
329: if (!(flags&HASWIDTH))
330: *flagp &= ~HASWIDTH;
331: *flagp |= flags&SPSTART;
332: while (*regparse == '|') {
333: regparse++;
334: br = regbranch(&flags);
335: if (br == NULL)
336: return(NULL);
337: regtail(ret, br); /* BRANCH -> BRANCH. */
338: if (!(flags&HASWIDTH))
339: *flagp &= ~HASWIDTH;
340: *flagp |= flags&SPSTART;
341: }
342:
343: /* Make a closing node, and hook it on the end. */
344: ender = regnode((paren) ? CLOSE+parno : END);
345: regtail(ret, ender);
346:
347: /* Hook the tails of the branches to the closing node. */
348: for (br = ret; br != NULL; br = regnext(br))
349: regoptail(br, ender);
350:
351: /* Check for proper termination. */
352: if (paren && *regparse++ != ')') {
353: FAIL("unmatched ()");
354: } else if (!paren && *regparse != '\0') {
355: if (*regparse == ')') {
356: FAIL("unmatched ()");
357: } else
358: FAIL("junk on end"); /* "Can't happen". */
359: /* NOTREACHED */
360: }
361:
362: return(ret);
363: }
364:
365: /*
366: - regbranch - one alternative of an | operator
367: *
368: * Implements the concatenation operator.
369: */
370: static char *
371: regbranch(flagp)
372: int *flagp;
373: {
1.7 millert 374: register char *ret;
375: register char *chain;
376: register char *latest;
1.1 etheisen 377: int flags;
378:
379: *flagp = WORST; /* Tentatively. */
380:
381: ret = regnode(BRANCH);
382: chain = NULL;
383: while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
384: latest = regpiece(&flags);
385: if (latest == NULL)
386: return(NULL);
387: *flagp |= flags&HASWIDTH;
388: if (chain == NULL) /* First piece. */
389: *flagp |= flags&SPSTART;
390: else
391: regtail(chain, latest);
392: chain = latest;
393: }
394: if (chain == NULL) /* Loop ran zero times. */
395: (void) regnode(NOTHING);
396:
397: return(ret);
398: }
399:
400: /*
401: - regpiece - something followed by possible [*+?]
402: *
403: * Note that the branching code sequences used for ? and the general cases
404: * of * and + are somewhat optimized: they use the same NOTHING node as
405: * both the endmarker for their branch list and the body of the last branch.
406: * It might seem that this node could be dispensed with entirely, but the
407: * endmarker role is not redundant.
408: */
409: static char *
410: regpiece(flagp)
411: int *flagp;
412: {
1.7 millert 413: register char *ret;
414: register char op;
415: register char *next;
1.1 etheisen 416: int flags;
417:
418: ret = regatom(&flags);
419: if (ret == NULL)
420: return(NULL);
421:
422: op = *regparse;
423: if (!ISMULT(op)) {
424: *flagp = flags;
425: return(ret);
426: }
427:
428: if (!(flags&HASWIDTH) && op != '?')
429: FAIL("*+ operand could be empty");
430: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
431:
432: if (op == '*' && (flags&SIMPLE))
433: reginsert(STAR, ret);
434: else if (op == '*') {
435: /* Emit x* as (x&|), where & means "self". */
436: reginsert(BRANCH, ret); /* Either x */
437: regoptail(ret, regnode(BACK)); /* and loop */
438: regoptail(ret, ret); /* back */
439: regtail(ret, regnode(BRANCH)); /* or */
440: regtail(ret, regnode(NOTHING)); /* null. */
441: } else if (op == '+' && (flags&SIMPLE))
442: reginsert(PLUS, ret);
443: else if (op == '+') {
444: /* Emit x+ as x(&|), where & means "self". */
445: next = regnode(BRANCH); /* Either */
446: regtail(ret, next);
447: regtail(regnode(BACK), ret); /* loop back */
448: regtail(next, regnode(BRANCH)); /* or */
449: regtail(ret, regnode(NOTHING)); /* null. */
450: } else if (op == '?') {
451: /* Emit x? as (x|) */
452: reginsert(BRANCH, ret); /* Either x */
453: regtail(ret, regnode(BRANCH)); /* or */
454: next = regnode(NOTHING); /* null. */
455: regtail(ret, next);
456: regoptail(ret, next);
457: }
458: regparse++;
459: if (ISMULT(*regparse))
460: FAIL("nested *?+");
461:
462: return(ret);
463: }
464:
465: /*
466: - regatom - the lowest level
467: *
468: * Optimization: gobbles an entire sequence of ordinary characters so that
469: * it can turn them into a single node, which is smaller to store and
470: * faster to run. Backslashed characters are exceptions, each becoming a
471: * separate node; the code is simpler that way and it's not worth fixing.
472: */
473: static char *
474: regatom(flagp)
475: int *flagp;
476: {
1.7 millert 477: register char *ret;
1.1 etheisen 478: int flags;
479:
480: *flagp = WORST; /* Tentatively. */
481:
482: switch (*regparse++) {
483: case '^':
484: ret = regnode(BOL);
485: break;
486: case '$':
487: ret = regnode(EOL);
488: break;
489: case '.':
490: ret = regnode(ANY);
491: *flagp |= HASWIDTH|SIMPLE;
492: break;
493: case '[': {
1.7 millert 494: register int clss;
495: register int classend;
1.1 etheisen 496:
497: if (*regparse == '^') { /* Complement of range. */
498: ret = regnode(ANYBUT);
499: regparse++;
500: } else
501: ret = regnode(ANYOF);
502: if (*regparse == ']' || *regparse == '-')
503: regc(*regparse++);
504: while (*regparse != '\0' && *regparse != ']') {
505: if (*regparse == '-') {
506: regparse++;
507: if (*regparse == ']' || *regparse == '\0')
508: regc('-');
509: else {
510: clss = UCHARAT(regparse-2)+1;
511: classend = UCHARAT(regparse);
512: if (clss > classend+1)
513: FAIL("invalid [] range");
514: for (; clss <= classend; clss++)
515: regc(clss);
516: regparse++;
517: }
518: } else
519: regc(*regparse++);
520: }
521: regc('\0');
522: if (*regparse != ']')
523: FAIL("unmatched []");
524: regparse++;
525: *flagp |= HASWIDTH|SIMPLE;
526: }
527: break;
528: case '(':
529: ret = reg(1, &flags);
530: if (ret == NULL)
531: return(NULL);
532: *flagp |= flags&(HASWIDTH|SPSTART);
533: break;
534: case '\0':
535: case '|':
536: case ')':
537: FAIL("internal urp"); /* Supposed to be caught earlier. */
538: /* NOTREACHED */
539: break;
540: case '?':
541: case '+':
542: case '*':
543: FAIL("?+* follows nothing");
544: /* NOTREACHED */
545: break;
546: case '\\':
547: if (*regparse == '\0')
548: FAIL("trailing \\");
549: ret = regnode(EXACTLY);
550: regc(*regparse++);
551: regc('\0');
552: *flagp |= HASWIDTH|SIMPLE;
553: break;
554: default: {
1.7 millert 555: register int len;
556: register char ender;
1.1 etheisen 557:
558: regparse--;
559: len = strcspn(regparse, META);
560: if (len <= 0)
561: FAIL("internal disaster");
562: ender = *(regparse+len);
563: if (len > 1 && ISMULT(ender))
564: len--; /* Back off clear of ?+* operand. */
565: *flagp |= HASWIDTH;
566: if (len == 1)
567: *flagp |= SIMPLE;
568: ret = regnode(EXACTLY);
569: while (len > 0) {
570: regc(*regparse++);
571: len--;
572: }
573: regc('\0');
574: }
575: break;
576: }
577:
578: return(ret);
579: }
580:
581: /*
582: - regnode - emit a node
583: */
584: static char * /* Location. */
585: regnode(op)
586: char op;
587: {
1.7 millert 588: register char *ret;
589: register char *ptr;
1.1 etheisen 590:
591: ret = regcode;
592: if (ret == ®dummy) {
593: regsize += 3;
594: return(ret);
595: }
596:
597: ptr = ret;
598: *ptr++ = op;
599: *ptr++ = '\0'; /* Null "next" pointer. */
600: *ptr++ = '\0';
601: regcode = ptr;
602:
603: return(ret);
604: }
605:
606: /*
607: - regc - emit (if appropriate) a byte of code
608: */
609: static void
610: regc(b)
611: char b;
612: {
613: if (regcode != ®dummy)
614: *regcode++ = b;
615: else
616: regsize++;
617: }
618:
619: /*
620: - reginsert - insert an operator in front of already-emitted operand
621: *
622: * Means relocating the operand.
623: */
624: static void
625: reginsert(op, opnd)
626: char op;
627: char *opnd;
628: {
1.7 millert 629: register char *src;
630: register char *dst;
631: register char *place;
1.1 etheisen 632:
633: if (regcode == ®dummy) {
634: regsize += 3;
635: return;
636: }
637:
638: src = regcode;
639: regcode += 3;
640: dst = regcode;
641: while (src > opnd)
642: *--dst = *--src;
643:
644: place = opnd; /* Op node, where operand used to be. */
645: *place++ = op;
646: *place++ = '\0';
647: *place++ = '\0';
648: }
649:
650: /*
651: - regtail - set the next-pointer at the end of a node chain
652: */
653: static void
654: regtail(p, val)
655: char *p;
656: char *val;
657: {
1.7 millert 658: register char *scan;
659: register char *temp;
660: register int offset;
1.1 etheisen 661:
662: if (p == ®dummy)
663: return;
664:
665: /* Find last node. */
666: scan = p;
667: for (;;) {
668: temp = regnext(scan);
669: if (temp == NULL)
670: break;
671: scan = temp;
672: }
673:
674: if (OP(scan) == BACK)
675: offset = scan - val;
676: else
677: offset = val - scan;
678: *(scan+1) = (offset>>8)&0377;
679: *(scan+2) = offset&0377;
680: }
681:
682: /*
683: - regoptail - regtail on operand of first argument; nop if operandless
684: */
685: static void
686: regoptail(p, val)
687: char *p;
688: char *val;
689: {
690: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
691: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
692: return;
693: regtail(OPERAND(p), val);
694: }
695:
696: /*
697: * regexec and friends
698: */
699:
700: /*
701: * Global work variables for regexec().
702: */
703: static char *reginput; /* String-input pointer. */
704: static char *regbol; /* Beginning of input, for ^ check. */
705: static char **regstartp; /* Pointer to startp array. */
706: static char **regendp; /* Ditto for endp. */
707:
708: /*
709: * Forwards.
710: */
711: STATIC int regtry();
712: STATIC int regmatch();
713: STATIC int regrepeat();
714:
715: #ifdef DEBUG
716: int regnarrate = 0;
717: void regdump();
718: STATIC char *regprop();
719: #endif
720:
721: /*
722: - regexec - match a regexp against a string
723: */
724: int
1.7 millert 725: regexec2(prog, string, notbol)
726: register regexp *prog;
727: register char *string;
728: int notbol;
1.1 etheisen 729: {
1.7 millert 730: register char *s;
1.1 etheisen 731:
732: /* Be paranoid... */
733: if (prog == NULL || string == NULL) {
734: regerror("NULL parameter");
735: return(0);
736: }
737:
738: /* Check validity of program. */
739: if (UCHARAT(prog->program) != MAGIC) {
740: regerror("corrupted program");
741: return(0);
742: }
743:
744: /* If there is a "must appear" string, look for it. */
745: if (prog->regmust != NULL) {
746: s = string;
747: while ((s = strchr(s, prog->regmust[0])) != NULL) {
748: if (strncmp(s, prog->regmust, prog->regmlen) == 0)
749: break; /* Found it. */
750: s++;
751: }
752: if (s == NULL) /* Not present. */
753: return(0);
754: }
755:
756: /* Mark beginning of line for ^ . */
1.7 millert 757: if (notbol)
758: regbol = NULL;
759: else
760: regbol = string;
1.1 etheisen 761:
762: /* Simplest case: anchored match need be tried only once. */
763: if (prog->reganch)
764: return(regtry(prog, string));
765:
766: /* Messy cases: unanchored match. */
767: s = string;
768: if (prog->regstart != '\0')
769: /* We know what char it must start with. */
770: while ((s = strchr(s, prog->regstart)) != NULL) {
771: if (regtry(prog, s))
772: return(1);
773: s++;
774: }
775: else
776: /* We don't -- general case. */
777: do {
778: if (regtry(prog, s))
779: return(1);
780: } while (*s++ != '\0');
781:
782: /* Failure. */
783: return(0);
784: }
785:
1.7 millert 786: int
787: regexec(prog, string)
788: register regexp *prog;
789: register char *string;
790: {
791: return regexec2(prog, string, 0);
792: }
793:
1.1 etheisen 794: /*
795: - regtry - try match at specific point
796: */
797: static int /* 0 failure, 1 success */
798: regtry(prog, string)
799: regexp *prog;
800: char *string;
801: {
1.7 millert 802: register int i;
803: register char **sp;
804: register char **ep;
1.1 etheisen 805:
806: reginput = string;
807: regstartp = prog->startp;
808: regendp = prog->endp;
809:
810: sp = prog->startp;
811: ep = prog->endp;
812: for (i = NSUBEXP; i > 0; i--) {
813: *sp++ = NULL;
814: *ep++ = NULL;
815: }
816: if (regmatch(prog->program + 1)) {
817: prog->startp[0] = string;
818: prog->endp[0] = reginput;
819: return(1);
820: } else
821: return(0);
822: }
823:
824: /*
825: - regmatch - main matching routine
826: *
827: * Conceptually the strategy is simple: check to see whether the current
828: * node matches, call self recursively to see whether the rest matches,
829: * and then act accordingly. In practice we make some effort to avoid
830: * recursion, in particular by going through "ordinary" nodes (that don't
831: * need to know whether the rest of the match failed) by a loop instead of
832: * by recursion.
833: */
834: static int /* 0 failure, 1 success */
835: regmatch(prog)
836: char *prog;
837: {
1.7 millert 838: register char *scan; /* Current node. */
1.1 etheisen 839: char *next; /* Next node. */
840:
841: scan = prog;
842: #ifdef DEBUG
843: if (scan != NULL && regnarrate)
844: fprintf(stderr, "%s(\n", regprop(scan));
845: #endif
846: while (scan != NULL) {
847: #ifdef DEBUG
848: if (regnarrate)
849: fprintf(stderr, "%s...\n", regprop(scan));
850: #endif
851: next = regnext(scan);
852:
853: switch (OP(scan)) {
854: case BOL:
855: if (reginput != regbol)
856: return(0);
857: break;
858: case EOL:
859: if (*reginput != '\0')
860: return(0);
861: break;
862: case ANY:
863: if (*reginput == '\0')
864: return(0);
865: reginput++;
866: break;
867: case EXACTLY: {
1.7 millert 868: register int len;
869: register char *opnd;
1.1 etheisen 870:
871: opnd = OPERAND(scan);
872: /* Inline the first character, for speed. */
873: if (*opnd != *reginput)
874: return(0);
875: len = strlen(opnd);
876: if (len > 1 && strncmp(opnd, reginput, len) != 0)
877: return(0);
878: reginput += len;
879: }
880: break;
881: case ANYOF:
882: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
883: return(0);
884: reginput++;
885: break;
886: case ANYBUT:
887: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
888: return(0);
889: reginput++;
890: break;
891: case NOTHING:
892: break;
893: case BACK:
894: break;
895: case OPEN+1:
896: case OPEN+2:
897: case OPEN+3:
898: case OPEN+4:
899: case OPEN+5:
900: case OPEN+6:
901: case OPEN+7:
902: case OPEN+8:
903: case OPEN+9: {
1.7 millert 904: register int no;
905: register char *save;
1.1 etheisen 906:
907: no = OP(scan) - OPEN;
908: save = reginput;
909:
910: if (regmatch(next)) {
911: /*
912: * Don't set startp if some later
913: * invocation of the same parentheses
914: * already has.
915: */
916: if (regstartp[no] == NULL)
917: regstartp[no] = save;
918: return(1);
919: } else
920: return(0);
921: }
922: /* NOTREACHED */
923: break;
924: case CLOSE+1:
925: case CLOSE+2:
926: case CLOSE+3:
927: case CLOSE+4:
928: case CLOSE+5:
929: case CLOSE+6:
930: case CLOSE+7:
931: case CLOSE+8:
932: case CLOSE+9: {
1.7 millert 933: register int no;
934: register char *save;
1.1 etheisen 935:
936: no = OP(scan) - CLOSE;
937: save = reginput;
938:
939: if (regmatch(next)) {
940: /*
941: * Don't set endp if some later
942: * invocation of the same parentheses
943: * already has.
944: */
945: if (regendp[no] == NULL)
946: regendp[no] = save;
947: return(1);
948: } else
949: return(0);
950: }
951: /* NOTREACHED */
952: break;
953: case BRANCH: {
1.7 millert 954: register char *save;
1.1 etheisen 955:
956: if (OP(next) != BRANCH) /* No choice. */
957: next = OPERAND(scan); /* Avoid recursion. */
958: else {
959: do {
960: save = reginput;
961: if (regmatch(OPERAND(scan)))
962: return(1);
963: reginput = save;
964: scan = regnext(scan);
965: } while (scan != NULL && OP(scan) == BRANCH);
966: return(0);
967: /* NOTREACHED */
968: }
969: }
970: /* NOTREACHED */
971: break;
972: case STAR:
973: case PLUS: {
1.7 millert 974: register char nextch;
975: register int no;
976: register char *save;
977: register int min;
1.1 etheisen 978:
979: /*
980: * Lookahead to avoid useless match attempts
981: * when we know what character comes next.
982: */
983: nextch = '\0';
984: if (OP(next) == EXACTLY)
985: nextch = *OPERAND(next);
986: min = (OP(scan) == STAR) ? 0 : 1;
987: save = reginput;
988: no = regrepeat(OPERAND(scan));
989: while (no >= min) {
990: /* If it could work, try it. */
991: if (nextch == '\0' || *reginput == nextch)
992: if (regmatch(next))
993: return(1);
994: /* Couldn't or didn't -- back up. */
995: no--;
996: reginput = save + no;
997: }
998: return(0);
999: }
1000: /* NOTREACHED */
1001: break;
1002: case END:
1003: return(1); /* Success! */
1004: /* NOTREACHED */
1005: break;
1006: default:
1007: regerror("memory corruption");
1008: return(0);
1009: /* NOTREACHED */
1010: break;
1011: }
1012:
1013: scan = next;
1014: }
1015:
1016: /*
1017: * We get here only if there's trouble -- normally "case END" is
1018: * the terminating point.
1019: */
1020: regerror("corrupted pointers");
1021: return(0);
1022: }
1023:
1024: /*
1025: - regrepeat - repeatedly match something simple, report how many
1026: */
1027: static int
1028: regrepeat(p)
1029: char *p;
1030: {
1.7 millert 1031: register int count = 0;
1032: register char *scan;
1033: register char *opnd;
1.1 etheisen 1034:
1035: scan = reginput;
1036: opnd = OPERAND(p);
1037: switch (OP(p)) {
1038: case ANY:
1039: count = strlen(scan);
1040: scan += count;
1041: break;
1042: case EXACTLY:
1043: while (*opnd == *scan) {
1044: count++;
1045: scan++;
1046: }
1047: break;
1048: case ANYOF:
1049: while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1050: count++;
1051: scan++;
1052: }
1053: break;
1054: case ANYBUT:
1055: while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1056: count++;
1057: scan++;
1058: }
1059: break;
1060: default: /* Oh dear. Called inappropriately. */
1061: regerror("internal foulup");
1062: count = 0; /* Best compromise. */
1063: break;
1064: }
1065: reginput = scan;
1066:
1067: return(count);
1068: }
1069:
1070: /*
1071: - regnext - dig the "next" pointer out of a node
1072: */
1073: static char *
1074: regnext(p)
1.7 millert 1075: register char *p;
1.1 etheisen 1076: {
1.7 millert 1077: register int offset;
1.1 etheisen 1078:
1079: if (p == ®dummy)
1080: return(NULL);
1081:
1082: offset = NEXT(p);
1083: if (offset == 0)
1084: return(NULL);
1085:
1086: if (OP(p) == BACK)
1087: return(p-offset);
1088: else
1089: return(p+offset);
1090: }
1091:
1092: #ifdef DEBUG
1093:
1094: STATIC char *regprop();
1095:
1096: /*
1097: - regdump - dump a regexp onto stdout in vaguely comprehensible form
1098: */
1099: void
1100: regdump(r)
1101: regexp *r;
1102: {
1.7 millert 1103: register char *s;
1104: register char op = EXACTLY; /* Arbitrary non-END op. */
1105: register char *next;
1.1 etheisen 1106:
1107:
1108: s = r->program + 1;
1109: while (op != END) { /* While that wasn't END last time... */
1110: op = OP(s);
1111: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1112: next = regnext(s);
1113: if (next == NULL) /* Next ptr. */
1114: printf("(0)");
1115: else
1116: printf("(%d)", (s-r->program)+(next-s));
1117: s += 3;
1118: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1119: /* Literal string, where present. */
1120: while (*s != '\0') {
1121: putchar(*s);
1122: s++;
1123: }
1124: s++;
1125: }
1126: putchar('\n');
1127: }
1128:
1129: /* Header fields of interest. */
1130: if (r->regstart != '\0')
1131: printf("start `%c' ", r->regstart);
1132: if (r->reganch)
1133: printf("anchored ");
1134: if (r->regmust != NULL)
1135: printf("must have \"%s\"", r->regmust);
1136: printf("\n");
1137: }
1138:
1139: /*
1140: - regprop - printable representation of opcode
1141: */
1142: static char *
1143: regprop(op)
1144: char *op;
1145: {
1.7 millert 1146: register char *p;
1.1 etheisen 1147: static char buf[50];
1148:
1.7 millert 1149: (void) strlcpy(buf, ":", sizeof(buf));
1.1 etheisen 1150:
1151: switch (OP(op)) {
1152: case BOL:
1153: p = "BOL";
1154: break;
1155: case EOL:
1156: p = "EOL";
1157: break;
1158: case ANY:
1159: p = "ANY";
1160: break;
1161: case ANYOF:
1162: p = "ANYOF";
1163: break;
1164: case ANYBUT:
1165: p = "ANYBUT";
1166: break;
1167: case BRANCH:
1168: p = "BRANCH";
1169: break;
1170: case EXACTLY:
1171: p = "EXACTLY";
1172: break;
1173: case NOTHING:
1174: p = "NOTHING";
1175: break;
1176: case BACK:
1177: p = "BACK";
1178: break;
1179: case END:
1180: p = "END";
1181: break;
1182: case OPEN+1:
1183: case OPEN+2:
1184: case OPEN+3:
1185: case OPEN+4:
1186: case OPEN+5:
1187: case OPEN+6:
1188: case OPEN+7:
1189: case OPEN+8:
1190: case OPEN+9:
1.7 millert 1191: snprintf(buf+strlen(buf), sizeof(buf)-strlen(buf),
1.6 deraadt 1192: "OPEN%d", OP(op)-OPEN);
1.1 etheisen 1193: p = NULL;
1194: break;
1195: case CLOSE+1:
1196: case CLOSE+2:
1197: case CLOSE+3:
1198: case CLOSE+4:
1199: case CLOSE+5:
1200: case CLOSE+6:
1201: case CLOSE+7:
1202: case CLOSE+8:
1203: case CLOSE+9:
1.7 millert 1204: snprintf(buf+strlen(buf), sizeof(buf)-strlen(buf),
1.6 deraadt 1205: "CLOSE%d", OP(op)-CLOSE);
1.1 etheisen 1206: p = NULL;
1207: break;
1208: case STAR:
1209: p = "STAR";
1210: break;
1211: case PLUS:
1212: p = "PLUS";
1213: break;
1214: default:
1215: regerror("corrupted opcode");
1216: break;
1217: }
1218: if (p != NULL)
1.7 millert 1219: (void) strlcat(buf, p, sizeof(buf));
1.1 etheisen 1220: return(buf);
1221: }
1222: #endif
1223:
1224: /*
1225: * The following is provided for those people who do not have strcspn() in
1226: * their C libraries. They should get off their butts and do something
1227: * about it; at least one public-domain implementation of those (highly
1228: * useful) string routines has been published on Usenet.
1229: */
1230: #ifdef STRCSPN
1231: /*
1232: * strcspn - find length of initial segment of s1 consisting entirely
1233: * of characters not from s2
1234: */
1235:
1236: static int
1237: strcspn(s1, s2)
1238: char *s1;
1239: char *s2;
1240: {
1.7 millert 1241: register char *scan1;
1242: register char *scan2;
1243: register int count;
1.1 etheisen 1244:
1245: count = 0;
1246: for (scan1 = s1; *scan1 != '\0'; scan1++) {
1247: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1248: if (*scan1 == *scan2++)
1249: return(count);
1250: count++;
1251: }
1252: return(count);
1253: }
1254: #endif