Annotation of src/usr.bin/less/regexp.c, Revision 1.7
1.1 etheisen 1: /*
2: * regcomp and regexec -- regsub and regerror are elsewhere
3: *
4: * Copyright (c) 1986 by University of Toronto.
5: * Written by Henry Spencer. Not derived from licensed software.
6: *
7: * Permission is granted to anyone to use this software for any
8: * purpose on any computer system, and to redistribute it freely,
9: * subject to the following restrictions:
10: *
11: * 1. The author is not responsible for the consequences of use of
12: * this software, no matter how awful, even if they arise
13: * from defects in it.
14: *
15: * 2. The origin of this software must not be misrepresented, either
16: * by explicit claim or by omission.
17: *
18: * 3. Altered versions must be plainly marked as such, and must not
19: * be misrepresented as being the original software.
20: *
21: * Beware that some of this code is subtly aware of the way operator
22: * precedence is structured in regular expressions. Serious changes in
23: * regular-expression syntax might require a total rethink.
24: *
25: * *** NOTE: this code has been altered slightly for use in Tcl. ***
26: * Slightly modified by David MacKenzie to undo most of the changes for TCL.
1.7 ! millert 27: * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
1.1 etheisen 28: */
29:
1.7 ! millert 30: #include "less.h"
! 31: #if HAVE_STDIO_H
1.1 etheisen 32: #include <stdio.h>
1.7 ! millert 33: #endif
! 34: #if HAVE_STDLIB_H
! 35: #include <stdlib.h>
! 36: #endif
! 37: #if HAVE_STRING_H
! 38: #include <string.h>
! 39: #endif
1.1 etheisen 40: #include "regexp.h"
41:
42: /*
43: * The "internal use only" fields in regexp.h are present to pass info from
44: * compile to execute that permits the execute phase to run lots faster on
45: * simple cases. They are:
46: *
47: * regstart char that must begin a match; '\0' if none obvious
48: * reganch is the match anchored (at beginning-of-line only)?
49: * regmust string (pointer into program) that match must include, or NULL
50: * regmlen length of regmust string
51: *
52: * Regstart and reganch permit very fast decisions on suitable starting points
53: * for a match, cutting down the work a lot. Regmust permits fast rejection
54: * of lines that cannot possibly match. The regmust tests are costly enough
55: * that regcomp() supplies a regmust only if the r.e. contains something
56: * potentially expensive (at present, the only such thing detected is * or +
57: * at the start of the r.e., which can involve a lot of backup). Regmlen is
58: * supplied because the test in regexec() needs it and regcomp() is
59: * computing it anyway.
60: */
61:
62: /*
63: * Structure for regexp "program". This is essentially a linear encoding
64: * of a nondeterministic finite-state machine (aka syntax charts or
65: * "railroad normal form" in parsing technology). Each node is an opcode
66: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67: * all nodes except BRANCH implement concatenation; a "next" pointer with
68: * a BRANCH on both ends of it is connecting two alternatives. (Here we
69: * have one of the subtle syntax dependencies: an individual BRANCH (as
70: * opposed to a collection of them) is never concatenated with anything
71: * because of operator precedence.) The operand of some types of node is
72: * a literal string; for others, it is a node leading into a sub-FSM. In
73: * particular, the operand of a BRANCH node is the first node of the branch.
74: * (NB this is *not* a tree structure: the tail of the branch connects
75: * to the thing following the set of BRANCHes.) The opcodes are:
76: */
77:
78: /* definition number opnd? meaning */
1.7 ! millert 79: #undef EOL
1.1 etheisen 80: #define END 0 /* no End of program. */
81: #define BOL 1 /* no Match "" at beginning of line. */
82: #define EOL 2 /* no Match "" at end of line. */
83: #define ANY 3 /* no Match any one character. */
84: #define ANYOF 4 /* str Match any character in this string. */
85: #define ANYBUT 5 /* str Match any character not in this string. */
86: #define BRANCH 6 /* node Match this alternative, or the next... */
87: #define BACK 7 /* no Match "", "next" ptr points backward. */
88: #define EXACTLY 8 /* str Match this string. */
89: #define NOTHING 9 /* no Match empty string. */
90: #define STAR 10 /* node Match this (simple) thing 0 or more times. */
91: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
92: #define OPEN 20 /* no Mark this point in input as start of #n. */
93: /* OPEN+1 is number 1, etc. */
94: #define CLOSE 30 /* no Analogous to OPEN. */
95:
96: /*
97: * Opcode notes:
98: *
99: * BRANCH The set of branches constituting a single choice are hooked
100: * together with their "next" pointers, since precedence prevents
101: * anything being concatenated to any individual branch. The
102: * "next" pointer of the last BRANCH in a choice points to the
103: * thing following the whole choice. This is also where the
104: * final "next" pointer of each individual branch points; each
105: * branch starts with the operand node of a BRANCH node.
106: *
107: * BACK Normal "next" pointers all implicitly point forward; BACK
108: * exists to make loop structures possible.
109: *
110: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
111: * BRANCH structures using BACK. Simple cases (one character
112: * per match) are implemented with STAR and PLUS for speed
113: * and to minimize recursive plunges.
114: *
115: * OPEN,CLOSE ...are numbered at compile time.
116: */
117:
118: /*
119: * A node is one char of opcode followed by two chars of "next" pointer.
120: * "Next" pointers are stored as two 8-bit pieces, high order first. The
121: * value is a positive offset from the opcode of the node containing it.
122: * An operand, if any, simply follows the node. (Note that much of the
123: * code generation knows about this implicit relationship.)
124: *
125: * Using two bytes for the "next" pointer is vast overkill for most things,
126: * but allows patterns to get big without disasters.
127: */
128: #define OP(p) (*(p))
129: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130: #define OPERAND(p) ((p) + 3)
131:
132: /*
133: * See regmagic.h for one further detail of program structure.
134: */
135:
136:
137: /*
138: * Utility definitions.
139: */
140: #ifndef CHARBITS
141: #define UCHARAT(p) ((int)*(unsigned char *)(p))
142: #else
143: #define UCHARAT(p) ((int)*(p)&CHARBITS)
144: #endif
145:
146: #define FAIL(m) { regerror(m); return(NULL); }
147: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
148: #define META "^$.[()|?+*\\"
149:
150: /*
151: * Flags to be passed up and down.
152: */
153: #define HASWIDTH 01 /* Known never to match null string. */
154: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
155: #define SPSTART 04 /* Starts with * or +. */
156: #define WORST 0 /* Worst case. */
157:
158: /*
159: * Global work variables for regcomp().
160: */
161: static char *regparse; /* Input-scan pointer. */
162: static int regnpar; /* () count. */
163: static char regdummy;
164: static char *regcode; /* Code-emit pointer; ®dummy = don't. */
165: static long regsize; /* Code size. */
166:
167: /*
168: * The first byte of the regexp internal "program" is actually this magic
169: * number; the start node begins in the second byte.
170: */
171: #define MAGIC 0234
172:
173:
174: /*
175: * Forward declarations for regcomp()'s friends.
176: */
177: #ifndef STATIC
178: #define STATIC static
179: #endif
180: STATIC char *reg();
181: STATIC char *regbranch();
182: STATIC char *regpiece();
183: STATIC char *regatom();
184: STATIC char *regnode();
185: STATIC char *regnext();
186: STATIC void regc();
187: STATIC void reginsert();
188: STATIC void regtail();
189: STATIC void regoptail();
190: #ifdef STRCSPN
191: STATIC int strcspn();
192: #endif
193:
194: /*
195: - regcomp - compile a regular expression into internal code
196: *
197: * We can't allocate space until we know how big the compiled form will be,
198: * but we can't compile it (and thus know how big it is) until we've got a
199: * place to put the code. So we cheat: we compile it twice, once with code
200: * generation turned off and size counting turned on, and once "for real".
201: * This also means that we don't allocate space until we are sure that the
202: * thing really will compile successfully, and we never have to move the
203: * code and thus invalidate pointers into it. (Note that it has to be in
204: * one piece because free() must be able to free it all.)
205: *
206: * Beware that the optimization-preparation code in here knows about some
207: * of the structure of the compiled regexp.
208: */
209: regexp *
210: regcomp(exp)
211: char *exp;
212: {
1.7 ! millert 213: register regexp *r;
! 214: register char *scan;
! 215: register char *longest;
! 216: register int len;
1.1 etheisen 217: int flags;
218:
219: if (exp == NULL)
220: FAIL("NULL argument");
221:
222: /* First pass: determine size, legality. */
223: regparse = exp;
224: regnpar = 1;
225: regsize = 0L;
226: regcode = ®dummy;
227: regc(MAGIC);
228: if (reg(0, &flags) == NULL)
229: return(NULL);
230:
231: /* Small enough for pointer-storage convention? */
232: if (regsize >= 32767L) /* Probably could be 65535L. */
233: FAIL("regexp too big");
234:
235: /* Allocate space. */
236: r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
237: if (r == NULL)
238: FAIL("out of space");
239:
240: /* Second pass: emit code. */
241: regparse = exp;
242: regnpar = 1;
243: regcode = r->program;
244: regc(MAGIC);
245: if (reg(0, &flags) == NULL)
246: return(NULL);
247:
248: /* Dig out information for optimizations. */
249: r->regstart = '\0'; /* Worst-case defaults. */
250: r->reganch = 0;
251: r->regmust = NULL;
252: r->regmlen = 0;
253: scan = r->program+1; /* First BRANCH. */
254: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
255: scan = OPERAND(scan);
256:
257: /* Starting-point info. */
258: if (OP(scan) == EXACTLY)
259: r->regstart = *OPERAND(scan);
260: else if (OP(scan) == BOL)
261: r->reganch++;
262:
263: /*
264: * If there's something expensive in the r.e., find the
265: * longest literal string that must appear and make it the
266: * regmust. Resolve ties in favor of later strings, since
267: * the regstart check works with the beginning of the r.e.
268: * and avoiding duplication strengthens checking. Not a
269: * strong reason, but sufficient in the absence of others.
270: */
271: if (flags&SPSTART) {
272: longest = NULL;
273: len = 0;
274: for (; scan != NULL; scan = regnext(scan))
275: if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
276: longest = OPERAND(scan);
277: len = strlen(OPERAND(scan));
278: }
279: r->regmust = longest;
280: r->regmlen = len;
281: }
282: }
283:
284: return(r);
285: }
286:
287: /*
288: - reg - regular expression, i.e. main body or parenthesized thing
289: *
290: * Caller must absorb opening parenthesis.
291: *
292: * Combining parenthesis handling with the base level of regular expression
293: * is a trifle forced, but the need to tie the tails of the branches to what
294: * follows makes it hard to avoid.
295: */
296: static char *
297: reg(paren, flagp)
298: int paren; /* Parenthesized? */
299: int *flagp;
300: {
1.7 ! millert 301: register char *ret;
! 302: register char *br;
! 303: register char *ender;
! 304: register int parno = 0;
1.1 etheisen 305: int flags;
306:
307: *flagp = HASWIDTH; /* Tentatively. */
308:
309: /* Make an OPEN node, if parenthesized. */
310: if (paren) {
311: if (regnpar >= NSUBEXP)
312: FAIL("too many ()");
313: parno = regnpar;
314: regnpar++;
315: ret = regnode(OPEN+parno);
316: } else
317: ret = NULL;
318:
319: /* Pick up the branches, linking them together. */
320: br = regbranch(&flags);
321: if (br == NULL)
322: return(NULL);
323: if (ret != NULL)
324: regtail(ret, br); /* OPEN -> first. */
325: else
326: ret = br;
327: if (!(flags&HASWIDTH))
328: *flagp &= ~HASWIDTH;
329: *flagp |= flags&SPSTART;
330: while (*regparse == '|') {
331: regparse++;
332: br = regbranch(&flags);
333: if (br == NULL)
334: return(NULL);
335: regtail(ret, br); /* BRANCH -> BRANCH. */
336: if (!(flags&HASWIDTH))
337: *flagp &= ~HASWIDTH;
338: *flagp |= flags&SPSTART;
339: }
340:
341: /* Make a closing node, and hook it on the end. */
342: ender = regnode((paren) ? CLOSE+parno : END);
343: regtail(ret, ender);
344:
345: /* Hook the tails of the branches to the closing node. */
346: for (br = ret; br != NULL; br = regnext(br))
347: regoptail(br, ender);
348:
349: /* Check for proper termination. */
350: if (paren && *regparse++ != ')') {
351: FAIL("unmatched ()");
352: } else if (!paren && *regparse != '\0') {
353: if (*regparse == ')') {
354: FAIL("unmatched ()");
355: } else
356: FAIL("junk on end"); /* "Can't happen". */
357: /* NOTREACHED */
358: }
359:
360: return(ret);
361: }
362:
363: /*
364: - regbranch - one alternative of an | operator
365: *
366: * Implements the concatenation operator.
367: */
368: static char *
369: regbranch(flagp)
370: int *flagp;
371: {
1.7 ! millert 372: register char *ret;
! 373: register char *chain;
! 374: register char *latest;
1.1 etheisen 375: int flags;
376:
377: *flagp = WORST; /* Tentatively. */
378:
379: ret = regnode(BRANCH);
380: chain = NULL;
381: while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
382: latest = regpiece(&flags);
383: if (latest == NULL)
384: return(NULL);
385: *flagp |= flags&HASWIDTH;
386: if (chain == NULL) /* First piece. */
387: *flagp |= flags&SPSTART;
388: else
389: regtail(chain, latest);
390: chain = latest;
391: }
392: if (chain == NULL) /* Loop ran zero times. */
393: (void) regnode(NOTHING);
394:
395: return(ret);
396: }
397:
398: /*
399: - regpiece - something followed by possible [*+?]
400: *
401: * Note that the branching code sequences used for ? and the general cases
402: * of * and + are somewhat optimized: they use the same NOTHING node as
403: * both the endmarker for their branch list and the body of the last branch.
404: * It might seem that this node could be dispensed with entirely, but the
405: * endmarker role is not redundant.
406: */
407: static char *
408: regpiece(flagp)
409: int *flagp;
410: {
1.7 ! millert 411: register char *ret;
! 412: register char op;
! 413: register char *next;
1.1 etheisen 414: int flags;
415:
416: ret = regatom(&flags);
417: if (ret == NULL)
418: return(NULL);
419:
420: op = *regparse;
421: if (!ISMULT(op)) {
422: *flagp = flags;
423: return(ret);
424: }
425:
426: if (!(flags&HASWIDTH) && op != '?')
427: FAIL("*+ operand could be empty");
428: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
429:
430: if (op == '*' && (flags&SIMPLE))
431: reginsert(STAR, ret);
432: else if (op == '*') {
433: /* Emit x* as (x&|), where & means "self". */
434: reginsert(BRANCH, ret); /* Either x */
435: regoptail(ret, regnode(BACK)); /* and loop */
436: regoptail(ret, ret); /* back */
437: regtail(ret, regnode(BRANCH)); /* or */
438: regtail(ret, regnode(NOTHING)); /* null. */
439: } else if (op == '+' && (flags&SIMPLE))
440: reginsert(PLUS, ret);
441: else if (op == '+') {
442: /* Emit x+ as x(&|), where & means "self". */
443: next = regnode(BRANCH); /* Either */
444: regtail(ret, next);
445: regtail(regnode(BACK), ret); /* loop back */
446: regtail(next, regnode(BRANCH)); /* or */
447: regtail(ret, regnode(NOTHING)); /* null. */
448: } else if (op == '?') {
449: /* Emit x? as (x|) */
450: reginsert(BRANCH, ret); /* Either x */
451: regtail(ret, regnode(BRANCH)); /* or */
452: next = regnode(NOTHING); /* null. */
453: regtail(ret, next);
454: regoptail(ret, next);
455: }
456: regparse++;
457: if (ISMULT(*regparse))
458: FAIL("nested *?+");
459:
460: return(ret);
461: }
462:
463: /*
464: - regatom - the lowest level
465: *
466: * Optimization: gobbles an entire sequence of ordinary characters so that
467: * it can turn them into a single node, which is smaller to store and
468: * faster to run. Backslashed characters are exceptions, each becoming a
469: * separate node; the code is simpler that way and it's not worth fixing.
470: */
471: static char *
472: regatom(flagp)
473: int *flagp;
474: {
1.7 ! millert 475: register char *ret;
1.1 etheisen 476: int flags;
477:
478: *flagp = WORST; /* Tentatively. */
479:
480: switch (*regparse++) {
481: case '^':
482: ret = regnode(BOL);
483: break;
484: case '$':
485: ret = regnode(EOL);
486: break;
487: case '.':
488: ret = regnode(ANY);
489: *flagp |= HASWIDTH|SIMPLE;
490: break;
491: case '[': {
1.7 ! millert 492: register int clss;
! 493: register int classend;
1.1 etheisen 494:
495: if (*regparse == '^') { /* Complement of range. */
496: ret = regnode(ANYBUT);
497: regparse++;
498: } else
499: ret = regnode(ANYOF);
500: if (*regparse == ']' || *regparse == '-')
501: regc(*regparse++);
502: while (*regparse != '\0' && *regparse != ']') {
503: if (*regparse == '-') {
504: regparse++;
505: if (*regparse == ']' || *regparse == '\0')
506: regc('-');
507: else {
508: clss = UCHARAT(regparse-2)+1;
509: classend = UCHARAT(regparse);
510: if (clss > classend+1)
511: FAIL("invalid [] range");
512: for (; clss <= classend; clss++)
513: regc(clss);
514: regparse++;
515: }
516: } else
517: regc(*regparse++);
518: }
519: regc('\0');
520: if (*regparse != ']')
521: FAIL("unmatched []");
522: regparse++;
523: *flagp |= HASWIDTH|SIMPLE;
524: }
525: break;
526: case '(':
527: ret = reg(1, &flags);
528: if (ret == NULL)
529: return(NULL);
530: *flagp |= flags&(HASWIDTH|SPSTART);
531: break;
532: case '\0':
533: case '|':
534: case ')':
535: FAIL("internal urp"); /* Supposed to be caught earlier. */
536: /* NOTREACHED */
537: break;
538: case '?':
539: case '+':
540: case '*':
541: FAIL("?+* follows nothing");
542: /* NOTREACHED */
543: break;
544: case '\\':
545: if (*regparse == '\0')
546: FAIL("trailing \\");
547: ret = regnode(EXACTLY);
548: regc(*regparse++);
549: regc('\0');
550: *flagp |= HASWIDTH|SIMPLE;
551: break;
552: default: {
1.7 ! millert 553: register int len;
! 554: register char ender;
1.1 etheisen 555:
556: regparse--;
557: len = strcspn(regparse, META);
558: if (len <= 0)
559: FAIL("internal disaster");
560: ender = *(regparse+len);
561: if (len > 1 && ISMULT(ender))
562: len--; /* Back off clear of ?+* operand. */
563: *flagp |= HASWIDTH;
564: if (len == 1)
565: *flagp |= SIMPLE;
566: ret = regnode(EXACTLY);
567: while (len > 0) {
568: regc(*regparse++);
569: len--;
570: }
571: regc('\0');
572: }
573: break;
574: }
575:
576: return(ret);
577: }
578:
579: /*
580: - regnode - emit a node
581: */
582: static char * /* Location. */
583: regnode(op)
584: char op;
585: {
1.7 ! millert 586: register char *ret;
! 587: register char *ptr;
1.1 etheisen 588:
589: ret = regcode;
590: if (ret == ®dummy) {
591: regsize += 3;
592: return(ret);
593: }
594:
595: ptr = ret;
596: *ptr++ = op;
597: *ptr++ = '\0'; /* Null "next" pointer. */
598: *ptr++ = '\0';
599: regcode = ptr;
600:
601: return(ret);
602: }
603:
604: /*
605: - regc - emit (if appropriate) a byte of code
606: */
607: static void
608: regc(b)
609: char b;
610: {
611: if (regcode != ®dummy)
612: *regcode++ = b;
613: else
614: regsize++;
615: }
616:
617: /*
618: - reginsert - insert an operator in front of already-emitted operand
619: *
620: * Means relocating the operand.
621: */
622: static void
623: reginsert(op, opnd)
624: char op;
625: char *opnd;
626: {
1.7 ! millert 627: register char *src;
! 628: register char *dst;
! 629: register char *place;
1.1 etheisen 630:
631: if (regcode == ®dummy) {
632: regsize += 3;
633: return;
634: }
635:
636: src = regcode;
637: regcode += 3;
638: dst = regcode;
639: while (src > opnd)
640: *--dst = *--src;
641:
642: place = opnd; /* Op node, where operand used to be. */
643: *place++ = op;
644: *place++ = '\0';
645: *place++ = '\0';
646: }
647:
648: /*
649: - regtail - set the next-pointer at the end of a node chain
650: */
651: static void
652: regtail(p, val)
653: char *p;
654: char *val;
655: {
1.7 ! millert 656: register char *scan;
! 657: register char *temp;
! 658: register int offset;
1.1 etheisen 659:
660: if (p == ®dummy)
661: return;
662:
663: /* Find last node. */
664: scan = p;
665: for (;;) {
666: temp = regnext(scan);
667: if (temp == NULL)
668: break;
669: scan = temp;
670: }
671:
672: if (OP(scan) == BACK)
673: offset = scan - val;
674: else
675: offset = val - scan;
676: *(scan+1) = (offset>>8)&0377;
677: *(scan+2) = offset&0377;
678: }
679:
680: /*
681: - regoptail - regtail on operand of first argument; nop if operandless
682: */
683: static void
684: regoptail(p, val)
685: char *p;
686: char *val;
687: {
688: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
689: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
690: return;
691: regtail(OPERAND(p), val);
692: }
693:
694: /*
695: * regexec and friends
696: */
697:
698: /*
699: * Global work variables for regexec().
700: */
701: static char *reginput; /* String-input pointer. */
702: static char *regbol; /* Beginning of input, for ^ check. */
703: static char **regstartp; /* Pointer to startp array. */
704: static char **regendp; /* Ditto for endp. */
705:
706: /*
707: * Forwards.
708: */
709: STATIC int regtry();
710: STATIC int regmatch();
711: STATIC int regrepeat();
712:
713: #ifdef DEBUG
714: int regnarrate = 0;
715: void regdump();
716: STATIC char *regprop();
717: #endif
718:
719: /*
720: - regexec - match a regexp against a string
721: */
722: int
1.7 ! millert 723: regexec2(prog, string, notbol)
! 724: register regexp *prog;
! 725: register char *string;
! 726: int notbol;
1.1 etheisen 727: {
1.7 ! millert 728: register char *s;
1.1 etheisen 729:
730: /* Be paranoid... */
731: if (prog == NULL || string == NULL) {
732: regerror("NULL parameter");
733: return(0);
734: }
735:
736: /* Check validity of program. */
737: if (UCHARAT(prog->program) != MAGIC) {
738: regerror("corrupted program");
739: return(0);
740: }
741:
742: /* If there is a "must appear" string, look for it. */
743: if (prog->regmust != NULL) {
744: s = string;
745: while ((s = strchr(s, prog->regmust[0])) != NULL) {
746: if (strncmp(s, prog->regmust, prog->regmlen) == 0)
747: break; /* Found it. */
748: s++;
749: }
750: if (s == NULL) /* Not present. */
751: return(0);
752: }
753:
754: /* Mark beginning of line for ^ . */
1.7 ! millert 755: if (notbol)
! 756: regbol = NULL;
! 757: else
! 758: regbol = string;
1.1 etheisen 759:
760: /* Simplest case: anchored match need be tried only once. */
761: if (prog->reganch)
762: return(regtry(prog, string));
763:
764: /* Messy cases: unanchored match. */
765: s = string;
766: if (prog->regstart != '\0')
767: /* We know what char it must start with. */
768: while ((s = strchr(s, prog->regstart)) != NULL) {
769: if (regtry(prog, s))
770: return(1);
771: s++;
772: }
773: else
774: /* We don't -- general case. */
775: do {
776: if (regtry(prog, s))
777: return(1);
778: } while (*s++ != '\0');
779:
780: /* Failure. */
781: return(0);
782: }
783:
1.7 ! millert 784: int
! 785: regexec(prog, string)
! 786: register regexp *prog;
! 787: register char *string;
! 788: {
! 789: return regexec2(prog, string, 0);
! 790: }
! 791:
1.1 etheisen 792: /*
793: - regtry - try match at specific point
794: */
795: static int /* 0 failure, 1 success */
796: regtry(prog, string)
797: regexp *prog;
798: char *string;
799: {
1.7 ! millert 800: register int i;
! 801: register char **sp;
! 802: register char **ep;
1.1 etheisen 803:
804: reginput = string;
805: regstartp = prog->startp;
806: regendp = prog->endp;
807:
808: sp = prog->startp;
809: ep = prog->endp;
810: for (i = NSUBEXP; i > 0; i--) {
811: *sp++ = NULL;
812: *ep++ = NULL;
813: }
814: if (regmatch(prog->program + 1)) {
815: prog->startp[0] = string;
816: prog->endp[0] = reginput;
817: return(1);
818: } else
819: return(0);
820: }
821:
822: /*
823: - regmatch - main matching routine
824: *
825: * Conceptually the strategy is simple: check to see whether the current
826: * node matches, call self recursively to see whether the rest matches,
827: * and then act accordingly. In practice we make some effort to avoid
828: * recursion, in particular by going through "ordinary" nodes (that don't
829: * need to know whether the rest of the match failed) by a loop instead of
830: * by recursion.
831: */
832: static int /* 0 failure, 1 success */
833: regmatch(prog)
834: char *prog;
835: {
1.7 ! millert 836: register char *scan; /* Current node. */
1.1 etheisen 837: char *next; /* Next node. */
838:
839: scan = prog;
840: #ifdef DEBUG
841: if (scan != NULL && regnarrate)
842: fprintf(stderr, "%s(\n", regprop(scan));
843: #endif
844: while (scan != NULL) {
845: #ifdef DEBUG
846: if (regnarrate)
847: fprintf(stderr, "%s...\n", regprop(scan));
848: #endif
849: next = regnext(scan);
850:
851: switch (OP(scan)) {
852: case BOL:
853: if (reginput != regbol)
854: return(0);
855: break;
856: case EOL:
857: if (*reginput != '\0')
858: return(0);
859: break;
860: case ANY:
861: if (*reginput == '\0')
862: return(0);
863: reginput++;
864: break;
865: case EXACTLY: {
1.7 ! millert 866: register int len;
! 867: register char *opnd;
1.1 etheisen 868:
869: opnd = OPERAND(scan);
870: /* Inline the first character, for speed. */
871: if (*opnd != *reginput)
872: return(0);
873: len = strlen(opnd);
874: if (len > 1 && strncmp(opnd, reginput, len) != 0)
875: return(0);
876: reginput += len;
877: }
878: break;
879: case ANYOF:
880: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
881: return(0);
882: reginput++;
883: break;
884: case ANYBUT:
885: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
886: return(0);
887: reginput++;
888: break;
889: case NOTHING:
890: break;
891: case BACK:
892: break;
893: case OPEN+1:
894: case OPEN+2:
895: case OPEN+3:
896: case OPEN+4:
897: case OPEN+5:
898: case OPEN+6:
899: case OPEN+7:
900: case OPEN+8:
901: case OPEN+9: {
1.7 ! millert 902: register int no;
! 903: register char *save;
1.1 etheisen 904:
905: no = OP(scan) - OPEN;
906: save = reginput;
907:
908: if (regmatch(next)) {
909: /*
910: * Don't set startp if some later
911: * invocation of the same parentheses
912: * already has.
913: */
914: if (regstartp[no] == NULL)
915: regstartp[no] = save;
916: return(1);
917: } else
918: return(0);
919: }
920: /* NOTREACHED */
921: break;
922: case CLOSE+1:
923: case CLOSE+2:
924: case CLOSE+3:
925: case CLOSE+4:
926: case CLOSE+5:
927: case CLOSE+6:
928: case CLOSE+7:
929: case CLOSE+8:
930: case CLOSE+9: {
1.7 ! millert 931: register int no;
! 932: register char *save;
1.1 etheisen 933:
934: no = OP(scan) - CLOSE;
935: save = reginput;
936:
937: if (regmatch(next)) {
938: /*
939: * Don't set endp if some later
940: * invocation of the same parentheses
941: * already has.
942: */
943: if (regendp[no] == NULL)
944: regendp[no] = save;
945: return(1);
946: } else
947: return(0);
948: }
949: /* NOTREACHED */
950: break;
951: case BRANCH: {
1.7 ! millert 952: register char *save;
1.1 etheisen 953:
954: if (OP(next) != BRANCH) /* No choice. */
955: next = OPERAND(scan); /* Avoid recursion. */
956: else {
957: do {
958: save = reginput;
959: if (regmatch(OPERAND(scan)))
960: return(1);
961: reginput = save;
962: scan = regnext(scan);
963: } while (scan != NULL && OP(scan) == BRANCH);
964: return(0);
965: /* NOTREACHED */
966: }
967: }
968: /* NOTREACHED */
969: break;
970: case STAR:
971: case PLUS: {
1.7 ! millert 972: register char nextch;
! 973: register int no;
! 974: register char *save;
! 975: register int min;
1.1 etheisen 976:
977: /*
978: * Lookahead to avoid useless match attempts
979: * when we know what character comes next.
980: */
981: nextch = '\0';
982: if (OP(next) == EXACTLY)
983: nextch = *OPERAND(next);
984: min = (OP(scan) == STAR) ? 0 : 1;
985: save = reginput;
986: no = regrepeat(OPERAND(scan));
987: while (no >= min) {
988: /* If it could work, try it. */
989: if (nextch == '\0' || *reginput == nextch)
990: if (regmatch(next))
991: return(1);
992: /* Couldn't or didn't -- back up. */
993: no--;
994: reginput = save + no;
995: }
996: return(0);
997: }
998: /* NOTREACHED */
999: break;
1000: case END:
1001: return(1); /* Success! */
1002: /* NOTREACHED */
1003: break;
1004: default:
1005: regerror("memory corruption");
1006: return(0);
1007: /* NOTREACHED */
1008: break;
1009: }
1010:
1011: scan = next;
1012: }
1013:
1014: /*
1015: * We get here only if there's trouble -- normally "case END" is
1016: * the terminating point.
1017: */
1018: regerror("corrupted pointers");
1019: return(0);
1020: }
1021:
1022: /*
1023: - regrepeat - repeatedly match something simple, report how many
1024: */
1025: static int
1026: regrepeat(p)
1027: char *p;
1028: {
1.7 ! millert 1029: register int count = 0;
! 1030: register char *scan;
! 1031: register char *opnd;
1.1 etheisen 1032:
1033: scan = reginput;
1034: opnd = OPERAND(p);
1035: switch (OP(p)) {
1036: case ANY:
1037: count = strlen(scan);
1038: scan += count;
1039: break;
1040: case EXACTLY:
1041: while (*opnd == *scan) {
1042: count++;
1043: scan++;
1044: }
1045: break;
1046: case ANYOF:
1047: while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1048: count++;
1049: scan++;
1050: }
1051: break;
1052: case ANYBUT:
1053: while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1054: count++;
1055: scan++;
1056: }
1057: break;
1058: default: /* Oh dear. Called inappropriately. */
1059: regerror("internal foulup");
1060: count = 0; /* Best compromise. */
1061: break;
1062: }
1063: reginput = scan;
1064:
1065: return(count);
1066: }
1067:
1068: /*
1069: - regnext - dig the "next" pointer out of a node
1070: */
1071: static char *
1072: regnext(p)
1.7 ! millert 1073: register char *p;
1.1 etheisen 1074: {
1.7 ! millert 1075: register int offset;
1.1 etheisen 1076:
1077: if (p == ®dummy)
1078: return(NULL);
1079:
1080: offset = NEXT(p);
1081: if (offset == 0)
1082: return(NULL);
1083:
1084: if (OP(p) == BACK)
1085: return(p-offset);
1086: else
1087: return(p+offset);
1088: }
1089:
1090: #ifdef DEBUG
1091:
1092: STATIC char *regprop();
1093:
1094: /*
1095: - regdump - dump a regexp onto stdout in vaguely comprehensible form
1096: */
1097: void
1098: regdump(r)
1099: regexp *r;
1100: {
1.7 ! millert 1101: register char *s;
! 1102: register char op = EXACTLY; /* Arbitrary non-END op. */
! 1103: register char *next;
1.1 etheisen 1104:
1105:
1106: s = r->program + 1;
1107: while (op != END) { /* While that wasn't END last time... */
1108: op = OP(s);
1109: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1110: next = regnext(s);
1111: if (next == NULL) /* Next ptr. */
1112: printf("(0)");
1113: else
1114: printf("(%d)", (s-r->program)+(next-s));
1115: s += 3;
1116: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1117: /* Literal string, where present. */
1118: while (*s != '\0') {
1119: putchar(*s);
1120: s++;
1121: }
1122: s++;
1123: }
1124: putchar('\n');
1125: }
1126:
1127: /* Header fields of interest. */
1128: if (r->regstart != '\0')
1129: printf("start `%c' ", r->regstart);
1130: if (r->reganch)
1131: printf("anchored ");
1132: if (r->regmust != NULL)
1133: printf("must have \"%s\"", r->regmust);
1134: printf("\n");
1135: }
1136:
1137: /*
1138: - regprop - printable representation of opcode
1139: */
1140: static char *
1141: regprop(op)
1142: char *op;
1143: {
1.7 ! millert 1144: register char *p;
1.1 etheisen 1145: static char buf[50];
1146:
1.7 ! millert 1147: (void) strlcpy(buf, ":", sizeof(buf));
1.1 etheisen 1148:
1149: switch (OP(op)) {
1150: case BOL:
1151: p = "BOL";
1152: break;
1153: case EOL:
1154: p = "EOL";
1155: break;
1156: case ANY:
1157: p = "ANY";
1158: break;
1159: case ANYOF:
1160: p = "ANYOF";
1161: break;
1162: case ANYBUT:
1163: p = "ANYBUT";
1164: break;
1165: case BRANCH:
1166: p = "BRANCH";
1167: break;
1168: case EXACTLY:
1169: p = "EXACTLY";
1170: break;
1171: case NOTHING:
1172: p = "NOTHING";
1173: break;
1174: case BACK:
1175: p = "BACK";
1176: break;
1177: case END:
1178: p = "END";
1179: break;
1180: case OPEN+1:
1181: case OPEN+2:
1182: case OPEN+3:
1183: case OPEN+4:
1184: case OPEN+5:
1185: case OPEN+6:
1186: case OPEN+7:
1187: case OPEN+8:
1188: case OPEN+9:
1.7 ! millert 1189: snprintf(buf+strlen(buf), sizeof(buf)-strlen(buf),
1.6 deraadt 1190: "OPEN%d", OP(op)-OPEN);
1.1 etheisen 1191: p = NULL;
1192: break;
1193: case CLOSE+1:
1194: case CLOSE+2:
1195: case CLOSE+3:
1196: case CLOSE+4:
1197: case CLOSE+5:
1198: case CLOSE+6:
1199: case CLOSE+7:
1200: case CLOSE+8:
1201: case CLOSE+9:
1.7 ! millert 1202: snprintf(buf+strlen(buf), sizeof(buf)-strlen(buf),
1.6 deraadt 1203: "CLOSE%d", OP(op)-CLOSE);
1.1 etheisen 1204: p = NULL;
1205: break;
1206: case STAR:
1207: p = "STAR";
1208: break;
1209: case PLUS:
1210: p = "PLUS";
1211: break;
1212: default:
1213: regerror("corrupted opcode");
1214: break;
1215: }
1216: if (p != NULL)
1.7 ! millert 1217: (void) strlcat(buf, p, sizeof(buf));
1.1 etheisen 1218: return(buf);
1219: }
1220: #endif
1221:
1222: /*
1223: * The following is provided for those people who do not have strcspn() in
1224: * their C libraries. They should get off their butts and do something
1225: * about it; at least one public-domain implementation of those (highly
1226: * useful) string routines has been published on Usenet.
1227: */
1228: #ifdef STRCSPN
1229: /*
1230: * strcspn - find length of initial segment of s1 consisting entirely
1231: * of characters not from s2
1232: */
1233:
1234: static int
1235: strcspn(s1, s2)
1236: char *s1;
1237: char *s2;
1238: {
1.7 ! millert 1239: register char *scan1;
! 1240: register char *scan2;
! 1241: register int count;
1.1 etheisen 1242:
1243: count = 0;
1244: for (scan1 = s1; *scan1 != '\0'; scan1++) {
1245: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1246: if (*scan1 == *scan2++)
1247: return(count);
1248: count++;
1249: }
1250: return(count);
1251: }
1252: #endif