Annotation of src/usr.bin/less/regexp.c, Revision 1.6
1.6 ! deraadt 1: /* $OpenBSD: regexp.c,v 1.5 2003/04/06 18:42:57 deraadt Exp $ */
1.2 niklas 2:
1.1 etheisen 3: /*
4: * regcomp and regexec -- regsub and regerror are elsewhere
5: *
6: * Copyright (c) 1986 by University of Toronto.
7: * Written by Henry Spencer. Not derived from licensed software.
8: *
9: * Permission is granted to anyone to use this software for any
10: * purpose on any computer system, and to redistribute it freely,
11: * subject to the following restrictions:
12: *
13: * 1. The author is not responsible for the consequences of use of
14: * this software, no matter how awful, even if they arise
15: * from defects in it.
16: *
17: * 2. The origin of this software must not be misrepresented, either
18: * by explicit claim or by omission.
19: *
20: * 3. Altered versions must be plainly marked as such, and must not
21: * be misrepresented as being the original software.
22: *
23: * Beware that some of this code is subtly aware of the way operator
24: * precedence is structured in regular expressions. Serious changes in
25: * regular-expression syntax might require a total rethink.
26: *
27: * *** NOTE: this code has been altered slightly for use in Tcl. ***
28: * Slightly modified by David MacKenzie to undo most of the changes for TCL.
29: */
30:
31: #include <stdio.h>
32: #include "regexp.h"
33: char *strchr();
34:
35: /*
36: * The "internal use only" fields in regexp.h are present to pass info from
37: * compile to execute that permits the execute phase to run lots faster on
38: * simple cases. They are:
39: *
40: * regstart char that must begin a match; '\0' if none obvious
41: * reganch is the match anchored (at beginning-of-line only)?
42: * regmust string (pointer into program) that match must include, or NULL
43: * regmlen length of regmust string
44: *
45: * Regstart and reganch permit very fast decisions on suitable starting points
46: * for a match, cutting down the work a lot. Regmust permits fast rejection
47: * of lines that cannot possibly match. The regmust tests are costly enough
48: * that regcomp() supplies a regmust only if the r.e. contains something
49: * potentially expensive (at present, the only such thing detected is * or +
50: * at the start of the r.e., which can involve a lot of backup). Regmlen is
51: * supplied because the test in regexec() needs it and regcomp() is
52: * computing it anyway.
53: */
54:
55: /*
56: * Structure for regexp "program". This is essentially a linear encoding
57: * of a nondeterministic finite-state machine (aka syntax charts or
58: * "railroad normal form" in parsing technology). Each node is an opcode
59: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
60: * all nodes except BRANCH implement concatenation; a "next" pointer with
61: * a BRANCH on both ends of it is connecting two alternatives. (Here we
62: * have one of the subtle syntax dependencies: an individual BRANCH (as
63: * opposed to a collection of them) is never concatenated with anything
64: * because of operator precedence.) The operand of some types of node is
65: * a literal string; for others, it is a node leading into a sub-FSM. In
66: * particular, the operand of a BRANCH node is the first node of the branch.
67: * (NB this is *not* a tree structure: the tail of the branch connects
68: * to the thing following the set of BRANCHes.) The opcodes are:
69: */
70:
71: /* definition number opnd? meaning */
72: #define END 0 /* no End of program. */
73: #define BOL 1 /* no Match "" at beginning of line. */
74: #define EOL 2 /* no Match "" at end of line. */
75: #define ANY 3 /* no Match any one character. */
76: #define ANYOF 4 /* str Match any character in this string. */
77: #define ANYBUT 5 /* str Match any character not in this string. */
78: #define BRANCH 6 /* node Match this alternative, or the next... */
79: #define BACK 7 /* no Match "", "next" ptr points backward. */
80: #define EXACTLY 8 /* str Match this string. */
81: #define NOTHING 9 /* no Match empty string. */
82: #define STAR 10 /* node Match this (simple) thing 0 or more times. */
83: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
84: #define OPEN 20 /* no Mark this point in input as start of #n. */
85: /* OPEN+1 is number 1, etc. */
86: #define CLOSE 30 /* no Analogous to OPEN. */
87:
88: /*
89: * Opcode notes:
90: *
91: * BRANCH The set of branches constituting a single choice are hooked
92: * together with their "next" pointers, since precedence prevents
93: * anything being concatenated to any individual branch. The
94: * "next" pointer of the last BRANCH in a choice points to the
95: * thing following the whole choice. This is also where the
96: * final "next" pointer of each individual branch points; each
97: * branch starts with the operand node of a BRANCH node.
98: *
99: * BACK Normal "next" pointers all implicitly point forward; BACK
100: * exists to make loop structures possible.
101: *
102: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
103: * BRANCH structures using BACK. Simple cases (one character
104: * per match) are implemented with STAR and PLUS for speed
105: * and to minimize recursive plunges.
106: *
107: * OPEN,CLOSE ...are numbered at compile time.
108: */
109:
110: /*
111: * A node is one char of opcode followed by two chars of "next" pointer.
112: * "Next" pointers are stored as two 8-bit pieces, high order first. The
113: * value is a positive offset from the opcode of the node containing it.
114: * An operand, if any, simply follows the node. (Note that much of the
115: * code generation knows about this implicit relationship.)
116: *
117: * Using two bytes for the "next" pointer is vast overkill for most things,
118: * but allows patterns to get big without disasters.
119: */
120: #define OP(p) (*(p))
121: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
122: #define OPERAND(p) ((p) + 3)
123:
124: /*
125: * See regmagic.h for one further detail of program structure.
126: */
127:
128:
129: /*
130: * Utility definitions.
131: */
132: #ifndef CHARBITS
133: #define UCHARAT(p) ((int)*(unsigned char *)(p))
134: #else
135: #define UCHARAT(p) ((int)*(p)&CHARBITS)
136: #endif
137:
138: #define FAIL(m) { regerror(m); return(NULL); }
139: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
140: #define META "^$.[()|?+*\\"
141:
142: /*
143: * Flags to be passed up and down.
144: */
145: #define HASWIDTH 01 /* Known never to match null string. */
146: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
147: #define SPSTART 04 /* Starts with * or +. */
148: #define WORST 0 /* Worst case. */
149:
150: /*
151: * Global work variables for regcomp().
152: */
153: static char *regparse; /* Input-scan pointer. */
154: static int regnpar; /* () count. */
155: static char regdummy;
156: static char *regcode; /* Code-emit pointer; ®dummy = don't. */
157: static long regsize; /* Code size. */
158:
159: /*
160: * The first byte of the regexp internal "program" is actually this magic
161: * number; the start node begins in the second byte.
162: */
163: #define MAGIC 0234
164:
165:
166: /*
167: * Forward declarations for regcomp()'s friends.
168: */
169: #ifndef STATIC
170: #define STATIC static
171: #endif
172: STATIC char *reg();
173: STATIC char *regbranch();
174: STATIC char *regpiece();
175: STATIC char *regatom();
176: STATIC char *regnode();
177: STATIC char *regnext();
178: STATIC void regc();
179: STATIC void reginsert();
180: STATIC void regtail();
181: STATIC void regoptail();
182: #ifdef STRCSPN
183: STATIC int strcspn();
184: #endif
185:
186: /*
187: - regcomp - compile a regular expression into internal code
188: *
189: * We can't allocate space until we know how big the compiled form will be,
190: * but we can't compile it (and thus know how big it is) until we've got a
191: * place to put the code. So we cheat: we compile it twice, once with code
192: * generation turned off and size counting turned on, and once "for real".
193: * This also means that we don't allocate space until we are sure that the
194: * thing really will compile successfully, and we never have to move the
195: * code and thus invalidate pointers into it. (Note that it has to be in
196: * one piece because free() must be able to free it all.)
197: *
198: * Beware that the optimization-preparation code in here knows about some
199: * of the structure of the compiled regexp.
200: */
201: regexp *
202: regcomp(exp)
203: char *exp;
204: {
1.3 mpech 205: regexp *r;
206: char *scan;
207: char *longest;
208: int len;
1.1 etheisen 209: int flags;
210:
211: if (exp == NULL)
212: FAIL("NULL argument");
213:
214: /* First pass: determine size, legality. */
215: regparse = exp;
216: regnpar = 1;
217: regsize = 0L;
218: regcode = ®dummy;
219: regc(MAGIC);
220: if (reg(0, &flags) == NULL)
221: return(NULL);
222:
223: /* Small enough for pointer-storage convention? */
224: if (regsize >= 32767L) /* Probably could be 65535L. */
225: FAIL("regexp too big");
226:
227: /* Allocate space. */
228: r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
229: if (r == NULL)
230: FAIL("out of space");
231:
232: /* Second pass: emit code. */
233: regparse = exp;
234: regnpar = 1;
235: regcode = r->program;
236: regc(MAGIC);
237: if (reg(0, &flags) == NULL)
238: return(NULL);
239:
240: /* Dig out information for optimizations. */
241: r->regstart = '\0'; /* Worst-case defaults. */
242: r->reganch = 0;
243: r->regmust = NULL;
244: r->regmlen = 0;
245: scan = r->program+1; /* First BRANCH. */
246: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
247: scan = OPERAND(scan);
248:
249: /* Starting-point info. */
250: if (OP(scan) == EXACTLY)
251: r->regstart = *OPERAND(scan);
252: else if (OP(scan) == BOL)
253: r->reganch++;
254:
255: /*
256: * If there's something expensive in the r.e., find the
257: * longest literal string that must appear and make it the
258: * regmust. Resolve ties in favor of later strings, since
259: * the regstart check works with the beginning of the r.e.
260: * and avoiding duplication strengthens checking. Not a
261: * strong reason, but sufficient in the absence of others.
262: */
263: if (flags&SPSTART) {
264: longest = NULL;
265: len = 0;
266: for (; scan != NULL; scan = regnext(scan))
267: if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
268: longest = OPERAND(scan);
269: len = strlen(OPERAND(scan));
270: }
271: r->regmust = longest;
272: r->regmlen = len;
273: }
274: }
275:
276: return(r);
277: }
278:
279: /*
280: - reg - regular expression, i.e. main body or parenthesized thing
281: *
282: * Caller must absorb opening parenthesis.
283: *
284: * Combining parenthesis handling with the base level of regular expression
285: * is a trifle forced, but the need to tie the tails of the branches to what
286: * follows makes it hard to avoid.
287: */
288: static char *
289: reg(paren, flagp)
290: int paren; /* Parenthesized? */
291: int *flagp;
292: {
1.3 mpech 293: char *ret;
294: char *br;
295: char *ender;
296: int parno = 0;
1.1 etheisen 297: int flags;
298:
299: *flagp = HASWIDTH; /* Tentatively. */
300:
301: /* Make an OPEN node, if parenthesized. */
302: if (paren) {
303: if (regnpar >= NSUBEXP)
304: FAIL("too many ()");
305: parno = regnpar;
306: regnpar++;
307: ret = regnode(OPEN+parno);
308: } else
309: ret = NULL;
310:
311: /* Pick up the branches, linking them together. */
312: br = regbranch(&flags);
313: if (br == NULL)
314: return(NULL);
315: if (ret != NULL)
316: regtail(ret, br); /* OPEN -> first. */
317: else
318: ret = br;
319: if (!(flags&HASWIDTH))
320: *flagp &= ~HASWIDTH;
321: *flagp |= flags&SPSTART;
322: while (*regparse == '|') {
323: regparse++;
324: br = regbranch(&flags);
325: if (br == NULL)
326: return(NULL);
327: regtail(ret, br); /* BRANCH -> BRANCH. */
328: if (!(flags&HASWIDTH))
329: *flagp &= ~HASWIDTH;
330: *flagp |= flags&SPSTART;
331: }
332:
333: /* Make a closing node, and hook it on the end. */
334: ender = regnode((paren) ? CLOSE+parno : END);
335: regtail(ret, ender);
336:
337: /* Hook the tails of the branches to the closing node. */
338: for (br = ret; br != NULL; br = regnext(br))
339: regoptail(br, ender);
340:
341: /* Check for proper termination. */
342: if (paren && *regparse++ != ')') {
343: FAIL("unmatched ()");
344: } else if (!paren && *regparse != '\0') {
345: if (*regparse == ')') {
346: FAIL("unmatched ()");
347: } else
348: FAIL("junk on end"); /* "Can't happen". */
349: /* NOTREACHED */
350: }
351:
352: return(ret);
353: }
354:
355: /*
356: - regbranch - one alternative of an | operator
357: *
358: * Implements the concatenation operator.
359: */
360: static char *
361: regbranch(flagp)
362: int *flagp;
363: {
1.3 mpech 364: char *ret;
365: char *chain;
366: char *latest;
1.1 etheisen 367: int flags;
368:
369: *flagp = WORST; /* Tentatively. */
370:
371: ret = regnode(BRANCH);
372: chain = NULL;
373: while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
374: latest = regpiece(&flags);
375: if (latest == NULL)
376: return(NULL);
377: *flagp |= flags&HASWIDTH;
378: if (chain == NULL) /* First piece. */
379: *flagp |= flags&SPSTART;
380: else
381: regtail(chain, latest);
382: chain = latest;
383: }
384: if (chain == NULL) /* Loop ran zero times. */
385: (void) regnode(NOTHING);
386:
387: return(ret);
388: }
389:
390: /*
391: - regpiece - something followed by possible [*+?]
392: *
393: * Note that the branching code sequences used for ? and the general cases
394: * of * and + are somewhat optimized: they use the same NOTHING node as
395: * both the endmarker for their branch list and the body of the last branch.
396: * It might seem that this node could be dispensed with entirely, but the
397: * endmarker role is not redundant.
398: */
399: static char *
400: regpiece(flagp)
401: int *flagp;
402: {
1.3 mpech 403: char *ret;
404: char op;
405: char *next;
1.1 etheisen 406: int flags;
407:
408: ret = regatom(&flags);
409: if (ret == NULL)
410: return(NULL);
411:
412: op = *regparse;
413: if (!ISMULT(op)) {
414: *flagp = flags;
415: return(ret);
416: }
417:
418: if (!(flags&HASWIDTH) && op != '?')
419: FAIL("*+ operand could be empty");
420: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
421:
422: if (op == '*' && (flags&SIMPLE))
423: reginsert(STAR, ret);
424: else if (op == '*') {
425: /* Emit x* as (x&|), where & means "self". */
426: reginsert(BRANCH, ret); /* Either x */
427: regoptail(ret, regnode(BACK)); /* and loop */
428: regoptail(ret, ret); /* back */
429: regtail(ret, regnode(BRANCH)); /* or */
430: regtail(ret, regnode(NOTHING)); /* null. */
431: } else if (op == '+' && (flags&SIMPLE))
432: reginsert(PLUS, ret);
433: else if (op == '+') {
434: /* Emit x+ as x(&|), where & means "self". */
435: next = regnode(BRANCH); /* Either */
436: regtail(ret, next);
437: regtail(regnode(BACK), ret); /* loop back */
438: regtail(next, regnode(BRANCH)); /* or */
439: regtail(ret, regnode(NOTHING)); /* null. */
440: } else if (op == '?') {
441: /* Emit x? as (x|) */
442: reginsert(BRANCH, ret); /* Either x */
443: regtail(ret, regnode(BRANCH)); /* or */
444: next = regnode(NOTHING); /* null. */
445: regtail(ret, next);
446: regoptail(ret, next);
447: }
448: regparse++;
449: if (ISMULT(*regparse))
450: FAIL("nested *?+");
451:
452: return(ret);
453: }
454:
455: /*
456: - regatom - the lowest level
457: *
458: * Optimization: gobbles an entire sequence of ordinary characters so that
459: * it can turn them into a single node, which is smaller to store and
460: * faster to run. Backslashed characters are exceptions, each becoming a
461: * separate node; the code is simpler that way and it's not worth fixing.
462: */
463: static char *
464: regatom(flagp)
465: int *flagp;
466: {
1.3 mpech 467: char *ret;
1.1 etheisen 468: int flags;
469:
470: *flagp = WORST; /* Tentatively. */
471:
472: switch (*regparse++) {
473: case '^':
474: ret = regnode(BOL);
475: break;
476: case '$':
477: ret = regnode(EOL);
478: break;
479: case '.':
480: ret = regnode(ANY);
481: *flagp |= HASWIDTH|SIMPLE;
482: break;
483: case '[': {
1.3 mpech 484: int clss;
485: int classend;
1.1 etheisen 486:
487: if (*regparse == '^') { /* Complement of range. */
488: ret = regnode(ANYBUT);
489: regparse++;
490: } else
491: ret = regnode(ANYOF);
492: if (*regparse == ']' || *regparse == '-')
493: regc(*regparse++);
494: while (*regparse != '\0' && *regparse != ']') {
495: if (*regparse == '-') {
496: regparse++;
497: if (*regparse == ']' || *regparse == '\0')
498: regc('-');
499: else {
500: clss = UCHARAT(regparse-2)+1;
501: classend = UCHARAT(regparse);
502: if (clss > classend+1)
503: FAIL("invalid [] range");
504: for (; clss <= classend; clss++)
505: regc(clss);
506: regparse++;
507: }
508: } else
509: regc(*regparse++);
510: }
511: regc('\0');
512: if (*regparse != ']')
513: FAIL("unmatched []");
514: regparse++;
515: *flagp |= HASWIDTH|SIMPLE;
516: }
517: break;
518: case '(':
519: ret = reg(1, &flags);
520: if (ret == NULL)
521: return(NULL);
522: *flagp |= flags&(HASWIDTH|SPSTART);
523: break;
524: case '\0':
525: case '|':
526: case ')':
527: FAIL("internal urp"); /* Supposed to be caught earlier. */
528: /* NOTREACHED */
529: break;
530: case '?':
531: case '+':
532: case '*':
533: FAIL("?+* follows nothing");
534: /* NOTREACHED */
535: break;
536: case '\\':
537: if (*regparse == '\0')
538: FAIL("trailing \\");
539: ret = regnode(EXACTLY);
540: regc(*regparse++);
541: regc('\0');
542: *flagp |= HASWIDTH|SIMPLE;
543: break;
544: default: {
1.3 mpech 545: int len;
546: char ender;
1.1 etheisen 547:
548: regparse--;
549: len = strcspn(regparse, META);
550: if (len <= 0)
551: FAIL("internal disaster");
552: ender = *(regparse+len);
553: if (len > 1 && ISMULT(ender))
554: len--; /* Back off clear of ?+* operand. */
555: *flagp |= HASWIDTH;
556: if (len == 1)
557: *flagp |= SIMPLE;
558: ret = regnode(EXACTLY);
559: while (len > 0) {
560: regc(*regparse++);
561: len--;
562: }
563: regc('\0');
564: }
565: break;
566: }
567:
568: return(ret);
569: }
570:
571: /*
572: - regnode - emit a node
573: */
574: static char * /* Location. */
575: regnode(op)
576: char op;
577: {
1.3 mpech 578: char *ret;
579: char *ptr;
1.1 etheisen 580:
581: ret = regcode;
582: if (ret == ®dummy) {
583: regsize += 3;
584: return(ret);
585: }
586:
587: ptr = ret;
588: *ptr++ = op;
589: *ptr++ = '\0'; /* Null "next" pointer. */
590: *ptr++ = '\0';
591: regcode = ptr;
592:
593: return(ret);
594: }
595:
596: /*
597: - regc - emit (if appropriate) a byte of code
598: */
599: static void
600: regc(b)
601: char b;
602: {
603: if (regcode != ®dummy)
604: *regcode++ = b;
605: else
606: regsize++;
607: }
608:
609: /*
610: - reginsert - insert an operator in front of already-emitted operand
611: *
612: * Means relocating the operand.
613: */
614: static void
615: reginsert(op, opnd)
616: char op;
617: char *opnd;
618: {
1.3 mpech 619: char *src;
620: char *dst;
621: char *place;
1.1 etheisen 622:
623: if (regcode == ®dummy) {
624: regsize += 3;
625: return;
626: }
627:
628: src = regcode;
629: regcode += 3;
630: dst = regcode;
631: while (src > opnd)
632: *--dst = *--src;
633:
634: place = opnd; /* Op node, where operand used to be. */
635: *place++ = op;
636: *place++ = '\0';
637: *place++ = '\0';
638: }
639:
640: /*
641: - regtail - set the next-pointer at the end of a node chain
642: */
643: static void
644: regtail(p, val)
645: char *p;
646: char *val;
647: {
1.3 mpech 648: char *scan;
649: char *temp;
650: int offset;
1.1 etheisen 651:
652: if (p == ®dummy)
653: return;
654:
655: /* Find last node. */
656: scan = p;
657: for (;;) {
658: temp = regnext(scan);
659: if (temp == NULL)
660: break;
661: scan = temp;
662: }
663:
664: if (OP(scan) == BACK)
665: offset = scan - val;
666: else
667: offset = val - scan;
668: *(scan+1) = (offset>>8)&0377;
669: *(scan+2) = offset&0377;
670: }
671:
672: /*
673: - regoptail - regtail on operand of first argument; nop if operandless
674: */
675: static void
676: regoptail(p, val)
677: char *p;
678: char *val;
679: {
680: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
681: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
682: return;
683: regtail(OPERAND(p), val);
684: }
685:
686: /*
687: * regexec and friends
688: */
689:
690: /*
691: * Global work variables for regexec().
692: */
693: static char *reginput; /* String-input pointer. */
694: static char *regbol; /* Beginning of input, for ^ check. */
695: static char **regstartp; /* Pointer to startp array. */
696: static char **regendp; /* Ditto for endp. */
697:
698: /*
699: * Forwards.
700: */
701: STATIC int regtry();
702: STATIC int regmatch();
703: STATIC int regrepeat();
704:
705: #ifdef DEBUG
706: int regnarrate = 0;
707: void regdump();
708: STATIC char *regprop();
709: #endif
710:
711: /*
712: - regexec - match a regexp against a string
713: */
714: int
715: regexec(prog, string)
1.3 mpech 716: regexp *prog;
717: char *string;
1.1 etheisen 718: {
1.3 mpech 719: char *s;
1.1 etheisen 720:
721: /* Be paranoid... */
722: if (prog == NULL || string == NULL) {
723: regerror("NULL parameter");
724: return(0);
725: }
726:
727: /* Check validity of program. */
728: if (UCHARAT(prog->program) != MAGIC) {
729: regerror("corrupted program");
730: return(0);
731: }
732:
733: /* If there is a "must appear" string, look for it. */
734: if (prog->regmust != NULL) {
735: s = string;
736: while ((s = strchr(s, prog->regmust[0])) != NULL) {
737: if (strncmp(s, prog->regmust, prog->regmlen) == 0)
738: break; /* Found it. */
739: s++;
740: }
741: if (s == NULL) /* Not present. */
742: return(0);
743: }
744:
745: /* Mark beginning of line for ^ . */
746: regbol = string;
747:
748: /* Simplest case: anchored match need be tried only once. */
749: if (prog->reganch)
750: return(regtry(prog, string));
751:
752: /* Messy cases: unanchored match. */
753: s = string;
754: if (prog->regstart != '\0')
755: /* We know what char it must start with. */
756: while ((s = strchr(s, prog->regstart)) != NULL) {
757: if (regtry(prog, s))
758: return(1);
759: s++;
760: }
761: else
762: /* We don't -- general case. */
763: do {
764: if (regtry(prog, s))
765: return(1);
766: } while (*s++ != '\0');
767:
768: /* Failure. */
769: return(0);
770: }
771:
772: /*
773: - regtry - try match at specific point
774: */
775: static int /* 0 failure, 1 success */
776: regtry(prog, string)
777: regexp *prog;
778: char *string;
779: {
1.3 mpech 780: int i;
781: char **sp;
782: char **ep;
1.1 etheisen 783:
784: reginput = string;
785: regstartp = prog->startp;
786: regendp = prog->endp;
787:
788: sp = prog->startp;
789: ep = prog->endp;
790: for (i = NSUBEXP; i > 0; i--) {
791: *sp++ = NULL;
792: *ep++ = NULL;
793: }
794: if (regmatch(prog->program + 1)) {
795: prog->startp[0] = string;
796: prog->endp[0] = reginput;
797: return(1);
798: } else
799: return(0);
800: }
801:
802: /*
803: - regmatch - main matching routine
804: *
805: * Conceptually the strategy is simple: check to see whether the current
806: * node matches, call self recursively to see whether the rest matches,
807: * and then act accordingly. In practice we make some effort to avoid
808: * recursion, in particular by going through "ordinary" nodes (that don't
809: * need to know whether the rest of the match failed) by a loop instead of
810: * by recursion.
811: */
812: static int /* 0 failure, 1 success */
813: regmatch(prog)
814: char *prog;
815: {
1.3 mpech 816: char *scan; /* Current node. */
1.1 etheisen 817: char *next; /* Next node. */
818:
819: scan = prog;
820: #ifdef DEBUG
821: if (scan != NULL && regnarrate)
822: fprintf(stderr, "%s(\n", regprop(scan));
823: #endif
824: while (scan != NULL) {
825: #ifdef DEBUG
826: if (regnarrate)
827: fprintf(stderr, "%s...\n", regprop(scan));
828: #endif
829: next = regnext(scan);
830:
831: switch (OP(scan)) {
832: case BOL:
833: if (reginput != regbol)
834: return(0);
835: break;
836: case EOL:
837: if (*reginput != '\0')
838: return(0);
839: break;
840: case ANY:
841: if (*reginput == '\0')
842: return(0);
843: reginput++;
844: break;
845: case EXACTLY: {
1.3 mpech 846: int len;
847: char *opnd;
1.1 etheisen 848:
849: opnd = OPERAND(scan);
850: /* Inline the first character, for speed. */
851: if (*opnd != *reginput)
852: return(0);
853: len = strlen(opnd);
854: if (len > 1 && strncmp(opnd, reginput, len) != 0)
855: return(0);
856: reginput += len;
857: }
858: break;
859: case ANYOF:
860: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
861: return(0);
862: reginput++;
863: break;
864: case ANYBUT:
865: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
866: return(0);
867: reginput++;
868: break;
869: case NOTHING:
870: break;
871: case BACK:
872: break;
873: case OPEN+1:
874: case OPEN+2:
875: case OPEN+3:
876: case OPEN+4:
877: case OPEN+5:
878: case OPEN+6:
879: case OPEN+7:
880: case OPEN+8:
881: case OPEN+9: {
1.3 mpech 882: int no;
883: char *save;
1.1 etheisen 884:
885: no = OP(scan) - OPEN;
886: save = reginput;
887:
888: if (regmatch(next)) {
889: /*
890: * Don't set startp if some later
891: * invocation of the same parentheses
892: * already has.
893: */
894: if (regstartp[no] == NULL)
895: regstartp[no] = save;
896: return(1);
897: } else
898: return(0);
899: }
900: /* NOTREACHED */
901: break;
902: case CLOSE+1:
903: case CLOSE+2:
904: case CLOSE+3:
905: case CLOSE+4:
906: case CLOSE+5:
907: case CLOSE+6:
908: case CLOSE+7:
909: case CLOSE+8:
910: case CLOSE+9: {
1.3 mpech 911: int no;
912: char *save;
1.1 etheisen 913:
914: no = OP(scan) - CLOSE;
915: save = reginput;
916:
917: if (regmatch(next)) {
918: /*
919: * Don't set endp if some later
920: * invocation of the same parentheses
921: * already has.
922: */
923: if (regendp[no] == NULL)
924: regendp[no] = save;
925: return(1);
926: } else
927: return(0);
928: }
929: /* NOTREACHED */
930: break;
931: case BRANCH: {
1.3 mpech 932: char *save;
1.1 etheisen 933:
934: if (OP(next) != BRANCH) /* No choice. */
935: next = OPERAND(scan); /* Avoid recursion. */
936: else {
937: do {
938: save = reginput;
939: if (regmatch(OPERAND(scan)))
940: return(1);
941: reginput = save;
942: scan = regnext(scan);
943: } while (scan != NULL && OP(scan) == BRANCH);
944: return(0);
945: /* NOTREACHED */
946: }
947: }
948: /* NOTREACHED */
949: break;
950: case STAR:
951: case PLUS: {
1.3 mpech 952: char nextch;
953: int no;
954: char *save;
955: int min;
1.1 etheisen 956:
957: /*
958: * Lookahead to avoid useless match attempts
959: * when we know what character comes next.
960: */
961: nextch = '\0';
962: if (OP(next) == EXACTLY)
963: nextch = *OPERAND(next);
964: min = (OP(scan) == STAR) ? 0 : 1;
965: save = reginput;
966: no = regrepeat(OPERAND(scan));
967: while (no >= min) {
968: /* If it could work, try it. */
969: if (nextch == '\0' || *reginput == nextch)
970: if (regmatch(next))
971: return(1);
972: /* Couldn't or didn't -- back up. */
973: no--;
974: reginput = save + no;
975: }
976: return(0);
977: }
978: /* NOTREACHED */
979: break;
980: case END:
981: return(1); /* Success! */
982: /* NOTREACHED */
983: break;
984: default:
985: regerror("memory corruption");
986: return(0);
987: /* NOTREACHED */
988: break;
989: }
990:
991: scan = next;
992: }
993:
994: /*
995: * We get here only if there's trouble -- normally "case END" is
996: * the terminating point.
997: */
998: regerror("corrupted pointers");
999: return(0);
1000: }
1001:
1002: /*
1003: - regrepeat - repeatedly match something simple, report how many
1004: */
1005: static int
1006: regrepeat(p)
1007: char *p;
1008: {
1.3 mpech 1009: int count = 0;
1010: char *scan;
1011: char *opnd;
1.1 etheisen 1012:
1013: scan = reginput;
1014: opnd = OPERAND(p);
1015: switch (OP(p)) {
1016: case ANY:
1017: count = strlen(scan);
1018: scan += count;
1019: break;
1020: case EXACTLY:
1021: while (*opnd == *scan) {
1022: count++;
1023: scan++;
1024: }
1025: break;
1026: case ANYOF:
1027: while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1028: count++;
1029: scan++;
1030: }
1031: break;
1032: case ANYBUT:
1033: while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1034: count++;
1035: scan++;
1036: }
1037: break;
1038: default: /* Oh dear. Called inappropriately. */
1039: regerror("internal foulup");
1040: count = 0; /* Best compromise. */
1041: break;
1042: }
1043: reginput = scan;
1044:
1045: return(count);
1046: }
1047:
1048: /*
1049: - regnext - dig the "next" pointer out of a node
1050: */
1051: static char *
1052: regnext(p)
1.3 mpech 1053: char *p;
1.1 etheisen 1054: {
1.3 mpech 1055: int offset;
1.1 etheisen 1056:
1057: if (p == ®dummy)
1058: return(NULL);
1059:
1060: offset = NEXT(p);
1061: if (offset == 0)
1062: return(NULL);
1063:
1064: if (OP(p) == BACK)
1065: return(p-offset);
1066: else
1067: return(p+offset);
1068: }
1069:
1070: #ifdef DEBUG
1071:
1072: STATIC char *regprop();
1073:
1074: /*
1075: - regdump - dump a regexp onto stdout in vaguely comprehensible form
1076: */
1077: void
1078: regdump(r)
1079: regexp *r;
1080: {
1.3 mpech 1081: char *s;
1082: char op = EXACTLY; /* Arbitrary non-END op. */
1083: char *next;
1.1 etheisen 1084:
1085:
1086: s = r->program + 1;
1087: while (op != END) { /* While that wasn't END last time... */
1088: op = OP(s);
1089: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1090: next = regnext(s);
1091: if (next == NULL) /* Next ptr. */
1092: printf("(0)");
1093: else
1094: printf("(%d)", (s-r->program)+(next-s));
1095: s += 3;
1096: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1097: /* Literal string, where present. */
1098: while (*s != '\0') {
1099: putchar(*s);
1100: s++;
1101: }
1102: s++;
1103: }
1104: putchar('\n');
1105: }
1106:
1107: /* Header fields of interest. */
1108: if (r->regstart != '\0')
1109: printf("start `%c' ", r->regstart);
1110: if (r->reganch)
1111: printf("anchored ");
1112: if (r->regmust != NULL)
1113: printf("must have \"%s\"", r->regmust);
1114: printf("\n");
1115: }
1116:
1117: /*
1118: - regprop - printable representation of opcode
1119: */
1120: static char *
1121: regprop(op)
1122: char *op;
1123: {
1.3 mpech 1124: char *p;
1.1 etheisen 1125: static char buf[50];
1126:
1.4 deraadt 1127: (void) strlcpy(buf, ":", sizeof buf);
1.1 etheisen 1128:
1129: switch (OP(op)) {
1130: case BOL:
1131: p = "BOL";
1132: break;
1133: case EOL:
1134: p = "EOL";
1135: break;
1136: case ANY:
1137: p = "ANY";
1138: break;
1139: case ANYOF:
1140: p = "ANYOF";
1141: break;
1142: case ANYBUT:
1143: p = "ANYBUT";
1144: break;
1145: case BRANCH:
1146: p = "BRANCH";
1147: break;
1148: case EXACTLY:
1149: p = "EXACTLY";
1150: break;
1151: case NOTHING:
1152: p = "NOTHING";
1153: break;
1154: case BACK:
1155: p = "BACK";
1156: break;
1157: case END:
1158: p = "END";
1159: break;
1160: case OPEN+1:
1161: case OPEN+2:
1162: case OPEN+3:
1163: case OPEN+4:
1164: case OPEN+5:
1165: case OPEN+6:
1166: case OPEN+7:
1167: case OPEN+8:
1168: case OPEN+9:
1.6 ! deraadt 1169: snprintf(buf+strlen(buf), sizeof buf - strlen(buf),
! 1170: "OPEN%d", OP(op)-OPEN);
1.1 etheisen 1171: p = NULL;
1172: break;
1173: case CLOSE+1:
1174: case CLOSE+2:
1175: case CLOSE+3:
1176: case CLOSE+4:
1177: case CLOSE+5:
1178: case CLOSE+6:
1179: case CLOSE+7:
1180: case CLOSE+8:
1181: case CLOSE+9:
1.6 ! deraadt 1182: snprintf(buf+strlen(buf), sizeof buf - strlen(buf),
! 1183: "CLOSE%d", OP(op)-CLOSE);
1.1 etheisen 1184: p = NULL;
1185: break;
1186: case STAR:
1187: p = "STAR";
1188: break;
1189: case PLUS:
1190: p = "PLUS";
1191: break;
1192: default:
1193: regerror("corrupted opcode");
1194: break;
1195: }
1196: if (p != NULL)
1.5 deraadt 1197: (void) strlcat(buf, p, sizeof buf);
1.1 etheisen 1198: return(buf);
1199: }
1200: #endif
1201:
1202: /*
1203: * The following is provided for those people who do not have strcspn() in
1204: * their C libraries. They should get off their butts and do something
1205: * about it; at least one public-domain implementation of those (highly
1206: * useful) string routines has been published on Usenet.
1207: */
1208: #ifdef STRCSPN
1209: /*
1210: * strcspn - find length of initial segment of s1 consisting entirely
1211: * of characters not from s2
1212: */
1213:
1214: static int
1215: strcspn(s1, s2)
1216: char *s1;
1217: char *s2;
1218: {
1.3 mpech 1219: char *scan1;
1220: char *scan2;
1221: int count;
1.1 etheisen 1222:
1223: count = 0;
1224: for (scan1 = s1; *scan1 != '\0'; scan1++) {
1225: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1226: if (*scan1 == *scan2++)
1227: return(count);
1228: count++;
1229: }
1230: return(count);
1231: }
1232: #endif