Annotation of src/usr.bin/less/regexp.c, Revision 1.1.1.1
1.1 etheisen 1: /*
2: * regcomp and regexec -- regsub and regerror are elsewhere
3: *
4: * Copyright (c) 1986 by University of Toronto.
5: * Written by Henry Spencer. Not derived from licensed software.
6: *
7: * Permission is granted to anyone to use this software for any
8: * purpose on any computer system, and to redistribute it freely,
9: * subject to the following restrictions:
10: *
11: * 1. The author is not responsible for the consequences of use of
12: * this software, no matter how awful, even if they arise
13: * from defects in it.
14: *
15: * 2. The origin of this software must not be misrepresented, either
16: * by explicit claim or by omission.
17: *
18: * 3. Altered versions must be plainly marked as such, and must not
19: * be misrepresented as being the original software.
20: *
21: * Beware that some of this code is subtly aware of the way operator
22: * precedence is structured in regular expressions. Serious changes in
23: * regular-expression syntax might require a total rethink.
24: *
25: * *** NOTE: this code has been altered slightly for use in Tcl. ***
26: * Slightly modified by David MacKenzie to undo most of the changes for TCL.
27: */
28:
29: #include <stdio.h>
30: #include "regexp.h"
31: char *strchr();
32:
33: /*
34: * The "internal use only" fields in regexp.h are present to pass info from
35: * compile to execute that permits the execute phase to run lots faster on
36: * simple cases. They are:
37: *
38: * regstart char that must begin a match; '\0' if none obvious
39: * reganch is the match anchored (at beginning-of-line only)?
40: * regmust string (pointer into program) that match must include, or NULL
41: * regmlen length of regmust string
42: *
43: * Regstart and reganch permit very fast decisions on suitable starting points
44: * for a match, cutting down the work a lot. Regmust permits fast rejection
45: * of lines that cannot possibly match. The regmust tests are costly enough
46: * that regcomp() supplies a regmust only if the r.e. contains something
47: * potentially expensive (at present, the only such thing detected is * or +
48: * at the start of the r.e., which can involve a lot of backup). Regmlen is
49: * supplied because the test in regexec() needs it and regcomp() is
50: * computing it anyway.
51: */
52:
53: /*
54: * Structure for regexp "program". This is essentially a linear encoding
55: * of a nondeterministic finite-state machine (aka syntax charts or
56: * "railroad normal form" in parsing technology). Each node is an opcode
57: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
58: * all nodes except BRANCH implement concatenation; a "next" pointer with
59: * a BRANCH on both ends of it is connecting two alternatives. (Here we
60: * have one of the subtle syntax dependencies: an individual BRANCH (as
61: * opposed to a collection of them) is never concatenated with anything
62: * because of operator precedence.) The operand of some types of node is
63: * a literal string; for others, it is a node leading into a sub-FSM. In
64: * particular, the operand of a BRANCH node is the first node of the branch.
65: * (NB this is *not* a tree structure: the tail of the branch connects
66: * to the thing following the set of BRANCHes.) The opcodes are:
67: */
68:
69: /* definition number opnd? meaning */
70: #define END 0 /* no End of program. */
71: #define BOL 1 /* no Match "" at beginning of line. */
72: #define EOL 2 /* no Match "" at end of line. */
73: #define ANY 3 /* no Match any one character. */
74: #define ANYOF 4 /* str Match any character in this string. */
75: #define ANYBUT 5 /* str Match any character not in this string. */
76: #define BRANCH 6 /* node Match this alternative, or the next... */
77: #define BACK 7 /* no Match "", "next" ptr points backward. */
78: #define EXACTLY 8 /* str Match this string. */
79: #define NOTHING 9 /* no Match empty string. */
80: #define STAR 10 /* node Match this (simple) thing 0 or more times. */
81: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
82: #define OPEN 20 /* no Mark this point in input as start of #n. */
83: /* OPEN+1 is number 1, etc. */
84: #define CLOSE 30 /* no Analogous to OPEN. */
85:
86: /*
87: * Opcode notes:
88: *
89: * BRANCH The set of branches constituting a single choice are hooked
90: * together with their "next" pointers, since precedence prevents
91: * anything being concatenated to any individual branch. The
92: * "next" pointer of the last BRANCH in a choice points to the
93: * thing following the whole choice. This is also where the
94: * final "next" pointer of each individual branch points; each
95: * branch starts with the operand node of a BRANCH node.
96: *
97: * BACK Normal "next" pointers all implicitly point forward; BACK
98: * exists to make loop structures possible.
99: *
100: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
101: * BRANCH structures using BACK. Simple cases (one character
102: * per match) are implemented with STAR and PLUS for speed
103: * and to minimize recursive plunges.
104: *
105: * OPEN,CLOSE ...are numbered at compile time.
106: */
107:
108: /*
109: * A node is one char of opcode followed by two chars of "next" pointer.
110: * "Next" pointers are stored as two 8-bit pieces, high order first. The
111: * value is a positive offset from the opcode of the node containing it.
112: * An operand, if any, simply follows the node. (Note that much of the
113: * code generation knows about this implicit relationship.)
114: *
115: * Using two bytes for the "next" pointer is vast overkill for most things,
116: * but allows patterns to get big without disasters.
117: */
118: #define OP(p) (*(p))
119: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
120: #define OPERAND(p) ((p) + 3)
121:
122: /*
123: * See regmagic.h for one further detail of program structure.
124: */
125:
126:
127: /*
128: * Utility definitions.
129: */
130: #ifndef CHARBITS
131: #define UCHARAT(p) ((int)*(unsigned char *)(p))
132: #else
133: #define UCHARAT(p) ((int)*(p)&CHARBITS)
134: #endif
135:
136: #define FAIL(m) { regerror(m); return(NULL); }
137: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
138: #define META "^$.[()|?+*\\"
139:
140: /*
141: * Flags to be passed up and down.
142: */
143: #define HASWIDTH 01 /* Known never to match null string. */
144: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
145: #define SPSTART 04 /* Starts with * or +. */
146: #define WORST 0 /* Worst case. */
147:
148: /*
149: * Global work variables for regcomp().
150: */
151: static char *regparse; /* Input-scan pointer. */
152: static int regnpar; /* () count. */
153: static char regdummy;
154: static char *regcode; /* Code-emit pointer; ®dummy = don't. */
155: static long regsize; /* Code size. */
156:
157: /*
158: * The first byte of the regexp internal "program" is actually this magic
159: * number; the start node begins in the second byte.
160: */
161: #define MAGIC 0234
162:
163:
164: /*
165: * Forward declarations for regcomp()'s friends.
166: */
167: #ifndef STATIC
168: #define STATIC static
169: #endif
170: STATIC char *reg();
171: STATIC char *regbranch();
172: STATIC char *regpiece();
173: STATIC char *regatom();
174: STATIC char *regnode();
175: STATIC char *regnext();
176: STATIC void regc();
177: STATIC void reginsert();
178: STATIC void regtail();
179: STATIC void regoptail();
180: #ifdef STRCSPN
181: STATIC int strcspn();
182: #endif
183:
184: /*
185: - regcomp - compile a regular expression into internal code
186: *
187: * We can't allocate space until we know how big the compiled form will be,
188: * but we can't compile it (and thus know how big it is) until we've got a
189: * place to put the code. So we cheat: we compile it twice, once with code
190: * generation turned off and size counting turned on, and once "for real".
191: * This also means that we don't allocate space until we are sure that the
192: * thing really will compile successfully, and we never have to move the
193: * code and thus invalidate pointers into it. (Note that it has to be in
194: * one piece because free() must be able to free it all.)
195: *
196: * Beware that the optimization-preparation code in here knows about some
197: * of the structure of the compiled regexp.
198: */
199: regexp *
200: regcomp(exp)
201: char *exp;
202: {
203: register regexp *r;
204: register char *scan;
205: register char *longest;
206: register int len;
207: int flags;
208:
209: if (exp == NULL)
210: FAIL("NULL argument");
211:
212: /* First pass: determine size, legality. */
213: regparse = exp;
214: regnpar = 1;
215: regsize = 0L;
216: regcode = ®dummy;
217: regc(MAGIC);
218: if (reg(0, &flags) == NULL)
219: return(NULL);
220:
221: /* Small enough for pointer-storage convention? */
222: if (regsize >= 32767L) /* Probably could be 65535L. */
223: FAIL("regexp too big");
224:
225: /* Allocate space. */
226: r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
227: if (r == NULL)
228: FAIL("out of space");
229:
230: /* Second pass: emit code. */
231: regparse = exp;
232: regnpar = 1;
233: regcode = r->program;
234: regc(MAGIC);
235: if (reg(0, &flags) == NULL)
236: return(NULL);
237:
238: /* Dig out information for optimizations. */
239: r->regstart = '\0'; /* Worst-case defaults. */
240: r->reganch = 0;
241: r->regmust = NULL;
242: r->regmlen = 0;
243: scan = r->program+1; /* First BRANCH. */
244: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
245: scan = OPERAND(scan);
246:
247: /* Starting-point info. */
248: if (OP(scan) == EXACTLY)
249: r->regstart = *OPERAND(scan);
250: else if (OP(scan) == BOL)
251: r->reganch++;
252:
253: /*
254: * If there's something expensive in the r.e., find the
255: * longest literal string that must appear and make it the
256: * regmust. Resolve ties in favor of later strings, since
257: * the regstart check works with the beginning of the r.e.
258: * and avoiding duplication strengthens checking. Not a
259: * strong reason, but sufficient in the absence of others.
260: */
261: if (flags&SPSTART) {
262: longest = NULL;
263: len = 0;
264: for (; scan != NULL; scan = regnext(scan))
265: if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
266: longest = OPERAND(scan);
267: len = strlen(OPERAND(scan));
268: }
269: r->regmust = longest;
270: r->regmlen = len;
271: }
272: }
273:
274: return(r);
275: }
276:
277: /*
278: - reg - regular expression, i.e. main body or parenthesized thing
279: *
280: * Caller must absorb opening parenthesis.
281: *
282: * Combining parenthesis handling with the base level of regular expression
283: * is a trifle forced, but the need to tie the tails of the branches to what
284: * follows makes it hard to avoid.
285: */
286: static char *
287: reg(paren, flagp)
288: int paren; /* Parenthesized? */
289: int *flagp;
290: {
291: register char *ret;
292: register char *br;
293: register char *ender;
294: register int parno = 0;
295: int flags;
296:
297: *flagp = HASWIDTH; /* Tentatively. */
298:
299: /* Make an OPEN node, if parenthesized. */
300: if (paren) {
301: if (regnpar >= NSUBEXP)
302: FAIL("too many ()");
303: parno = regnpar;
304: regnpar++;
305: ret = regnode(OPEN+parno);
306: } else
307: ret = NULL;
308:
309: /* Pick up the branches, linking them together. */
310: br = regbranch(&flags);
311: if (br == NULL)
312: return(NULL);
313: if (ret != NULL)
314: regtail(ret, br); /* OPEN -> first. */
315: else
316: ret = br;
317: if (!(flags&HASWIDTH))
318: *flagp &= ~HASWIDTH;
319: *flagp |= flags&SPSTART;
320: while (*regparse == '|') {
321: regparse++;
322: br = regbranch(&flags);
323: if (br == NULL)
324: return(NULL);
325: regtail(ret, br); /* BRANCH -> BRANCH. */
326: if (!(flags&HASWIDTH))
327: *flagp &= ~HASWIDTH;
328: *flagp |= flags&SPSTART;
329: }
330:
331: /* Make a closing node, and hook it on the end. */
332: ender = regnode((paren) ? CLOSE+parno : END);
333: regtail(ret, ender);
334:
335: /* Hook the tails of the branches to the closing node. */
336: for (br = ret; br != NULL; br = regnext(br))
337: regoptail(br, ender);
338:
339: /* Check for proper termination. */
340: if (paren && *regparse++ != ')') {
341: FAIL("unmatched ()");
342: } else if (!paren && *regparse != '\0') {
343: if (*regparse == ')') {
344: FAIL("unmatched ()");
345: } else
346: FAIL("junk on end"); /* "Can't happen". */
347: /* NOTREACHED */
348: }
349:
350: return(ret);
351: }
352:
353: /*
354: - regbranch - one alternative of an | operator
355: *
356: * Implements the concatenation operator.
357: */
358: static char *
359: regbranch(flagp)
360: int *flagp;
361: {
362: register char *ret;
363: register char *chain;
364: register char *latest;
365: int flags;
366:
367: *flagp = WORST; /* Tentatively. */
368:
369: ret = regnode(BRANCH);
370: chain = NULL;
371: while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
372: latest = regpiece(&flags);
373: if (latest == NULL)
374: return(NULL);
375: *flagp |= flags&HASWIDTH;
376: if (chain == NULL) /* First piece. */
377: *flagp |= flags&SPSTART;
378: else
379: regtail(chain, latest);
380: chain = latest;
381: }
382: if (chain == NULL) /* Loop ran zero times. */
383: (void) regnode(NOTHING);
384:
385: return(ret);
386: }
387:
388: /*
389: - regpiece - something followed by possible [*+?]
390: *
391: * Note that the branching code sequences used for ? and the general cases
392: * of * and + are somewhat optimized: they use the same NOTHING node as
393: * both the endmarker for their branch list and the body of the last branch.
394: * It might seem that this node could be dispensed with entirely, but the
395: * endmarker role is not redundant.
396: */
397: static char *
398: regpiece(flagp)
399: int *flagp;
400: {
401: register char *ret;
402: register char op;
403: register char *next;
404: int flags;
405:
406: ret = regatom(&flags);
407: if (ret == NULL)
408: return(NULL);
409:
410: op = *regparse;
411: if (!ISMULT(op)) {
412: *flagp = flags;
413: return(ret);
414: }
415:
416: if (!(flags&HASWIDTH) && op != '?')
417: FAIL("*+ operand could be empty");
418: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
419:
420: if (op == '*' && (flags&SIMPLE))
421: reginsert(STAR, ret);
422: else if (op == '*') {
423: /* Emit x* as (x&|), where & means "self". */
424: reginsert(BRANCH, ret); /* Either x */
425: regoptail(ret, regnode(BACK)); /* and loop */
426: regoptail(ret, ret); /* back */
427: regtail(ret, regnode(BRANCH)); /* or */
428: regtail(ret, regnode(NOTHING)); /* null. */
429: } else if (op == '+' && (flags&SIMPLE))
430: reginsert(PLUS, ret);
431: else if (op == '+') {
432: /* Emit x+ as x(&|), where & means "self". */
433: next = regnode(BRANCH); /* Either */
434: regtail(ret, next);
435: regtail(regnode(BACK), ret); /* loop back */
436: regtail(next, regnode(BRANCH)); /* or */
437: regtail(ret, regnode(NOTHING)); /* null. */
438: } else if (op == '?') {
439: /* Emit x? as (x|) */
440: reginsert(BRANCH, ret); /* Either x */
441: regtail(ret, regnode(BRANCH)); /* or */
442: next = regnode(NOTHING); /* null. */
443: regtail(ret, next);
444: regoptail(ret, next);
445: }
446: regparse++;
447: if (ISMULT(*regparse))
448: FAIL("nested *?+");
449:
450: return(ret);
451: }
452:
453: /*
454: - regatom - the lowest level
455: *
456: * Optimization: gobbles an entire sequence of ordinary characters so that
457: * it can turn them into a single node, which is smaller to store and
458: * faster to run. Backslashed characters are exceptions, each becoming a
459: * separate node; the code is simpler that way and it's not worth fixing.
460: */
461: static char *
462: regatom(flagp)
463: int *flagp;
464: {
465: register char *ret;
466: int flags;
467:
468: *flagp = WORST; /* Tentatively. */
469:
470: switch (*regparse++) {
471: case '^':
472: ret = regnode(BOL);
473: break;
474: case '$':
475: ret = regnode(EOL);
476: break;
477: case '.':
478: ret = regnode(ANY);
479: *flagp |= HASWIDTH|SIMPLE;
480: break;
481: case '[': {
482: register int clss;
483: register int classend;
484:
485: if (*regparse == '^') { /* Complement of range. */
486: ret = regnode(ANYBUT);
487: regparse++;
488: } else
489: ret = regnode(ANYOF);
490: if (*regparse == ']' || *regparse == '-')
491: regc(*regparse++);
492: while (*regparse != '\0' && *regparse != ']') {
493: if (*regparse == '-') {
494: regparse++;
495: if (*regparse == ']' || *regparse == '\0')
496: regc('-');
497: else {
498: clss = UCHARAT(regparse-2)+1;
499: classend = UCHARAT(regparse);
500: if (clss > classend+1)
501: FAIL("invalid [] range");
502: for (; clss <= classend; clss++)
503: regc(clss);
504: regparse++;
505: }
506: } else
507: regc(*regparse++);
508: }
509: regc('\0');
510: if (*regparse != ']')
511: FAIL("unmatched []");
512: regparse++;
513: *flagp |= HASWIDTH|SIMPLE;
514: }
515: break;
516: case '(':
517: ret = reg(1, &flags);
518: if (ret == NULL)
519: return(NULL);
520: *flagp |= flags&(HASWIDTH|SPSTART);
521: break;
522: case '\0':
523: case '|':
524: case ')':
525: FAIL("internal urp"); /* Supposed to be caught earlier. */
526: /* NOTREACHED */
527: break;
528: case '?':
529: case '+':
530: case '*':
531: FAIL("?+* follows nothing");
532: /* NOTREACHED */
533: break;
534: case '\\':
535: if (*regparse == '\0')
536: FAIL("trailing \\");
537: ret = regnode(EXACTLY);
538: regc(*regparse++);
539: regc('\0');
540: *flagp |= HASWIDTH|SIMPLE;
541: break;
542: default: {
543: register int len;
544: register char ender;
545:
546: regparse--;
547: len = strcspn(regparse, META);
548: if (len <= 0)
549: FAIL("internal disaster");
550: ender = *(regparse+len);
551: if (len > 1 && ISMULT(ender))
552: len--; /* Back off clear of ?+* operand. */
553: *flagp |= HASWIDTH;
554: if (len == 1)
555: *flagp |= SIMPLE;
556: ret = regnode(EXACTLY);
557: while (len > 0) {
558: regc(*regparse++);
559: len--;
560: }
561: regc('\0');
562: }
563: break;
564: }
565:
566: return(ret);
567: }
568:
569: /*
570: - regnode - emit a node
571: */
572: static char * /* Location. */
573: regnode(op)
574: char op;
575: {
576: register char *ret;
577: register char *ptr;
578:
579: ret = regcode;
580: if (ret == ®dummy) {
581: regsize += 3;
582: return(ret);
583: }
584:
585: ptr = ret;
586: *ptr++ = op;
587: *ptr++ = '\0'; /* Null "next" pointer. */
588: *ptr++ = '\0';
589: regcode = ptr;
590:
591: return(ret);
592: }
593:
594: /*
595: - regc - emit (if appropriate) a byte of code
596: */
597: static void
598: regc(b)
599: char b;
600: {
601: if (regcode != ®dummy)
602: *regcode++ = b;
603: else
604: regsize++;
605: }
606:
607: /*
608: - reginsert - insert an operator in front of already-emitted operand
609: *
610: * Means relocating the operand.
611: */
612: static void
613: reginsert(op, opnd)
614: char op;
615: char *opnd;
616: {
617: register char *src;
618: register char *dst;
619: register char *place;
620:
621: if (regcode == ®dummy) {
622: regsize += 3;
623: return;
624: }
625:
626: src = regcode;
627: regcode += 3;
628: dst = regcode;
629: while (src > opnd)
630: *--dst = *--src;
631:
632: place = opnd; /* Op node, where operand used to be. */
633: *place++ = op;
634: *place++ = '\0';
635: *place++ = '\0';
636: }
637:
638: /*
639: - regtail - set the next-pointer at the end of a node chain
640: */
641: static void
642: regtail(p, val)
643: char *p;
644: char *val;
645: {
646: register char *scan;
647: register char *temp;
648: register int offset;
649:
650: if (p == ®dummy)
651: return;
652:
653: /* Find last node. */
654: scan = p;
655: for (;;) {
656: temp = regnext(scan);
657: if (temp == NULL)
658: break;
659: scan = temp;
660: }
661:
662: if (OP(scan) == BACK)
663: offset = scan - val;
664: else
665: offset = val - scan;
666: *(scan+1) = (offset>>8)&0377;
667: *(scan+2) = offset&0377;
668: }
669:
670: /*
671: - regoptail - regtail on operand of first argument; nop if operandless
672: */
673: static void
674: regoptail(p, val)
675: char *p;
676: char *val;
677: {
678: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
679: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
680: return;
681: regtail(OPERAND(p), val);
682: }
683:
684: /*
685: * regexec and friends
686: */
687:
688: /*
689: * Global work variables for regexec().
690: */
691: static char *reginput; /* String-input pointer. */
692: static char *regbol; /* Beginning of input, for ^ check. */
693: static char **regstartp; /* Pointer to startp array. */
694: static char **regendp; /* Ditto for endp. */
695:
696: /*
697: * Forwards.
698: */
699: STATIC int regtry();
700: STATIC int regmatch();
701: STATIC int regrepeat();
702:
703: #ifdef DEBUG
704: int regnarrate = 0;
705: void regdump();
706: STATIC char *regprop();
707: #endif
708:
709: /*
710: - regexec - match a regexp against a string
711: */
712: int
713: regexec(prog, string)
714: register regexp *prog;
715: register char *string;
716: {
717: register char *s;
718:
719: /* Be paranoid... */
720: if (prog == NULL || string == NULL) {
721: regerror("NULL parameter");
722: return(0);
723: }
724:
725: /* Check validity of program. */
726: if (UCHARAT(prog->program) != MAGIC) {
727: regerror("corrupted program");
728: return(0);
729: }
730:
731: /* If there is a "must appear" string, look for it. */
732: if (prog->regmust != NULL) {
733: s = string;
734: while ((s = strchr(s, prog->regmust[0])) != NULL) {
735: if (strncmp(s, prog->regmust, prog->regmlen) == 0)
736: break; /* Found it. */
737: s++;
738: }
739: if (s == NULL) /* Not present. */
740: return(0);
741: }
742:
743: /* Mark beginning of line for ^ . */
744: regbol = string;
745:
746: /* Simplest case: anchored match need be tried only once. */
747: if (prog->reganch)
748: return(regtry(prog, string));
749:
750: /* Messy cases: unanchored match. */
751: s = string;
752: if (prog->regstart != '\0')
753: /* We know what char it must start with. */
754: while ((s = strchr(s, prog->regstart)) != NULL) {
755: if (regtry(prog, s))
756: return(1);
757: s++;
758: }
759: else
760: /* We don't -- general case. */
761: do {
762: if (regtry(prog, s))
763: return(1);
764: } while (*s++ != '\0');
765:
766: /* Failure. */
767: return(0);
768: }
769:
770: /*
771: - regtry - try match at specific point
772: */
773: static int /* 0 failure, 1 success */
774: regtry(prog, string)
775: regexp *prog;
776: char *string;
777: {
778: register int i;
779: register char **sp;
780: register char **ep;
781:
782: reginput = string;
783: regstartp = prog->startp;
784: regendp = prog->endp;
785:
786: sp = prog->startp;
787: ep = prog->endp;
788: for (i = NSUBEXP; i > 0; i--) {
789: *sp++ = NULL;
790: *ep++ = NULL;
791: }
792: if (regmatch(prog->program + 1)) {
793: prog->startp[0] = string;
794: prog->endp[0] = reginput;
795: return(1);
796: } else
797: return(0);
798: }
799:
800: /*
801: - regmatch - main matching routine
802: *
803: * Conceptually the strategy is simple: check to see whether the current
804: * node matches, call self recursively to see whether the rest matches,
805: * and then act accordingly. In practice we make some effort to avoid
806: * recursion, in particular by going through "ordinary" nodes (that don't
807: * need to know whether the rest of the match failed) by a loop instead of
808: * by recursion.
809: */
810: static int /* 0 failure, 1 success */
811: regmatch(prog)
812: char *prog;
813: {
814: register char *scan; /* Current node. */
815: char *next; /* Next node. */
816:
817: scan = prog;
818: #ifdef DEBUG
819: if (scan != NULL && regnarrate)
820: fprintf(stderr, "%s(\n", regprop(scan));
821: #endif
822: while (scan != NULL) {
823: #ifdef DEBUG
824: if (regnarrate)
825: fprintf(stderr, "%s...\n", regprop(scan));
826: #endif
827: next = regnext(scan);
828:
829: switch (OP(scan)) {
830: case BOL:
831: if (reginput != regbol)
832: return(0);
833: break;
834: case EOL:
835: if (*reginput != '\0')
836: return(0);
837: break;
838: case ANY:
839: if (*reginput == '\0')
840: return(0);
841: reginput++;
842: break;
843: case EXACTLY: {
844: register int len;
845: register char *opnd;
846:
847: opnd = OPERAND(scan);
848: /* Inline the first character, for speed. */
849: if (*opnd != *reginput)
850: return(0);
851: len = strlen(opnd);
852: if (len > 1 && strncmp(opnd, reginput, len) != 0)
853: return(0);
854: reginput += len;
855: }
856: break;
857: case ANYOF:
858: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
859: return(0);
860: reginput++;
861: break;
862: case ANYBUT:
863: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
864: return(0);
865: reginput++;
866: break;
867: case NOTHING:
868: break;
869: case BACK:
870: break;
871: case OPEN+1:
872: case OPEN+2:
873: case OPEN+3:
874: case OPEN+4:
875: case OPEN+5:
876: case OPEN+6:
877: case OPEN+7:
878: case OPEN+8:
879: case OPEN+9: {
880: register int no;
881: register char *save;
882:
883: no = OP(scan) - OPEN;
884: save = reginput;
885:
886: if (regmatch(next)) {
887: /*
888: * Don't set startp if some later
889: * invocation of the same parentheses
890: * already has.
891: */
892: if (regstartp[no] == NULL)
893: regstartp[no] = save;
894: return(1);
895: } else
896: return(0);
897: }
898: /* NOTREACHED */
899: break;
900: case CLOSE+1:
901: case CLOSE+2:
902: case CLOSE+3:
903: case CLOSE+4:
904: case CLOSE+5:
905: case CLOSE+6:
906: case CLOSE+7:
907: case CLOSE+8:
908: case CLOSE+9: {
909: register int no;
910: register char *save;
911:
912: no = OP(scan) - CLOSE;
913: save = reginput;
914:
915: if (regmatch(next)) {
916: /*
917: * Don't set endp if some later
918: * invocation of the same parentheses
919: * already has.
920: */
921: if (regendp[no] == NULL)
922: regendp[no] = save;
923: return(1);
924: } else
925: return(0);
926: }
927: /* NOTREACHED */
928: break;
929: case BRANCH: {
930: register char *save;
931:
932: if (OP(next) != BRANCH) /* No choice. */
933: next = OPERAND(scan); /* Avoid recursion. */
934: else {
935: do {
936: save = reginput;
937: if (regmatch(OPERAND(scan)))
938: return(1);
939: reginput = save;
940: scan = regnext(scan);
941: } while (scan != NULL && OP(scan) == BRANCH);
942: return(0);
943: /* NOTREACHED */
944: }
945: }
946: /* NOTREACHED */
947: break;
948: case STAR:
949: case PLUS: {
950: register char nextch;
951: register int no;
952: register char *save;
953: register int min;
954:
955: /*
956: * Lookahead to avoid useless match attempts
957: * when we know what character comes next.
958: */
959: nextch = '\0';
960: if (OP(next) == EXACTLY)
961: nextch = *OPERAND(next);
962: min = (OP(scan) == STAR) ? 0 : 1;
963: save = reginput;
964: no = regrepeat(OPERAND(scan));
965: while (no >= min) {
966: /* If it could work, try it. */
967: if (nextch == '\0' || *reginput == nextch)
968: if (regmatch(next))
969: return(1);
970: /* Couldn't or didn't -- back up. */
971: no--;
972: reginput = save + no;
973: }
974: return(0);
975: }
976: /* NOTREACHED */
977: break;
978: case END:
979: return(1); /* Success! */
980: /* NOTREACHED */
981: break;
982: default:
983: regerror("memory corruption");
984: return(0);
985: /* NOTREACHED */
986: break;
987: }
988:
989: scan = next;
990: }
991:
992: /*
993: * We get here only if there's trouble -- normally "case END" is
994: * the terminating point.
995: */
996: regerror("corrupted pointers");
997: return(0);
998: }
999:
1000: /*
1001: - regrepeat - repeatedly match something simple, report how many
1002: */
1003: static int
1004: regrepeat(p)
1005: char *p;
1006: {
1007: register int count = 0;
1008: register char *scan;
1009: register char *opnd;
1010:
1011: scan = reginput;
1012: opnd = OPERAND(p);
1013: switch (OP(p)) {
1014: case ANY:
1015: count = strlen(scan);
1016: scan += count;
1017: break;
1018: case EXACTLY:
1019: while (*opnd == *scan) {
1020: count++;
1021: scan++;
1022: }
1023: break;
1024: case ANYOF:
1025: while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1026: count++;
1027: scan++;
1028: }
1029: break;
1030: case ANYBUT:
1031: while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1032: count++;
1033: scan++;
1034: }
1035: break;
1036: default: /* Oh dear. Called inappropriately. */
1037: regerror("internal foulup");
1038: count = 0; /* Best compromise. */
1039: break;
1040: }
1041: reginput = scan;
1042:
1043: return(count);
1044: }
1045:
1046: /*
1047: - regnext - dig the "next" pointer out of a node
1048: */
1049: static char *
1050: regnext(p)
1051: register char *p;
1052: {
1053: register int offset;
1054:
1055: if (p == ®dummy)
1056: return(NULL);
1057:
1058: offset = NEXT(p);
1059: if (offset == 0)
1060: return(NULL);
1061:
1062: if (OP(p) == BACK)
1063: return(p-offset);
1064: else
1065: return(p+offset);
1066: }
1067:
1068: #ifdef DEBUG
1069:
1070: STATIC char *regprop();
1071:
1072: /*
1073: - regdump - dump a regexp onto stdout in vaguely comprehensible form
1074: */
1075: void
1076: regdump(r)
1077: regexp *r;
1078: {
1079: register char *s;
1080: register char op = EXACTLY; /* Arbitrary non-END op. */
1081: register char *next;
1082:
1083:
1084: s = r->program + 1;
1085: while (op != END) { /* While that wasn't END last time... */
1086: op = OP(s);
1087: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1088: next = regnext(s);
1089: if (next == NULL) /* Next ptr. */
1090: printf("(0)");
1091: else
1092: printf("(%d)", (s-r->program)+(next-s));
1093: s += 3;
1094: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1095: /* Literal string, where present. */
1096: while (*s != '\0') {
1097: putchar(*s);
1098: s++;
1099: }
1100: s++;
1101: }
1102: putchar('\n');
1103: }
1104:
1105: /* Header fields of interest. */
1106: if (r->regstart != '\0')
1107: printf("start `%c' ", r->regstart);
1108: if (r->reganch)
1109: printf("anchored ");
1110: if (r->regmust != NULL)
1111: printf("must have \"%s\"", r->regmust);
1112: printf("\n");
1113: }
1114:
1115: /*
1116: - regprop - printable representation of opcode
1117: */
1118: static char *
1119: regprop(op)
1120: char *op;
1121: {
1122: register char *p;
1123: static char buf[50];
1124:
1125: (void) strcpy(buf, ":");
1126:
1127: switch (OP(op)) {
1128: case BOL:
1129: p = "BOL";
1130: break;
1131: case EOL:
1132: p = "EOL";
1133: break;
1134: case ANY:
1135: p = "ANY";
1136: break;
1137: case ANYOF:
1138: p = "ANYOF";
1139: break;
1140: case ANYBUT:
1141: p = "ANYBUT";
1142: break;
1143: case BRANCH:
1144: p = "BRANCH";
1145: break;
1146: case EXACTLY:
1147: p = "EXACTLY";
1148: break;
1149: case NOTHING:
1150: p = "NOTHING";
1151: break;
1152: case BACK:
1153: p = "BACK";
1154: break;
1155: case END:
1156: p = "END";
1157: break;
1158: case OPEN+1:
1159: case OPEN+2:
1160: case OPEN+3:
1161: case OPEN+4:
1162: case OPEN+5:
1163: case OPEN+6:
1164: case OPEN+7:
1165: case OPEN+8:
1166: case OPEN+9:
1167: sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1168: p = NULL;
1169: break;
1170: case CLOSE+1:
1171: case CLOSE+2:
1172: case CLOSE+3:
1173: case CLOSE+4:
1174: case CLOSE+5:
1175: case CLOSE+6:
1176: case CLOSE+7:
1177: case CLOSE+8:
1178: case CLOSE+9:
1179: sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1180: p = NULL;
1181: break;
1182: case STAR:
1183: p = "STAR";
1184: break;
1185: case PLUS:
1186: p = "PLUS";
1187: break;
1188: default:
1189: regerror("corrupted opcode");
1190: break;
1191: }
1192: if (p != NULL)
1193: (void) strcat(buf, p);
1194: return(buf);
1195: }
1196: #endif
1197:
1198: /*
1199: * The following is provided for those people who do not have strcspn() in
1200: * their C libraries. They should get off their butts and do something
1201: * about it; at least one public-domain implementation of those (highly
1202: * useful) string routines has been published on Usenet.
1203: */
1204: #ifdef STRCSPN
1205: /*
1206: * strcspn - find length of initial segment of s1 consisting entirely
1207: * of characters not from s2
1208: */
1209:
1210: static int
1211: strcspn(s1, s2)
1212: char *s1;
1213: char *s2;
1214: {
1215: register char *scan1;
1216: register char *scan2;
1217: register int count;
1218:
1219: count = 0;
1220: for (scan1 = s1; *scan1 != '\0'; scan1++) {
1221: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1222: if (*scan1 == *scan2++)
1223: return(count);
1224: count++;
1225: }
1226: return(count);
1227: }
1228: #endif