Annotation of src/usr.bin/vgrind/regexp.c, Revision 1.6
1.6 ! deraadt 1: /* $OpenBSD: regexp.c,v 1.5 2003/02/19 07:32:36 deraadt Exp $ */
1.1 deraadt 2: /* $NetBSD: regexp.c,v 1.3 1994/11/17 08:28:02 jtc Exp $ */
3:
4: /*
5: * Copyright (c) 1980, 1993
6: * The Regents of the University of California. All rights reserved.
7: *
8: *
9: * Redistribution and use in source and binary forms, with or without
10: * modification, are permitted provided that the following conditions
11: * are met:
12: * 1. Redistributions of source code must retain the above copyright
13: * notice, this list of conditions and the following disclaimer.
14: * 2. Redistributions in binary form must reproduce the above copyright
15: * notice, this list of conditions and the following disclaimer in the
16: * documentation and/or other materials provided with the distribution.
17: * 3. All advertising materials mentioning features or use of this software
18: * must display the following acknowledgement:
19: * This product includes software developed by the University of
20: * California, Berkeley and its contributors.
21: * 4. Neither the name of the University nor the names of its contributors
22: * may be used to endorse or promote products derived from this software
23: * without specific prior written permission.
24: *
25: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35: * SUCH DAMAGE.
36: */
37:
38: #ifndef lint
39: static char copyright[] =
40: "@(#) Copyright (c) 1980, 1993\n\
41: The Regents of the University of California. All rights reserved.\n";
42: #endif /* not lint */
43:
44: #ifndef lint
45: #if 0
46: static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
47: #endif
1.6 ! deraadt 48: static char rcsid[] = "$OpenBSD: regexp.c,v 1.5 2003/02/19 07:32:36 deraadt Exp $";
1.1 deraadt 49: #endif /* not lint */
50:
51: #include <ctype.h>
52: #include <stdlib.h>
53: #include <string.h>
54: #include "extern.h"
55:
56: #define FALSE 0
57: #define TRUE !(FALSE)
58: #define NIL 0
59:
1.4 millert 60: static void expconv(void);
1.1 deraadt 61:
1.5 deraadt 62: boolean x_escaped; /* true if we are currently x_escaped */
63: char *x_start; /* start of string */
1.1 deraadt 64: boolean l_onecase; /* true if upper and lower equivalent */
65:
66: #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
67:
68: /* STRNCMP - like strncmp except that we convert the
69: * first string to lower case before comparing
70: * if l_onecase is set.
71: */
72:
73: int
1.6 ! deraadt 74: STRNCMP(char *s1, char *s2, int len)
1.1 deraadt 75: {
76: if (l_onecase) {
77: do
78: if (*s2 - makelower(*s1))
79: return (*s2 - makelower(*s1));
80: else {
81: s2++;
82: s1++;
83: }
84: while (--len);
85: } else {
86: do
87: if (*s2 - *s1)
88: return (*s2 - *s1);
89: else {
90: s2++;
91: s1++;
92: }
93: while (--len);
94: }
95: return(0);
96: }
97:
98: /* The following routine converts an irregular expression to
99: * internal format.
100: *
101: * Either meta symbols (\a \d or \p) or character strings or
1.5 deraadt 102: * operations ( alternation or parenthesizing ) can be
1.1 deraadt 103: * specified. Each starts with a descriptor byte. The descriptor
104: * byte has STR set for strings, META set for meta symbols
105: * and OPER set for operations.
106: * The descriptor byte can also have the OPT bit set if the object
107: * defined is optional. Also ALT can be set to indicate an alternation.
108: *
1.5 deraadt 109: * For metasymbols the byte following the descriptor byte identifies
1.1 deraadt 110: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
111: * strings the byte after the descriptor is a character count for
112: * the string:
113: *
114: * meta symbols := descriptor
115: * symbol
116: *
117: * strings := descriptor
118: * character count
119: * the string
120: *
121: * operatins := descriptor
122: * symbol
123: * character count
124: */
125:
126: /*
127: * handy macros for accessing parts of match blocks
128: */
129: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
130: #define MNEXT(A) (A+2) /* character following a metasymbol block */
131:
132: #define OSYM(A) (*(A+1)) /* symbol in an operation block */
133: #define OCNT(A) (*(A+2)) /* character count */
134: #define ONEXT(A) (A+3) /* next character after the operation */
135: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
136:
137: #define SCNT(A) (*(A+1)) /* byte count of a string */
138: #define SSTR(A) (A+2) /* address of the string */
139: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
140:
141: /*
142: * bit flags in the descriptor
143: */
144: #define OPT 1
145: #define STR 2
146: #define META 4
147: #define ALT 8
148: #define OPER 16
149:
150: static char *ccre; /* pointer to current position in converted exp*/
151: static char *ure; /* pointer current position in unconverted exp */
152:
153: char *
1.6 ! deraadt 154: convexp(char *re)
1.1 deraadt 155: {
1.3 mpech 156: char *cre; /* pointer to converted regular expression */
1.1 deraadt 157:
158: /* allocate room for the converted expression */
159: if (re == NIL)
160: return (NIL);
161: if (*re == '\0')
162: return (NIL);
163: cre = malloc (4 * strlen(re) + 3);
164: ccre = cre;
165: ure = re;
166:
167: /* start the conversion with a \a */
168: *cre = META | OPT;
169: MSYM(cre) = 'a';
170: ccre = MNEXT(cre);
171:
172: /* start the conversion (its recursive) */
173: expconv ();
174: *ccre = 0;
175: return (cre);
176: }
177:
178: static void
1.6 ! deraadt 179: expconv(void)
1.1 deraadt 180: {
1.3 mpech 181: char *cs; /* pointer to current symbol in converted exp */
182: char c; /* character being processed */
1.5 deraadt 183: char *acs; /* pointer to last alternate */
1.3 mpech 184: int temp;
1.1 deraadt 185:
186: /* let the conversion begin */
187: acs = NIL;
188: cs = NIL;
189: while (*ure != NIL) {
190: switch (c = *ure++) {
191:
192: case '\\':
193: switch (c = *ure++) {
194:
195: /* escaped characters are just characters */
196: default:
197: if (cs == NIL || (*cs & STR) == 0) {
198: cs = ccre;
199: *cs = STR;
200: SCNT(cs) = 1;
201: ccre += 2;
202: } else
203: SCNT(cs)++;
204: *ccre++ = c;
205: break;
206:
207: /* normal(?) metacharacters */
208: case 'a':
209: case 'd':
210: case 'e':
211: case 'p':
212: if (acs != NIL && acs != cs) {
213: do {
214: temp = OCNT(acs);
215: OCNT(acs) = ccre - acs;
216: acs -= temp;
217: } while (temp != 0);
218: acs = NIL;
219: }
220: cs = ccre;
221: *cs = META;
222: MSYM(cs) = c;
223: ccre = MNEXT(cs);
224: break;
225: }
226: break;
227:
228: /* just put the symbol in */
229: case '^':
230: case '$':
231: if (acs != NIL && acs != cs) {
232: do {
233: temp = OCNT(acs);
234: OCNT(acs) = ccre - acs;
235: acs -= temp;
236: } while (temp != 0);
237: acs = NIL;
238: }
239: cs = ccre;
240: *cs = META;
241: MSYM(cs) = c;
242: ccre = MNEXT(cs);
243: break;
244:
245: /* mark the last match sequence as optional */
246: case '?':
247: if (cs)
248: *cs = *cs | OPT;
249: break;
250:
251: /* recurse and define a subexpression */
252: case '(':
253: if (acs != NIL && acs != cs) {
254: do {
255: temp = OCNT(acs);
256: OCNT(acs) = ccre - acs;
257: acs -= temp;
258: } while (temp != 0);
259: acs = NIL;
260: }
261: cs = ccre;
262: *cs = OPER;
263: OSYM(cs) = '(';
264: ccre = ONEXT(cs);
265: expconv ();
266: OCNT(cs) = ccre - cs; /* offset to next symbol */
267: break;
268:
1.5 deraadt 269: /* return from a recursion */
1.1 deraadt 270: case ')':
271: if (acs != NIL) {
272: do {
273: temp = OCNT(acs);
274: OCNT(acs) = ccre - acs;
275: acs -= temp;
276: } while (temp != 0);
277: acs = NIL;
278: }
279: cs = ccre;
280: *cs = META;
281: MSYM(cs) = c;
282: ccre = MNEXT(cs);
283: return;
284:
285: /* mark the last match sequence as having an alternate */
286: /* the third byte will contain an offset to jump over the */
287: /* alternate match in case the first did not fail */
288: case '|':
289: if (acs != NIL && acs != cs)
290: OCNT(ccre) = ccre - acs; /* make a back pointer */
291: else
292: OCNT(ccre) = 0;
293: *cs |= ALT;
294: cs = ccre;
295: *cs = OPER;
296: OSYM(cs) = '|';
297: ccre = ONEXT(cs);
298: acs = cs; /* remember that the pointer is to be filles */
299: break;
300:
1.5 deraadt 301: /* if its not a metasymbol just build a character string */
1.1 deraadt 302: default:
303: if (cs == NIL || (*cs & STR) == 0) {
304: cs = ccre;
305: *cs = STR;
306: SCNT(cs) = 1;
307: ccre = SSTR(cs);
308: } else
309: SCNT(cs)++;
310: *ccre++ = c;
311: break;
312: }
313: }
314: if (acs != NIL) {
315: do {
316: temp = OCNT(acs);
317: OCNT(acs) = ccre - acs;
318: acs -= temp;
319: } while (temp != 0);
320: acs = NIL;
321: }
322: return;
323: }
1.5 deraadt 324: /* end of converter */
1.1 deraadt 325:
326:
327: /*
1.5 deraadt 328: * The following routine recognises an irregular expression
1.1 deraadt 329: * with the following special characters:
330: *
331: * \? - means last match was optional
332: * \a - matches any number of characters
333: * \d - matches any number of spaces and tabs
334: * \p - matches any number of alphanumeric
335: * characters. The
336: * characters matched will be copied into
337: * the area pointed to by 'name'.
338: * \| - alternation
339: * \( \) - grouping used mostly for alternation and
340: * optionality
341: *
342: * The irregular expression must be translated to internal form
343: * prior to calling this routine
344: *
345: * The value returned is the pointer to the first non \a
346: * character matched.
347: */
348:
349: char *
1.6 ! deraadt 350: expmatch(char *s, char *re, char *mstring)
1.1 deraadt 351: {
1.3 mpech 352: char *cs; /* the current symbol */
353: char *ptr,*s1; /* temporary pointer */
1.1 deraadt 354: boolean matched; /* a temporary boolean */
355:
356: /* initial conditions */
357: if (re == NIL)
358: return (NIL);
359: cs = re;
360: matched = FALSE;
361:
362: /* loop till expression string is exhausted (or at least pretty tired) */
363: while (*cs) {
364: switch (*cs & (OPER | STR | META)) {
365:
366: /* try to match a string */
367: case STR:
368: matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
369: if (matched) {
370:
371: /* hoorah it matches */
372: s += SCNT(cs);
373: cs = SNEXT(cs);
374: } else if (*cs & ALT) {
375:
376: /* alternation, skip to next expression */
377: cs = SNEXT(cs);
378: } else if (*cs & OPT) {
379:
380: /* the match is optional */
381: cs = SNEXT(cs);
382: matched = 1; /* indicate a successful match */
383: } else {
384:
385: /* no match, error return */
386: return (NIL);
387: }
388: break;
389:
390: /* an operator, do something fancy */
391: case OPER:
392: switch (OSYM(cs)) {
393:
394: /* this is an alternation */
395: case '|':
396: if (matched)
397:
398: /* last thing in the alternation was a match, skip ahead */
399: cs = OPTR(cs);
400: else
401:
402: /* no match, keep trying */
403: cs = ONEXT(cs);
404: break;
405:
406: /* this is a grouping, recurse */
407: case '(':
408: ptr = expmatch (s, ONEXT(cs), mstring);
409: if (ptr != NIL) {
410:
411: /* the subexpression matched */
412: matched = 1;
413: s = ptr;
414: } else if (*cs & ALT) {
415:
416: /* alternation, skip to next expression */
417: matched = 0;
418: } else if (*cs & OPT) {
419:
420: /* the match is optional */
421: matched = 1; /* indicate a successful match */
422: } else {
423:
424: /* no match, error return */
425: return (NIL);
426: }
427: cs = OPTR(cs);
428: break;
429: }
430: break;
431:
432: /* try to match a metasymbol */
433: case META:
434: switch (MSYM(cs)) {
435:
436: /* try to match anything and remember what was matched */
437: case 'p':
438: /*
439: * This is really the same as trying the match the
440: * remaining parts of the expression to any subset
441: * of the string.
442: */
443: s1 = s;
444: do {
445: ptr = expmatch (s1, MNEXT(cs), mstring);
446: if (ptr != NIL && s1 != s) {
447:
448: /* we have a match, remember the match */
449: strncpy (mstring, s, s1 - s);
450: mstring[s1 - s] = '\0';
451: return (ptr);
452: } else if (ptr != NIL && (*cs & OPT)) {
453:
1.5 deraadt 454: /* it was optional so no match is ok */
1.1 deraadt 455: return (ptr);
456: } else if (ptr != NIL) {
457:
458: /* not optional and we still matched */
459: return (NIL);
460: }
461: if (!isalnum(*s1) && *s1 != '_')
462: return (NIL);
463: if (*s1 == '\\')
1.5 deraadt 464: x_escaped = x_escaped ? FALSE : TRUE;
1.1 deraadt 465: else
1.5 deraadt 466: x_escaped = FALSE;
1.1 deraadt 467: } while (*s1++);
468: return (NIL);
469:
470: /* try to match anything */
471: case 'a':
472: /*
473: * This is really the same as trying the match the
474: * remaining parts of the expression to any subset
475: * of the string.
476: */
477: s1 = s;
478: do {
479: ptr = expmatch (s1, MNEXT(cs), mstring);
480: if (ptr != NIL && s1 != s) {
481:
482: /* we have a match */
483: return (ptr);
484: } else if (ptr != NIL && (*cs & OPT)) {
485:
1.5 deraadt 486: /* it was optional so no match is ok */
1.1 deraadt 487: return (ptr);
488: } else if (ptr != NIL) {
489:
490: /* not optional and we still matched */
491: return (NIL);
492: }
493: if (*s1 == '\\')
1.5 deraadt 494: x_escaped = x_escaped ? FALSE : TRUE;
1.1 deraadt 495: else
1.5 deraadt 496: x_escaped = FALSE;
1.1 deraadt 497: } while (*s1++);
498: return (NIL);
499:
1.5 deraadt 500: /* fail if we are currently x_escaped */
1.1 deraadt 501: case 'e':
1.5 deraadt 502: if (x_escaped)
1.1 deraadt 503: return(NIL);
504: cs = MNEXT(cs);
505: break;
506:
507: /* match any number of tabs and spaces */
508: case 'd':
509: ptr = s;
510: while (*s == ' ' || *s == '\t')
511: s++;
1.5 deraadt 512: if (s != ptr || s == x_start) {
1.1 deraadt 513:
514: /* match, be happy */
515: matched = 1;
516: cs = MNEXT(cs);
517: } else if (*s == '\n' || *s == '\0') {
518:
519: /* match, be happy */
520: matched = 1;
521: cs = MNEXT(cs);
522: } else if (*cs & ALT) {
523:
524: /* try the next part */
525: matched = 0;
526: cs = MNEXT(cs);
527: } else if (*cs & OPT) {
528:
529: /* doesn't matter */
530: matched = 1;
531: cs = MNEXT(cs);
532: } else
533:
534: /* no match, error return */
535: return (NIL);
536: break;
537:
538: /* check for end of line */
539: case '$':
540: if (*s == '\0' || *s == '\n') {
541:
542: /* match, be happy */
543: s++;
544: matched = 1;
545: cs = MNEXT(cs);
546: } else if (*cs & ALT) {
547:
548: /* try the next part */
549: matched = 0;
550: cs = MNEXT(cs);
551: } else if (*cs & OPT) {
552:
553: /* doesn't matter */
554: matched = 1;
555: cs = MNEXT(cs);
556: } else
557:
558: /* no match, error return */
559: return (NIL);
560: break;
561:
562: /* check for start of line */
563: case '^':
1.5 deraadt 564: if (s == x_start) {
1.1 deraadt 565:
566: /* match, be happy */
567: matched = 1;
568: cs = MNEXT(cs);
569: } else if (*cs & ALT) {
570:
571: /* try the next part */
572: matched = 0;
573: cs = MNEXT(cs);
574: } else if (*cs & OPT) {
575:
576: /* doesn't matter */
577: matched = 1;
578: cs = MNEXT(cs);
579: } else
580:
581: /* no match, error return */
582: return (NIL);
583: break;
584:
585: /* end of a subexpression, return success */
586: case ')':
587: return (s);
588: }
589: break;
590: }
591: }
592: return (s);
593: }