Annotation of src/usr.bin/vgrind/regexp.c, Revision 1.2
1.2 ! deraadt 1: /* $OpenBSD: regexp.c,v 1.3 1994/11/17 08:28:02 jtc Exp $ */
1.1 deraadt 2: /* $NetBSD: regexp.c,v 1.3 1994/11/17 08:28:02 jtc Exp $ */
3:
4: /*
5: * Copyright (c) 1980, 1993
6: * The Regents of the University of California. All rights reserved.
7: *
8: *
9: * Redistribution and use in source and binary forms, with or without
10: * modification, are permitted provided that the following conditions
11: * are met:
12: * 1. Redistributions of source code must retain the above copyright
13: * notice, this list of conditions and the following disclaimer.
14: * 2. Redistributions in binary form must reproduce the above copyright
15: * notice, this list of conditions and the following disclaimer in the
16: * documentation and/or other materials provided with the distribution.
17: * 3. All advertising materials mentioning features or use of this software
18: * must display the following acknowledgement:
19: * This product includes software developed by the University of
20: * California, Berkeley and its contributors.
21: * 4. Neither the name of the University nor the names of its contributors
22: * may be used to endorse or promote products derived from this software
23: * without specific prior written permission.
24: *
25: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35: * SUCH DAMAGE.
36: */
37:
38: #ifndef lint
39: static char copyright[] =
40: "@(#) Copyright (c) 1980, 1993\n\
41: The Regents of the University of California. All rights reserved.\n";
42: #endif /* not lint */
43:
44: #ifndef lint
45: #if 0
46: static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
47: #endif
1.2 ! deraadt 48: static char rcsid[] = "$OpenBSD: regexp.c,v 1.3 1994/11/17 08:28:02 jtc Exp $";
1.1 deraadt 49: #endif /* not lint */
50:
51: #include <ctype.h>
52: #include <stdlib.h>
53: #include <string.h>
54: #include "extern.h"
55:
56: #define FALSE 0
57: #define TRUE !(FALSE)
58: #define NIL 0
59:
60: static void expconv __P((void));
61:
62: boolean _escaped; /* true if we are currently _escaped */
63: char *_start; /* start of string */
64: boolean l_onecase; /* true if upper and lower equivalent */
65:
66: #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
67:
68: /* STRNCMP - like strncmp except that we convert the
69: * first string to lower case before comparing
70: * if l_onecase is set.
71: */
72:
73: int
74: STRNCMP(s1, s2, len)
75: register char *s1,*s2;
76: register int len;
77: {
78: if (l_onecase) {
79: do
80: if (*s2 - makelower(*s1))
81: return (*s2 - makelower(*s1));
82: else {
83: s2++;
84: s1++;
85: }
86: while (--len);
87: } else {
88: do
89: if (*s2 - *s1)
90: return (*s2 - *s1);
91: else {
92: s2++;
93: s1++;
94: }
95: while (--len);
96: }
97: return(0);
98: }
99:
100: /* The following routine converts an irregular expression to
101: * internal format.
102: *
103: * Either meta symbols (\a \d or \p) or character strings or
104: * operations ( alternation or perenthesizing ) can be
105: * specified. Each starts with a descriptor byte. The descriptor
106: * byte has STR set for strings, META set for meta symbols
107: * and OPER set for operations.
108: * The descriptor byte can also have the OPT bit set if the object
109: * defined is optional. Also ALT can be set to indicate an alternation.
110: *
111: * For metasymbols the byte following the descriptor byte identities
112: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
113: * strings the byte after the descriptor is a character count for
114: * the string:
115: *
116: * meta symbols := descriptor
117: * symbol
118: *
119: * strings := descriptor
120: * character count
121: * the string
122: *
123: * operatins := descriptor
124: * symbol
125: * character count
126: */
127:
128: /*
129: * handy macros for accessing parts of match blocks
130: */
131: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
132: #define MNEXT(A) (A+2) /* character following a metasymbol block */
133:
134: #define OSYM(A) (*(A+1)) /* symbol in an operation block */
135: #define OCNT(A) (*(A+2)) /* character count */
136: #define ONEXT(A) (A+3) /* next character after the operation */
137: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
138:
139: #define SCNT(A) (*(A+1)) /* byte count of a string */
140: #define SSTR(A) (A+2) /* address of the string */
141: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
142:
143: /*
144: * bit flags in the descriptor
145: */
146: #define OPT 1
147: #define STR 2
148: #define META 4
149: #define ALT 8
150: #define OPER 16
151:
152: static char *ccre; /* pointer to current position in converted exp*/
153: static char *ure; /* pointer current position in unconverted exp */
154:
155: char *
156: convexp(re)
157: char *re; /* unconverted irregular expression */
158: {
159: register char *cre; /* pointer to converted regular expression */
160:
161: /* allocate room for the converted expression */
162: if (re == NIL)
163: return (NIL);
164: if (*re == '\0')
165: return (NIL);
166: cre = malloc (4 * strlen(re) + 3);
167: ccre = cre;
168: ure = re;
169:
170: /* start the conversion with a \a */
171: *cre = META | OPT;
172: MSYM(cre) = 'a';
173: ccre = MNEXT(cre);
174:
175: /* start the conversion (its recursive) */
176: expconv ();
177: *ccre = 0;
178: return (cre);
179: }
180:
181: static void
182: expconv()
183: {
184: register char *cs; /* pointer to current symbol in converted exp */
185: register char c; /* character being processed */
186: register char *acs; /* pinter to last alternate */
187: register int temp;
188:
189: /* let the conversion begin */
190: acs = NIL;
191: cs = NIL;
192: while (*ure != NIL) {
193: switch (c = *ure++) {
194:
195: case '\\':
196: switch (c = *ure++) {
197:
198: /* escaped characters are just characters */
199: default:
200: if (cs == NIL || (*cs & STR) == 0) {
201: cs = ccre;
202: *cs = STR;
203: SCNT(cs) = 1;
204: ccre += 2;
205: } else
206: SCNT(cs)++;
207: *ccre++ = c;
208: break;
209:
210: /* normal(?) metacharacters */
211: case 'a':
212: case 'd':
213: case 'e':
214: case 'p':
215: if (acs != NIL && acs != cs) {
216: do {
217: temp = OCNT(acs);
218: OCNT(acs) = ccre - acs;
219: acs -= temp;
220: } while (temp != 0);
221: acs = NIL;
222: }
223: cs = ccre;
224: *cs = META;
225: MSYM(cs) = c;
226: ccre = MNEXT(cs);
227: break;
228: }
229: break;
230:
231: /* just put the symbol in */
232: case '^':
233: case '$':
234: if (acs != NIL && acs != cs) {
235: do {
236: temp = OCNT(acs);
237: OCNT(acs) = ccre - acs;
238: acs -= temp;
239: } while (temp != 0);
240: acs = NIL;
241: }
242: cs = ccre;
243: *cs = META;
244: MSYM(cs) = c;
245: ccre = MNEXT(cs);
246: break;
247:
248: /* mark the last match sequence as optional */
249: case '?':
250: if (cs)
251: *cs = *cs | OPT;
252: break;
253:
254: /* recurse and define a subexpression */
255: case '(':
256: if (acs != NIL && acs != cs) {
257: do {
258: temp = OCNT(acs);
259: OCNT(acs) = ccre - acs;
260: acs -= temp;
261: } while (temp != 0);
262: acs = NIL;
263: }
264: cs = ccre;
265: *cs = OPER;
266: OSYM(cs) = '(';
267: ccre = ONEXT(cs);
268: expconv ();
269: OCNT(cs) = ccre - cs; /* offset to next symbol */
270: break;
271:
272: /* reurn from a recursion */
273: case ')':
274: if (acs != NIL) {
275: do {
276: temp = OCNT(acs);
277: OCNT(acs) = ccre - acs;
278: acs -= temp;
279: } while (temp != 0);
280: acs = NIL;
281: }
282: cs = ccre;
283: *cs = META;
284: MSYM(cs) = c;
285: ccre = MNEXT(cs);
286: return;
287:
288: /* mark the last match sequence as having an alternate */
289: /* the third byte will contain an offset to jump over the */
290: /* alternate match in case the first did not fail */
291: case '|':
292: if (acs != NIL && acs != cs)
293: OCNT(ccre) = ccre - acs; /* make a back pointer */
294: else
295: OCNT(ccre) = 0;
296: *cs |= ALT;
297: cs = ccre;
298: *cs = OPER;
299: OSYM(cs) = '|';
300: ccre = ONEXT(cs);
301: acs = cs; /* remember that the pointer is to be filles */
302: break;
303:
304: /* if its not a metasymbol just build a scharacter string */
305: default:
306: if (cs == NIL || (*cs & STR) == 0) {
307: cs = ccre;
308: *cs = STR;
309: SCNT(cs) = 1;
310: ccre = SSTR(cs);
311: } else
312: SCNT(cs)++;
313: *ccre++ = c;
314: break;
315: }
316: }
317: if (acs != NIL) {
318: do {
319: temp = OCNT(acs);
320: OCNT(acs) = ccre - acs;
321: acs -= temp;
322: } while (temp != 0);
323: acs = NIL;
324: }
325: return;
326: }
327: /* end of convertre */
328:
329:
330: /*
331: * The following routine recognises an irregular expresion
332: * with the following special characters:
333: *
334: * \? - means last match was optional
335: * \a - matches any number of characters
336: * \d - matches any number of spaces and tabs
337: * \p - matches any number of alphanumeric
338: * characters. The
339: * characters matched will be copied into
340: * the area pointed to by 'name'.
341: * \| - alternation
342: * \( \) - grouping used mostly for alternation and
343: * optionality
344: *
345: * The irregular expression must be translated to internal form
346: * prior to calling this routine
347: *
348: * The value returned is the pointer to the first non \a
349: * character matched.
350: */
351:
352: char *
353: expmatch (s, re, mstring)
354: register char *s; /* string to check for a match in */
355: register char *re; /* a converted irregular expression */
356: register char *mstring; /* where to put whatever matches a \p */
357: {
358: register char *cs; /* the current symbol */
359: register char *ptr,*s1; /* temporary pointer */
360: boolean matched; /* a temporary boolean */
361:
362: /* initial conditions */
363: if (re == NIL)
364: return (NIL);
365: cs = re;
366: matched = FALSE;
367:
368: /* loop till expression string is exhausted (or at least pretty tired) */
369: while (*cs) {
370: switch (*cs & (OPER | STR | META)) {
371:
372: /* try to match a string */
373: case STR:
374: matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
375: if (matched) {
376:
377: /* hoorah it matches */
378: s += SCNT(cs);
379: cs = SNEXT(cs);
380: } else if (*cs & ALT) {
381:
382: /* alternation, skip to next expression */
383: cs = SNEXT(cs);
384: } else if (*cs & OPT) {
385:
386: /* the match is optional */
387: cs = SNEXT(cs);
388: matched = 1; /* indicate a successful match */
389: } else {
390:
391: /* no match, error return */
392: return (NIL);
393: }
394: break;
395:
396: /* an operator, do something fancy */
397: case OPER:
398: switch (OSYM(cs)) {
399:
400: /* this is an alternation */
401: case '|':
402: if (matched)
403:
404: /* last thing in the alternation was a match, skip ahead */
405: cs = OPTR(cs);
406: else
407:
408: /* no match, keep trying */
409: cs = ONEXT(cs);
410: break;
411:
412: /* this is a grouping, recurse */
413: case '(':
414: ptr = expmatch (s, ONEXT(cs), mstring);
415: if (ptr != NIL) {
416:
417: /* the subexpression matched */
418: matched = 1;
419: s = ptr;
420: } else if (*cs & ALT) {
421:
422: /* alternation, skip to next expression */
423: matched = 0;
424: } else if (*cs & OPT) {
425:
426: /* the match is optional */
427: matched = 1; /* indicate a successful match */
428: } else {
429:
430: /* no match, error return */
431: return (NIL);
432: }
433: cs = OPTR(cs);
434: break;
435: }
436: break;
437:
438: /* try to match a metasymbol */
439: case META:
440: switch (MSYM(cs)) {
441:
442: /* try to match anything and remember what was matched */
443: case 'p':
444: /*
445: * This is really the same as trying the match the
446: * remaining parts of the expression to any subset
447: * of the string.
448: */
449: s1 = s;
450: do {
451: ptr = expmatch (s1, MNEXT(cs), mstring);
452: if (ptr != NIL && s1 != s) {
453:
454: /* we have a match, remember the match */
455: strncpy (mstring, s, s1 - s);
456: mstring[s1 - s] = '\0';
457: return (ptr);
458: } else if (ptr != NIL && (*cs & OPT)) {
459:
460: /* it was aoptional so no match is ok */
461: return (ptr);
462: } else if (ptr != NIL) {
463:
464: /* not optional and we still matched */
465: return (NIL);
466: }
467: if (!isalnum(*s1) && *s1 != '_')
468: return (NIL);
469: if (*s1 == '\\')
470: _escaped = _escaped ? FALSE : TRUE;
471: else
472: _escaped = FALSE;
473: } while (*s1++);
474: return (NIL);
475:
476: /* try to match anything */
477: case 'a':
478: /*
479: * This is really the same as trying the match the
480: * remaining parts of the expression to any subset
481: * of the string.
482: */
483: s1 = s;
484: do {
485: ptr = expmatch (s1, MNEXT(cs), mstring);
486: if (ptr != NIL && s1 != s) {
487:
488: /* we have a match */
489: return (ptr);
490: } else if (ptr != NIL && (*cs & OPT)) {
491:
492: /* it was aoptional so no match is ok */
493: return (ptr);
494: } else if (ptr != NIL) {
495:
496: /* not optional and we still matched */
497: return (NIL);
498: }
499: if (*s1 == '\\')
500: _escaped = _escaped ? FALSE : TRUE;
501: else
502: _escaped = FALSE;
503: } while (*s1++);
504: return (NIL);
505:
506: /* fail if we are currently _escaped */
507: case 'e':
508: if (_escaped)
509: return(NIL);
510: cs = MNEXT(cs);
511: break;
512:
513: /* match any number of tabs and spaces */
514: case 'd':
515: ptr = s;
516: while (*s == ' ' || *s == '\t')
517: s++;
518: if (s != ptr || s == _start) {
519:
520: /* match, be happy */
521: matched = 1;
522: cs = MNEXT(cs);
523: } else if (*s == '\n' || *s == '\0') {
524:
525: /* match, be happy */
526: matched = 1;
527: cs = MNEXT(cs);
528: } else if (*cs & ALT) {
529:
530: /* try the next part */
531: matched = 0;
532: cs = MNEXT(cs);
533: } else if (*cs & OPT) {
534:
535: /* doesn't matter */
536: matched = 1;
537: cs = MNEXT(cs);
538: } else
539:
540: /* no match, error return */
541: return (NIL);
542: break;
543:
544: /* check for end of line */
545: case '$':
546: if (*s == '\0' || *s == '\n') {
547:
548: /* match, be happy */
549: s++;
550: matched = 1;
551: cs = MNEXT(cs);
552: } else if (*cs & ALT) {
553:
554: /* try the next part */
555: matched = 0;
556: cs = MNEXT(cs);
557: } else if (*cs & OPT) {
558:
559: /* doesn't matter */
560: matched = 1;
561: cs = MNEXT(cs);
562: } else
563:
564: /* no match, error return */
565: return (NIL);
566: break;
567:
568: /* check for start of line */
569: case '^':
570: if (s == _start) {
571:
572: /* match, be happy */
573: matched = 1;
574: cs = MNEXT(cs);
575: } else if (*cs & ALT) {
576:
577: /* try the next part */
578: matched = 0;
579: cs = MNEXT(cs);
580: } else if (*cs & OPT) {
581:
582: /* doesn't matter */
583: matched = 1;
584: cs = MNEXT(cs);
585: } else
586:
587: /* no match, error return */
588: return (NIL);
589: break;
590:
591: /* end of a subexpression, return success */
592: case ')':
593: return (s);
594: }
595: break;
596: }
597: }
598: return (s);
599: }