Annotation of src/usr.bin/vgrind/regexp.c, Revision 1.8
1.8 ! deraadt 1: /* $OpenBSD: regexp.c,v 1.7 2003/06/03 02:56:21 millert Exp $ */
1.1 deraadt 2: /* $NetBSD: regexp.c,v 1.3 1994/11/17 08:28:02 jtc Exp $ */
3:
4: /*
5: * Copyright (c) 1980, 1993
6: * The Regents of the University of California. All rights reserved.
7: *
8: *
9: * Redistribution and use in source and binary forms, with or without
10: * modification, are permitted provided that the following conditions
11: * are met:
12: * 1. Redistributions of source code must retain the above copyright
13: * notice, this list of conditions and the following disclaimer.
14: * 2. Redistributions in binary form must reproduce the above copyright
15: * notice, this list of conditions and the following disclaimer in the
16: * documentation and/or other materials provided with the distribution.
1.7 millert 17: * 3. Neither the name of the University nor the names of its contributors
1.1 deraadt 18: * may be used to endorse or promote products derived from this software
19: * without specific prior written permission.
20: *
21: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31: * SUCH DAMAGE.
32: */
33:
34: #include <ctype.h>
35: #include <stdlib.h>
36: #include <string.h>
37: #include "extern.h"
38:
39: #define FALSE 0
40: #define TRUE !(FALSE)
41: #define NIL 0
42:
1.4 millert 43: static void expconv(void);
1.1 deraadt 44:
1.5 deraadt 45: boolean x_escaped; /* true if we are currently x_escaped */
46: char *x_start; /* start of string */
1.1 deraadt 47: boolean l_onecase; /* true if upper and lower equivalent */
48:
49: #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
50:
51: /* STRNCMP - like strncmp except that we convert the
52: * first string to lower case before comparing
53: * if l_onecase is set.
54: */
55:
56: int
1.6 deraadt 57: STRNCMP(char *s1, char *s2, int len)
1.1 deraadt 58: {
59: if (l_onecase) {
60: do
61: if (*s2 - makelower(*s1))
62: return (*s2 - makelower(*s1));
63: else {
64: s2++;
65: s1++;
66: }
67: while (--len);
68: } else {
69: do
70: if (*s2 - *s1)
71: return (*s2 - *s1);
72: else {
73: s2++;
74: s1++;
75: }
76: while (--len);
77: }
78: return(0);
79: }
80:
81: /* The following routine converts an irregular expression to
82: * internal format.
83: *
84: * Either meta symbols (\a \d or \p) or character strings or
1.5 deraadt 85: * operations ( alternation or parenthesizing ) can be
1.1 deraadt 86: * specified. Each starts with a descriptor byte. The descriptor
87: * byte has STR set for strings, META set for meta symbols
88: * and OPER set for operations.
89: * The descriptor byte can also have the OPT bit set if the object
90: * defined is optional. Also ALT can be set to indicate an alternation.
91: *
1.5 deraadt 92: * For metasymbols the byte following the descriptor byte identifies
1.1 deraadt 93: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
94: * strings the byte after the descriptor is a character count for
95: * the string:
96: *
97: * meta symbols := descriptor
98: * symbol
99: *
100: * strings := descriptor
101: * character count
102: * the string
103: *
104: * operatins := descriptor
105: * symbol
106: * character count
107: */
108:
109: /*
110: * handy macros for accessing parts of match blocks
111: */
112: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
113: #define MNEXT(A) (A+2) /* character following a metasymbol block */
114:
115: #define OSYM(A) (*(A+1)) /* symbol in an operation block */
116: #define OCNT(A) (*(A+2)) /* character count */
117: #define ONEXT(A) (A+3) /* next character after the operation */
118: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
119:
120: #define SCNT(A) (*(A+1)) /* byte count of a string */
121: #define SSTR(A) (A+2) /* address of the string */
122: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
123:
124: /*
125: * bit flags in the descriptor
126: */
127: #define OPT 1
128: #define STR 2
129: #define META 4
130: #define ALT 8
131: #define OPER 16
132:
133: static char *ccre; /* pointer to current position in converted exp*/
134: static char *ure; /* pointer current position in unconverted exp */
135:
136: char *
1.6 deraadt 137: convexp(char *re)
1.1 deraadt 138: {
1.3 mpech 139: char *cre; /* pointer to converted regular expression */
1.1 deraadt 140:
141: /* allocate room for the converted expression */
142: if (re == NIL)
143: return (NIL);
144: if (*re == '\0')
145: return (NIL);
146: cre = malloc (4 * strlen(re) + 3);
147: ccre = cre;
148: ure = re;
149:
150: /* start the conversion with a \a */
151: *cre = META | OPT;
152: MSYM(cre) = 'a';
153: ccre = MNEXT(cre);
154:
155: /* start the conversion (its recursive) */
156: expconv ();
157: *ccre = 0;
158: return (cre);
159: }
160:
161: static void
1.6 deraadt 162: expconv(void)
1.1 deraadt 163: {
1.3 mpech 164: char *cs; /* pointer to current symbol in converted exp */
165: char c; /* character being processed */
1.5 deraadt 166: char *acs; /* pointer to last alternate */
1.3 mpech 167: int temp;
1.1 deraadt 168:
169: /* let the conversion begin */
170: acs = NIL;
171: cs = NIL;
172: while (*ure != NIL) {
173: switch (c = *ure++) {
174:
175: case '\\':
176: switch (c = *ure++) {
177:
178: /* escaped characters are just characters */
179: default:
180: if (cs == NIL || (*cs & STR) == 0) {
181: cs = ccre;
182: *cs = STR;
183: SCNT(cs) = 1;
184: ccre += 2;
185: } else
186: SCNT(cs)++;
187: *ccre++ = c;
188: break;
189:
190: /* normal(?) metacharacters */
191: case 'a':
192: case 'd':
193: case 'e':
194: case 'p':
195: if (acs != NIL && acs != cs) {
196: do {
197: temp = OCNT(acs);
198: OCNT(acs) = ccre - acs;
199: acs -= temp;
200: } while (temp != 0);
201: acs = NIL;
202: }
203: cs = ccre;
204: *cs = META;
205: MSYM(cs) = c;
206: ccre = MNEXT(cs);
207: break;
208: }
209: break;
210:
211: /* just put the symbol in */
212: case '^':
213: case '$':
214: if (acs != NIL && acs != cs) {
215: do {
216: temp = OCNT(acs);
217: OCNT(acs) = ccre - acs;
218: acs -= temp;
219: } while (temp != 0);
220: acs = NIL;
221: }
222: cs = ccre;
223: *cs = META;
224: MSYM(cs) = c;
225: ccre = MNEXT(cs);
226: break;
227:
228: /* mark the last match sequence as optional */
229: case '?':
230: if (cs)
231: *cs = *cs | OPT;
232: break;
233:
234: /* recurse and define a subexpression */
235: case '(':
236: if (acs != NIL && acs != cs) {
237: do {
238: temp = OCNT(acs);
239: OCNT(acs) = ccre - acs;
240: acs -= temp;
241: } while (temp != 0);
242: acs = NIL;
243: }
244: cs = ccre;
245: *cs = OPER;
246: OSYM(cs) = '(';
247: ccre = ONEXT(cs);
248: expconv ();
249: OCNT(cs) = ccre - cs; /* offset to next symbol */
250: break;
251:
1.5 deraadt 252: /* return from a recursion */
1.1 deraadt 253: case ')':
254: if (acs != NIL) {
255: do {
256: temp = OCNT(acs);
257: OCNT(acs) = ccre - acs;
258: acs -= temp;
259: } while (temp != 0);
260: acs = NIL;
261: }
262: cs = ccre;
263: *cs = META;
264: MSYM(cs) = c;
265: ccre = MNEXT(cs);
266: return;
267:
268: /* mark the last match sequence as having an alternate */
269: /* the third byte will contain an offset to jump over the */
270: /* alternate match in case the first did not fail */
271: case '|':
272: if (acs != NIL && acs != cs)
273: OCNT(ccre) = ccre - acs; /* make a back pointer */
274: else
275: OCNT(ccre) = 0;
276: *cs |= ALT;
277: cs = ccre;
278: *cs = OPER;
279: OSYM(cs) = '|';
280: ccre = ONEXT(cs);
281: acs = cs; /* remember that the pointer is to be filles */
282: break;
283:
1.5 deraadt 284: /* if its not a metasymbol just build a character string */
1.1 deraadt 285: default:
286: if (cs == NIL || (*cs & STR) == 0) {
287: cs = ccre;
288: *cs = STR;
289: SCNT(cs) = 1;
290: ccre = SSTR(cs);
291: } else
292: SCNT(cs)++;
293: *ccre++ = c;
294: break;
295: }
296: }
297: if (acs != NIL) {
298: do {
299: temp = OCNT(acs);
300: OCNT(acs) = ccre - acs;
301: acs -= temp;
302: } while (temp != 0);
303: acs = NIL;
304: }
305: return;
306: }
1.5 deraadt 307: /* end of converter */
1.1 deraadt 308:
309:
310: /*
1.5 deraadt 311: * The following routine recognises an irregular expression
1.1 deraadt 312: * with the following special characters:
313: *
314: * \? - means last match was optional
315: * \a - matches any number of characters
316: * \d - matches any number of spaces and tabs
317: * \p - matches any number of alphanumeric
318: * characters. The
319: * characters matched will be copied into
320: * the area pointed to by 'name'.
321: * \| - alternation
322: * \( \) - grouping used mostly for alternation and
323: * optionality
324: *
325: * The irregular expression must be translated to internal form
326: * prior to calling this routine
327: *
328: * The value returned is the pointer to the first non \a
329: * character matched.
330: */
331:
332: char *
1.6 deraadt 333: expmatch(char *s, char *re, char *mstring)
1.1 deraadt 334: {
1.3 mpech 335: char *cs; /* the current symbol */
336: char *ptr,*s1; /* temporary pointer */
1.1 deraadt 337: boolean matched; /* a temporary boolean */
338:
339: /* initial conditions */
340: if (re == NIL)
341: return (NIL);
342: cs = re;
343: matched = FALSE;
344:
345: /* loop till expression string is exhausted (or at least pretty tired) */
346: while (*cs) {
347: switch (*cs & (OPER | STR | META)) {
348:
349: /* try to match a string */
350: case STR:
351: matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
352: if (matched) {
353:
354: /* hoorah it matches */
355: s += SCNT(cs);
356: cs = SNEXT(cs);
357: } else if (*cs & ALT) {
358:
359: /* alternation, skip to next expression */
360: cs = SNEXT(cs);
361: } else if (*cs & OPT) {
362:
363: /* the match is optional */
364: cs = SNEXT(cs);
365: matched = 1; /* indicate a successful match */
366: } else {
367:
368: /* no match, error return */
369: return (NIL);
370: }
371: break;
372:
373: /* an operator, do something fancy */
374: case OPER:
375: switch (OSYM(cs)) {
376:
377: /* this is an alternation */
378: case '|':
379: if (matched)
380:
381: /* last thing in the alternation was a match, skip ahead */
382: cs = OPTR(cs);
383: else
384:
385: /* no match, keep trying */
386: cs = ONEXT(cs);
387: break;
388:
389: /* this is a grouping, recurse */
390: case '(':
391: ptr = expmatch (s, ONEXT(cs), mstring);
392: if (ptr != NIL) {
393:
394: /* the subexpression matched */
395: matched = 1;
396: s = ptr;
397: } else if (*cs & ALT) {
398:
399: /* alternation, skip to next expression */
400: matched = 0;
401: } else if (*cs & OPT) {
402:
403: /* the match is optional */
404: matched = 1; /* indicate a successful match */
405: } else {
406:
407: /* no match, error return */
408: return (NIL);
409: }
410: cs = OPTR(cs);
411: break;
412: }
413: break;
414:
415: /* try to match a metasymbol */
416: case META:
417: switch (MSYM(cs)) {
418:
419: /* try to match anything and remember what was matched */
420: case 'p':
421: /*
422: * This is really the same as trying the match the
423: * remaining parts of the expression to any subset
424: * of the string.
425: */
426: s1 = s;
427: do {
428: ptr = expmatch (s1, MNEXT(cs), mstring);
429: if (ptr != NIL && s1 != s) {
430:
431: /* we have a match, remember the match */
432: strncpy (mstring, s, s1 - s);
433: mstring[s1 - s] = '\0';
434: return (ptr);
435: } else if (ptr != NIL && (*cs & OPT)) {
436:
1.5 deraadt 437: /* it was optional so no match is ok */
1.1 deraadt 438: return (ptr);
439: } else if (ptr != NIL) {
440:
441: /* not optional and we still matched */
442: return (NIL);
443: }
444: if (!isalnum(*s1) && *s1 != '_')
445: return (NIL);
446: if (*s1 == '\\')
1.5 deraadt 447: x_escaped = x_escaped ? FALSE : TRUE;
1.1 deraadt 448: else
1.5 deraadt 449: x_escaped = FALSE;
1.1 deraadt 450: } while (*s1++);
451: return (NIL);
452:
453: /* try to match anything */
454: case 'a':
455: /*
456: * This is really the same as trying the match the
457: * remaining parts of the expression to any subset
458: * of the string.
459: */
460: s1 = s;
461: do {
462: ptr = expmatch (s1, MNEXT(cs), mstring);
463: if (ptr != NIL && s1 != s) {
464:
465: /* we have a match */
466: return (ptr);
467: } else if (ptr != NIL && (*cs & OPT)) {
468:
1.5 deraadt 469: /* it was optional so no match is ok */
1.1 deraadt 470: return (ptr);
471: } else if (ptr != NIL) {
472:
473: /* not optional and we still matched */
474: return (NIL);
475: }
476: if (*s1 == '\\')
1.5 deraadt 477: x_escaped = x_escaped ? FALSE : TRUE;
1.1 deraadt 478: else
1.5 deraadt 479: x_escaped = FALSE;
1.1 deraadt 480: } while (*s1++);
481: return (NIL);
482:
1.5 deraadt 483: /* fail if we are currently x_escaped */
1.1 deraadt 484: case 'e':
1.5 deraadt 485: if (x_escaped)
1.1 deraadt 486: return(NIL);
487: cs = MNEXT(cs);
488: break;
489:
490: /* match any number of tabs and spaces */
491: case 'd':
492: ptr = s;
493: while (*s == ' ' || *s == '\t')
494: s++;
1.5 deraadt 495: if (s != ptr || s == x_start) {
1.1 deraadt 496:
497: /* match, be happy */
498: matched = 1;
499: cs = MNEXT(cs);
500: } else if (*s == '\n' || *s == '\0') {
501:
502: /* match, be happy */
503: matched = 1;
504: cs = MNEXT(cs);
505: } else if (*cs & ALT) {
506:
507: /* try the next part */
508: matched = 0;
509: cs = MNEXT(cs);
510: } else if (*cs & OPT) {
511:
512: /* doesn't matter */
513: matched = 1;
514: cs = MNEXT(cs);
515: } else
516:
517: /* no match, error return */
518: return (NIL);
519: break;
520:
521: /* check for end of line */
522: case '$':
523: if (*s == '\0' || *s == '\n') {
524:
525: /* match, be happy */
526: s++;
527: matched = 1;
528: cs = MNEXT(cs);
529: } else if (*cs & ALT) {
530:
531: /* try the next part */
532: matched = 0;
533: cs = MNEXT(cs);
534: } else if (*cs & OPT) {
535:
536: /* doesn't matter */
537: matched = 1;
538: cs = MNEXT(cs);
539: } else
540:
541: /* no match, error return */
542: return (NIL);
543: break;
544:
545: /* check for start of line */
546: case '^':
1.5 deraadt 547: if (s == x_start) {
1.1 deraadt 548:
549: /* match, be happy */
550: matched = 1;
551: cs = MNEXT(cs);
552: } else if (*cs & ALT) {
553:
554: /* try the next part */
555: matched = 0;
556: cs = MNEXT(cs);
557: } else if (*cs & OPT) {
558:
559: /* doesn't matter */
560: matched = 1;
561: cs = MNEXT(cs);
562: } else
563:
564: /* no match, error return */
565: return (NIL);
566: break;
567:
568: /* end of a subexpression, return success */
569: case ')':
570: return (s);
571: }
572: break;
573: }
574: }
575: return (s);
576: }