Annotation of src/usr.bin/vgrind/regexp.c, Revision 1.1
1.1 ! deraadt 1: /* $NetBSD: regexp.c,v 1.3 1994/11/17 08:28:02 jtc Exp $ */
! 2:
! 3: /*
! 4: * Copyright (c) 1980, 1993
! 5: * The Regents of the University of California. All rights reserved.
! 6: *
! 7: *
! 8: * Redistribution and use in source and binary forms, with or without
! 9: * modification, are permitted provided that the following conditions
! 10: * are met:
! 11: * 1. Redistributions of source code must retain the above copyright
! 12: * notice, this list of conditions and the following disclaimer.
! 13: * 2. Redistributions in binary form must reproduce the above copyright
! 14: * notice, this list of conditions and the following disclaimer in the
! 15: * documentation and/or other materials provided with the distribution.
! 16: * 3. All advertising materials mentioning features or use of this software
! 17: * must display the following acknowledgement:
! 18: * This product includes software developed by the University of
! 19: * California, Berkeley and its contributors.
! 20: * 4. Neither the name of the University nor the names of its contributors
! 21: * may be used to endorse or promote products derived from this software
! 22: * without specific prior written permission.
! 23: *
! 24: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
! 25: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 26: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 27: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
! 28: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 29: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 30: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 31: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 32: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 33: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 34: * SUCH DAMAGE.
! 35: */
! 36:
! 37: #ifndef lint
! 38: static char copyright[] =
! 39: "@(#) Copyright (c) 1980, 1993\n\
! 40: The Regents of the University of California. All rights reserved.\n";
! 41: #endif /* not lint */
! 42:
! 43: #ifndef lint
! 44: #if 0
! 45: static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
! 46: #endif
! 47: static char rcsid[] = "$NetBSD: regexp.c,v 1.3 1994/11/17 08:28:02 jtc Exp $";
! 48: #endif /* not lint */
! 49:
! 50: #include <ctype.h>
! 51: #include <stdlib.h>
! 52: #include <string.h>
! 53: #include "extern.h"
! 54:
! 55: #define FALSE 0
! 56: #define TRUE !(FALSE)
! 57: #define NIL 0
! 58:
! 59: static void expconv __P((void));
! 60:
! 61: boolean _escaped; /* true if we are currently _escaped */
! 62: char *_start; /* start of string */
! 63: boolean l_onecase; /* true if upper and lower equivalent */
! 64:
! 65: #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
! 66:
! 67: /* STRNCMP - like strncmp except that we convert the
! 68: * first string to lower case before comparing
! 69: * if l_onecase is set.
! 70: */
! 71:
! 72: int
! 73: STRNCMP(s1, s2, len)
! 74: register char *s1,*s2;
! 75: register int len;
! 76: {
! 77: if (l_onecase) {
! 78: do
! 79: if (*s2 - makelower(*s1))
! 80: return (*s2 - makelower(*s1));
! 81: else {
! 82: s2++;
! 83: s1++;
! 84: }
! 85: while (--len);
! 86: } else {
! 87: do
! 88: if (*s2 - *s1)
! 89: return (*s2 - *s1);
! 90: else {
! 91: s2++;
! 92: s1++;
! 93: }
! 94: while (--len);
! 95: }
! 96: return(0);
! 97: }
! 98:
! 99: /* The following routine converts an irregular expression to
! 100: * internal format.
! 101: *
! 102: * Either meta symbols (\a \d or \p) or character strings or
! 103: * operations ( alternation or perenthesizing ) can be
! 104: * specified. Each starts with a descriptor byte. The descriptor
! 105: * byte has STR set for strings, META set for meta symbols
! 106: * and OPER set for operations.
! 107: * The descriptor byte can also have the OPT bit set if the object
! 108: * defined is optional. Also ALT can be set to indicate an alternation.
! 109: *
! 110: * For metasymbols the byte following the descriptor byte identities
! 111: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
! 112: * strings the byte after the descriptor is a character count for
! 113: * the string:
! 114: *
! 115: * meta symbols := descriptor
! 116: * symbol
! 117: *
! 118: * strings := descriptor
! 119: * character count
! 120: * the string
! 121: *
! 122: * operatins := descriptor
! 123: * symbol
! 124: * character count
! 125: */
! 126:
! 127: /*
! 128: * handy macros for accessing parts of match blocks
! 129: */
! 130: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
! 131: #define MNEXT(A) (A+2) /* character following a metasymbol block */
! 132:
! 133: #define OSYM(A) (*(A+1)) /* symbol in an operation block */
! 134: #define OCNT(A) (*(A+2)) /* character count */
! 135: #define ONEXT(A) (A+3) /* next character after the operation */
! 136: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
! 137:
! 138: #define SCNT(A) (*(A+1)) /* byte count of a string */
! 139: #define SSTR(A) (A+2) /* address of the string */
! 140: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
! 141:
! 142: /*
! 143: * bit flags in the descriptor
! 144: */
! 145: #define OPT 1
! 146: #define STR 2
! 147: #define META 4
! 148: #define ALT 8
! 149: #define OPER 16
! 150:
! 151: static char *ccre; /* pointer to current position in converted exp*/
! 152: static char *ure; /* pointer current position in unconverted exp */
! 153:
! 154: char *
! 155: convexp(re)
! 156: char *re; /* unconverted irregular expression */
! 157: {
! 158: register char *cre; /* pointer to converted regular expression */
! 159:
! 160: /* allocate room for the converted expression */
! 161: if (re == NIL)
! 162: return (NIL);
! 163: if (*re == '\0')
! 164: return (NIL);
! 165: cre = malloc (4 * strlen(re) + 3);
! 166: ccre = cre;
! 167: ure = re;
! 168:
! 169: /* start the conversion with a \a */
! 170: *cre = META | OPT;
! 171: MSYM(cre) = 'a';
! 172: ccre = MNEXT(cre);
! 173:
! 174: /* start the conversion (its recursive) */
! 175: expconv ();
! 176: *ccre = 0;
! 177: return (cre);
! 178: }
! 179:
! 180: static void
! 181: expconv()
! 182: {
! 183: register char *cs; /* pointer to current symbol in converted exp */
! 184: register char c; /* character being processed */
! 185: register char *acs; /* pinter to last alternate */
! 186: register int temp;
! 187:
! 188: /* let the conversion begin */
! 189: acs = NIL;
! 190: cs = NIL;
! 191: while (*ure != NIL) {
! 192: switch (c = *ure++) {
! 193:
! 194: case '\\':
! 195: switch (c = *ure++) {
! 196:
! 197: /* escaped characters are just characters */
! 198: default:
! 199: if (cs == NIL || (*cs & STR) == 0) {
! 200: cs = ccre;
! 201: *cs = STR;
! 202: SCNT(cs) = 1;
! 203: ccre += 2;
! 204: } else
! 205: SCNT(cs)++;
! 206: *ccre++ = c;
! 207: break;
! 208:
! 209: /* normal(?) metacharacters */
! 210: case 'a':
! 211: case 'd':
! 212: case 'e':
! 213: case 'p':
! 214: if (acs != NIL && acs != cs) {
! 215: do {
! 216: temp = OCNT(acs);
! 217: OCNT(acs) = ccre - acs;
! 218: acs -= temp;
! 219: } while (temp != 0);
! 220: acs = NIL;
! 221: }
! 222: cs = ccre;
! 223: *cs = META;
! 224: MSYM(cs) = c;
! 225: ccre = MNEXT(cs);
! 226: break;
! 227: }
! 228: break;
! 229:
! 230: /* just put the symbol in */
! 231: case '^':
! 232: case '$':
! 233: if (acs != NIL && acs != cs) {
! 234: do {
! 235: temp = OCNT(acs);
! 236: OCNT(acs) = ccre - acs;
! 237: acs -= temp;
! 238: } while (temp != 0);
! 239: acs = NIL;
! 240: }
! 241: cs = ccre;
! 242: *cs = META;
! 243: MSYM(cs) = c;
! 244: ccre = MNEXT(cs);
! 245: break;
! 246:
! 247: /* mark the last match sequence as optional */
! 248: case '?':
! 249: if (cs)
! 250: *cs = *cs | OPT;
! 251: break;
! 252:
! 253: /* recurse and define a subexpression */
! 254: case '(':
! 255: if (acs != NIL && acs != cs) {
! 256: do {
! 257: temp = OCNT(acs);
! 258: OCNT(acs) = ccre - acs;
! 259: acs -= temp;
! 260: } while (temp != 0);
! 261: acs = NIL;
! 262: }
! 263: cs = ccre;
! 264: *cs = OPER;
! 265: OSYM(cs) = '(';
! 266: ccre = ONEXT(cs);
! 267: expconv ();
! 268: OCNT(cs) = ccre - cs; /* offset to next symbol */
! 269: break;
! 270:
! 271: /* reurn from a recursion */
! 272: case ')':
! 273: if (acs != NIL) {
! 274: do {
! 275: temp = OCNT(acs);
! 276: OCNT(acs) = ccre - acs;
! 277: acs -= temp;
! 278: } while (temp != 0);
! 279: acs = NIL;
! 280: }
! 281: cs = ccre;
! 282: *cs = META;
! 283: MSYM(cs) = c;
! 284: ccre = MNEXT(cs);
! 285: return;
! 286:
! 287: /* mark the last match sequence as having an alternate */
! 288: /* the third byte will contain an offset to jump over the */
! 289: /* alternate match in case the first did not fail */
! 290: case '|':
! 291: if (acs != NIL && acs != cs)
! 292: OCNT(ccre) = ccre - acs; /* make a back pointer */
! 293: else
! 294: OCNT(ccre) = 0;
! 295: *cs |= ALT;
! 296: cs = ccre;
! 297: *cs = OPER;
! 298: OSYM(cs) = '|';
! 299: ccre = ONEXT(cs);
! 300: acs = cs; /* remember that the pointer is to be filles */
! 301: break;
! 302:
! 303: /* if its not a metasymbol just build a scharacter string */
! 304: default:
! 305: if (cs == NIL || (*cs & STR) == 0) {
! 306: cs = ccre;
! 307: *cs = STR;
! 308: SCNT(cs) = 1;
! 309: ccre = SSTR(cs);
! 310: } else
! 311: SCNT(cs)++;
! 312: *ccre++ = c;
! 313: break;
! 314: }
! 315: }
! 316: if (acs != NIL) {
! 317: do {
! 318: temp = OCNT(acs);
! 319: OCNT(acs) = ccre - acs;
! 320: acs -= temp;
! 321: } while (temp != 0);
! 322: acs = NIL;
! 323: }
! 324: return;
! 325: }
! 326: /* end of convertre */
! 327:
! 328:
! 329: /*
! 330: * The following routine recognises an irregular expresion
! 331: * with the following special characters:
! 332: *
! 333: * \? - means last match was optional
! 334: * \a - matches any number of characters
! 335: * \d - matches any number of spaces and tabs
! 336: * \p - matches any number of alphanumeric
! 337: * characters. The
! 338: * characters matched will be copied into
! 339: * the area pointed to by 'name'.
! 340: * \| - alternation
! 341: * \( \) - grouping used mostly for alternation and
! 342: * optionality
! 343: *
! 344: * The irregular expression must be translated to internal form
! 345: * prior to calling this routine
! 346: *
! 347: * The value returned is the pointer to the first non \a
! 348: * character matched.
! 349: */
! 350:
! 351: char *
! 352: expmatch (s, re, mstring)
! 353: register char *s; /* string to check for a match in */
! 354: register char *re; /* a converted irregular expression */
! 355: register char *mstring; /* where to put whatever matches a \p */
! 356: {
! 357: register char *cs; /* the current symbol */
! 358: register char *ptr,*s1; /* temporary pointer */
! 359: boolean matched; /* a temporary boolean */
! 360:
! 361: /* initial conditions */
! 362: if (re == NIL)
! 363: return (NIL);
! 364: cs = re;
! 365: matched = FALSE;
! 366:
! 367: /* loop till expression string is exhausted (or at least pretty tired) */
! 368: while (*cs) {
! 369: switch (*cs & (OPER | STR | META)) {
! 370:
! 371: /* try to match a string */
! 372: case STR:
! 373: matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
! 374: if (matched) {
! 375:
! 376: /* hoorah it matches */
! 377: s += SCNT(cs);
! 378: cs = SNEXT(cs);
! 379: } else if (*cs & ALT) {
! 380:
! 381: /* alternation, skip to next expression */
! 382: cs = SNEXT(cs);
! 383: } else if (*cs & OPT) {
! 384:
! 385: /* the match is optional */
! 386: cs = SNEXT(cs);
! 387: matched = 1; /* indicate a successful match */
! 388: } else {
! 389:
! 390: /* no match, error return */
! 391: return (NIL);
! 392: }
! 393: break;
! 394:
! 395: /* an operator, do something fancy */
! 396: case OPER:
! 397: switch (OSYM(cs)) {
! 398:
! 399: /* this is an alternation */
! 400: case '|':
! 401: if (matched)
! 402:
! 403: /* last thing in the alternation was a match, skip ahead */
! 404: cs = OPTR(cs);
! 405: else
! 406:
! 407: /* no match, keep trying */
! 408: cs = ONEXT(cs);
! 409: break;
! 410:
! 411: /* this is a grouping, recurse */
! 412: case '(':
! 413: ptr = expmatch (s, ONEXT(cs), mstring);
! 414: if (ptr != NIL) {
! 415:
! 416: /* the subexpression matched */
! 417: matched = 1;
! 418: s = ptr;
! 419: } else if (*cs & ALT) {
! 420:
! 421: /* alternation, skip to next expression */
! 422: matched = 0;
! 423: } else if (*cs & OPT) {
! 424:
! 425: /* the match is optional */
! 426: matched = 1; /* indicate a successful match */
! 427: } else {
! 428:
! 429: /* no match, error return */
! 430: return (NIL);
! 431: }
! 432: cs = OPTR(cs);
! 433: break;
! 434: }
! 435: break;
! 436:
! 437: /* try to match a metasymbol */
! 438: case META:
! 439: switch (MSYM(cs)) {
! 440:
! 441: /* try to match anything and remember what was matched */
! 442: case 'p':
! 443: /*
! 444: * This is really the same as trying the match the
! 445: * remaining parts of the expression to any subset
! 446: * of the string.
! 447: */
! 448: s1 = s;
! 449: do {
! 450: ptr = expmatch (s1, MNEXT(cs), mstring);
! 451: if (ptr != NIL && s1 != s) {
! 452:
! 453: /* we have a match, remember the match */
! 454: strncpy (mstring, s, s1 - s);
! 455: mstring[s1 - s] = '\0';
! 456: return (ptr);
! 457: } else if (ptr != NIL && (*cs & OPT)) {
! 458:
! 459: /* it was aoptional so no match is ok */
! 460: return (ptr);
! 461: } else if (ptr != NIL) {
! 462:
! 463: /* not optional and we still matched */
! 464: return (NIL);
! 465: }
! 466: if (!isalnum(*s1) && *s1 != '_')
! 467: return (NIL);
! 468: if (*s1 == '\\')
! 469: _escaped = _escaped ? FALSE : TRUE;
! 470: else
! 471: _escaped = FALSE;
! 472: } while (*s1++);
! 473: return (NIL);
! 474:
! 475: /* try to match anything */
! 476: case 'a':
! 477: /*
! 478: * This is really the same as trying the match the
! 479: * remaining parts of the expression to any subset
! 480: * of the string.
! 481: */
! 482: s1 = s;
! 483: do {
! 484: ptr = expmatch (s1, MNEXT(cs), mstring);
! 485: if (ptr != NIL && s1 != s) {
! 486:
! 487: /* we have a match */
! 488: return (ptr);
! 489: } else if (ptr != NIL && (*cs & OPT)) {
! 490:
! 491: /* it was aoptional so no match is ok */
! 492: return (ptr);
! 493: } else if (ptr != NIL) {
! 494:
! 495: /* not optional and we still matched */
! 496: return (NIL);
! 497: }
! 498: if (*s1 == '\\')
! 499: _escaped = _escaped ? FALSE : TRUE;
! 500: else
! 501: _escaped = FALSE;
! 502: } while (*s1++);
! 503: return (NIL);
! 504:
! 505: /* fail if we are currently _escaped */
! 506: case 'e':
! 507: if (_escaped)
! 508: return(NIL);
! 509: cs = MNEXT(cs);
! 510: break;
! 511:
! 512: /* match any number of tabs and spaces */
! 513: case 'd':
! 514: ptr = s;
! 515: while (*s == ' ' || *s == '\t')
! 516: s++;
! 517: if (s != ptr || s == _start) {
! 518:
! 519: /* match, be happy */
! 520: matched = 1;
! 521: cs = MNEXT(cs);
! 522: } else if (*s == '\n' || *s == '\0') {
! 523:
! 524: /* match, be happy */
! 525: matched = 1;
! 526: cs = MNEXT(cs);
! 527: } else if (*cs & ALT) {
! 528:
! 529: /* try the next part */
! 530: matched = 0;
! 531: cs = MNEXT(cs);
! 532: } else if (*cs & OPT) {
! 533:
! 534: /* doesn't matter */
! 535: matched = 1;
! 536: cs = MNEXT(cs);
! 537: } else
! 538:
! 539: /* no match, error return */
! 540: return (NIL);
! 541: break;
! 542:
! 543: /* check for end of line */
! 544: case '$':
! 545: if (*s == '\0' || *s == '\n') {
! 546:
! 547: /* match, be happy */
! 548: s++;
! 549: matched = 1;
! 550: cs = MNEXT(cs);
! 551: } else if (*cs & ALT) {
! 552:
! 553: /* try the next part */
! 554: matched = 0;
! 555: cs = MNEXT(cs);
! 556: } else if (*cs & OPT) {
! 557:
! 558: /* doesn't matter */
! 559: matched = 1;
! 560: cs = MNEXT(cs);
! 561: } else
! 562:
! 563: /* no match, error return */
! 564: return (NIL);
! 565: break;
! 566:
! 567: /* check for start of line */
! 568: case '^':
! 569: if (s == _start) {
! 570:
! 571: /* match, be happy */
! 572: matched = 1;
! 573: cs = MNEXT(cs);
! 574: } else if (*cs & ALT) {
! 575:
! 576: /* try the next part */
! 577: matched = 0;
! 578: cs = MNEXT(cs);
! 579: } else if (*cs & OPT) {
! 580:
! 581: /* doesn't matter */
! 582: matched = 1;
! 583: cs = MNEXT(cs);
! 584: } else
! 585:
! 586: /* no match, error return */
! 587: return (NIL);
! 588: break;
! 589:
! 590: /* end of a subexpression, return success */
! 591: case ')':
! 592: return (s);
! 593: }
! 594: break;
! 595: }
! 596: }
! 597: return (s);
! 598: }