Annotation of src/usr.bin/vim/regexp.c, Revision 1.1
1.1 ! downsj 1: /* $OpenBSD$ */
! 2: /* vi:set ts=4 sw=4:
! 3: * NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE
! 4: *
! 5: * This is NOT the original regular expression code as written by
! 6: * Henry Spencer. This code has been modified specifically for use
! 7: * with the VIM editor, and should not be used apart from compiling
! 8: * VIM. If you want a good regular expression library, get the
! 9: * original code. The copyright notice that follows is from the
! 10: * original.
! 11: *
! 12: * NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE
! 13: *
! 14: *
! 15: * vim_regcomp and vim_regexec -- vim_regsub and regerror are elsewhere
! 16: *
! 17: * Copyright (c) 1986 by University of Toronto.
! 18: * Written by Henry Spencer. Not derived from licensed software.
! 19: *
! 20: * Permission is granted to anyone to use this software for any
! 21: * purpose on any computer system, and to redistribute it freely,
! 22: * subject to the following restrictions:
! 23: *
! 24: * 1. The author is not responsible for the consequences of use of
! 25: * this software, no matter how awful, even if they arise
! 26: * from defects in it.
! 27: *
! 28: * 2. The origin of this software must not be misrepresented, either
! 29: * by explicit claim or by omission.
! 30: *
! 31: * 3. Altered versions must be plainly marked as such, and must not
! 32: * be misrepresented as being the original software.
! 33: *
! 34: * Beware that some of this code is subtly aware of the way operator
! 35: * precedence is structured in regular expressions. Serious changes in
! 36: * regular-expression syntax might require a total rethink.
! 37: *
! 38: * $Log: regexp.c,v $
! 39: * Revision 1.2 88/04/28 08:09:45 tony
! 40: * First modification of the regexp library. Added an external variable
! 41: * 'reg_ic' which can be set to indicate that case should be ignored.
! 42: * Added a new parameter to vim_regexec() to indicate that the given string
! 43: * comes from the beginning of a line and is thus eligible to match
! 44: * 'beginning-of-line'.
! 45: *
! 46: * Revisions by Olaf 'Rhialto' Seibert, rhialto@mbfys.kun.nl:
! 47: * Changes for vi: (the semantics of several things were rather different)
! 48: * - Added lexical analyzer, because in vi magicness of characters
! 49: * is rather difficult, and may change over time.
! 50: * - Added support for \< \> \1-\9 and ~
! 51: * - Left some magic stuff in, but only backslashed: \| \+
! 52: * - * and \+ still work after \) even though they shouldn't.
! 53: */
! 54: #include "vim.h"
! 55: #include "globals.h"
! 56: #include "proto.h"
! 57: #undef DEBUG
! 58:
! 59: #include <stdio.h>
! 60: #include "regexp.h"
! 61: #include "option.h"
! 62:
! 63: /*
! 64: * The "internal use only" fields in regexp.h are present to pass info from
! 65: * compile to execute that permits the execute phase to run lots faster on
! 66: * simple cases. They are:
! 67: *
! 68: * regstart char that must begin a match; '\0' if none obvious
! 69: * reganch is the match anchored (at beginning-of-line only)?
! 70: * regmust string (pointer into program) that match must include, or NULL
! 71: * regmlen length of regmust string
! 72: *
! 73: * Regstart and reganch permit very fast decisions on suitable starting points
! 74: * for a match, cutting down the work a lot. Regmust permits fast rejection
! 75: * of lines that cannot possibly match. The regmust tests are costly enough
! 76: * that vim_regcomp() supplies a regmust only if the r.e. contains something
! 77: * potentially expensive (at present, the only such thing detected is * or +
! 78: * at the start of the r.e., which can involve a lot of backup). Regmlen is
! 79: * supplied because the test in vim_regexec() needs it and vim_regcomp() is
! 80: * computing it anyway.
! 81: */
! 82:
! 83: /*
! 84: * Structure for regexp "program". This is essentially a linear encoding
! 85: * of a nondeterministic finite-state machine (aka syntax charts or
! 86: * "railroad normal form" in parsing technology). Each node is an opcode
! 87: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
! 88: * all nodes except BRANCH implement concatenation; a "next" pointer with
! 89: * a BRANCH on both ends of it is connecting two alternatives. (Here we
! 90: * have one of the subtle syntax dependencies: an individual BRANCH (as
! 91: * opposed to a collection of them) is never concatenated with anything
! 92: * because of operator precedence.) The operand of some types of node is
! 93: * a literal string; for others, it is a node leading into a sub-FSM. In
! 94: * particular, the operand of a BRANCH node is the first node of the branch.
! 95: * (NB this is *not* a tree structure: the tail of the branch connects
! 96: * to the thing following the set of BRANCHes.) The opcodes are:
! 97: */
! 98:
! 99: /* definition number opnd? meaning */
! 100: #define END 0 /* no End of program. */
! 101: #define BOL 1 /* no Match "" at beginning of line. */
! 102: #define EOL 2 /* no Match "" at end of line. */
! 103: #define ANY 3 /* no Match any one character. */
! 104: #define ANYOF 4 /* str Match any character in this string. */
! 105: #define ANYBUT 5 /* str Match any character not in this
! 106: * string. */
! 107: #define BRANCH 6 /* node Match this alternative, or the
! 108: * next... */
! 109: #define BACK 7 /* no Match "", "next" ptr points backward. */
! 110: #define EXACTLY 8 /* str Match this string. */
! 111: #define NOTHING 9 /* no Match empty string. */
! 112: #define STAR 10 /* node Match this (simple) thing 0 or more
! 113: * times. */
! 114: #define PLUS 11 /* node Match this (simple) thing 1 or more
! 115: * times. */
! 116: #define BOW 12 /* no Match "" after [^a-zA-Z0-9_] */
! 117: #define EOW 13 /* no Match "" at [^a-zA-Z0-9_] */
! 118: #define IDENT 14 /* no Match identifier char */
! 119: #define WORD 15 /* no Match keyword char */
! 120: #define FNAME 16 /* no Match file name char */
! 121: #define PRINT 17 /* no Match printable char */
! 122: #define SIDENT 18 /* no Match identifier char but no digit */
! 123: #define SWORD 19 /* no Match word char but no digit */
! 124: #define SFNAME 20 /* no Match file name char but no digit */
! 125: #define SPRINT 21 /* no Match printable char but no digit */
! 126: #define MOPEN 30 /* no Mark this point in input as start of
! 127: * #n. */
! 128: /* MOPEN+1 is number 1, etc. */
! 129: #define MCLOSE 40 /* no Analogous to MOPEN. */
! 130: #define BACKREF 50 /* node Match same string again \1-\9 */
! 131:
! 132: #define Magic(x) ((x)|('\\'<<8))
! 133:
! 134: /*
! 135: * Opcode notes:
! 136: *
! 137: * BRANCH The set of branches constituting a single choice are hooked
! 138: * together with their "next" pointers, since precedence prevents
! 139: * anything being concatenated to any individual branch. The
! 140: * "next" pointer of the last BRANCH in a choice points to the
! 141: * thing following the whole choice. This is also where the
! 142: * final "next" pointer of each individual branch points; each
! 143: * branch starts with the operand node of a BRANCH node.
! 144: *
! 145: * BACK Normal "next" pointers all implicitly point forward; BACK
! 146: * exists to make loop structures possible.
! 147: *
! 148: * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
! 149: * BRANCH structures using BACK. Simple cases (one character
! 150: * per match) are implemented with STAR and PLUS for speed
! 151: * and to minimize recursive plunges.
! 152: * Note: We would like to use "\?" instead of "\=", but a "\?"
! 153: * can be part of a pattern to escape the special meaning of '?'
! 154: * at the end of the pattern in "?pattern?e".
! 155: *
! 156: * MOPEN,MCLOSE ...are numbered at compile time.
! 157: */
! 158:
! 159: /*
! 160: * A node is one char of opcode followed by two chars of "next" pointer.
! 161: * "Next" pointers are stored as two 8-bit pieces, high order first. The
! 162: * value is a positive offset from the opcode of the node containing it.
! 163: * An operand, if any, simply follows the node. (Note that much of the
! 164: * code generation knows about this implicit relationship.)
! 165: *
! 166: * Using two bytes for the "next" pointer is vast overkill for most things,
! 167: * but allows patterns to get big without disasters.
! 168: */
! 169: #define OP(p) (*(p))
! 170: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
! 171: #define OPERAND(p) ((p) + 3)
! 172:
! 173: /*
! 174: * Utility definitions.
! 175: */
! 176: #ifndef CHARBITS
! 177: #define UCHARAT(p) ((int)*(unsigned char *)(p))
! 178: #else
! 179: #define UCHARAT(p) ((int)*(p)&CHARBITS)
! 180: #endif
! 181:
! 182: #define EMSG_RETURN(m) { emsg(m); rc_did_emsg = TRUE; return NULL; }
! 183:
! 184: static int ismult __ARGS((int));
! 185: static char_u *cstrchr __ARGS((char_u *, int));
! 186:
! 187: static int
! 188: ismult(c)
! 189: int c;
! 190: {
! 191: return (c == Magic('*') || c == Magic('+') || c == Magic('='));
! 192: }
! 193:
! 194: /*
! 195: * Flags to be passed up and down.
! 196: */
! 197: #define HASWIDTH 01 /* Known never to match null string. */
! 198: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
! 199: #define SPSTART 04 /* Starts with * or +. */
! 200: #define WORST 0 /* Worst case. */
! 201:
! 202: /*
! 203: * The following allows empty REs, meaning "the same as the previous RE".
! 204: * per the ed(1) manual.
! 205: */
! 206: /* #define EMPTY_RE */ /* this is done outside of regexp */
! 207: #ifdef EMPTY_RE
! 208: char_u *reg_prev_re;
! 209: #endif
! 210:
! 211: #define TILDE
! 212: #ifdef TILDE
! 213: char_u *reg_prev_sub;
! 214: #endif
! 215:
! 216: /*
! 217: * Global work variables for vim_regcomp().
! 218: */
! 219:
! 220: static char_u *regparse; /* Input-scan pointer. */
! 221: static int regnpar; /* () count. */
! 222: static char_u regdummy;
! 223: static char_u *regcode; /* Code-emit pointer; ®dummy = don't. */
! 224: static long regsize; /* Code size. */
! 225: static char_u **regendp; /* Ditto for endp. */
! 226:
! 227: /*
! 228: * META contains all characters that may be magic, except '^' and '$'.
! 229: * This depends on the configuration options TILDE, BACKREF.
! 230: * (could be done simpler for compilers that know string concatenation)
! 231: */
! 232:
! 233: #ifdef TILDE
! 234: # ifdef BACKREF
! 235: static char_u META[] = ".[()|=+*<>iIkKfFpP~123456789";
! 236: # else
! 237: static char_u META[] = ".[()|=+*<>iIkKfFpP~";
! 238: # endif
! 239: #else
! 240: # ifdef BACKREF
! 241: static char_u META[] = ".[()|=+*<>iIkKfFpP123456789";
! 242: # else
! 243: static char_u META[] = ".[()|=+*<>iIkKfFpP";
! 244: # endif
! 245: #endif
! 246:
! 247: /*
! 248: * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
! 249: * These are:
! 250: * \r - New line (CR).
! 251: * \t - Tab (TAB).
! 252: * \e - Escape (ESC).
! 253: * \b - Backspace (Ctrl('H')).
! 254: */
! 255: static char_u REGEXP_ABBR[] = "rteb";
! 256:
! 257: /*
! 258: * Forward declarations for vim_regcomp()'s friends.
! 259: */
! 260: static void initchr __ARGS((char_u *));
! 261: static int getchr __ARGS((void));
! 262: static int peekchr __ARGS((void));
! 263: #define PeekChr() curchr /* shortcut only when last action was peekchr() */
! 264: static int curchr;
! 265: static void skipchr __ARGS((void));
! 266: static void ungetchr __ARGS((void));
! 267: static char_u *reg __ARGS((int, int *));
! 268: static char_u *regbranch __ARGS((int *));
! 269: static char_u *regpiece __ARGS((int *));
! 270: static char_u *regatom __ARGS((int *));
! 271: static char_u *regnode __ARGS((int));
! 272: static char_u *regnext __ARGS((char_u *));
! 273: static void regc __ARGS((int));
! 274: static void unregc __ARGS((void));
! 275: static void reginsert __ARGS((int, char_u *));
! 276: static void regtail __ARGS((char_u *, char_u *));
! 277: static void regoptail __ARGS((char_u *, char_u *));
! 278:
! 279: #ifndef HAVE_STRCSPN
! 280: static size_t strcspn __ARGS((const char_u *, const char_u *));
! 281: #endif
! 282:
! 283: /*
! 284: * skip past regular expression
! 285: * stop at end of 'p' of where 'dirc' is found ('/', '?', etc)
! 286: * take care of characters with a backslash in front of it
! 287: * skip strings inside [ and ].
! 288: */
! 289: char_u *
! 290: skip_regexp(p, dirc)
! 291: char_u *p;
! 292: int dirc;
! 293: {
! 294: int in_range = FALSE;
! 295:
! 296: for (; p[0] != NUL; ++p)
! 297: {
! 298: if (p[0] == dirc && !in_range) /* found end of regexp */
! 299: break;
! 300: if ((p[0] == '[' && p_magic) || (p[0] == '\\' && p[1] == '[' && !p_magic))
! 301: {
! 302: in_range = TRUE;
! 303: if (p[0] == '\\')
! 304: ++p;
! 305: /* "[^]" and "[]" are not the end of a range */
! 306: if (p[1] == '^')
! 307: ++p;
! 308: if (p[1] == ']')
! 309: ++p;
! 310: }
! 311: else if (p[0] == '\\' && p[1] != NUL)
! 312: ++p; /* skip next character */
! 313: else if (p[0] == ']')
! 314: in_range = FALSE;
! 315: }
! 316: return p;
! 317: }
! 318:
! 319: /*
! 320: - vim_regcomp - compile a regular expression into internal code
! 321: *
! 322: * We can't allocate space until we know how big the compiled form will be,
! 323: * but we can't compile it (and thus know how big it is) until we've got a
! 324: * place to put the code. So we cheat: we compile it twice, once with code
! 325: * generation turned off and size counting turned on, and once "for real".
! 326: * This also means that we don't allocate space until we are sure that the
! 327: * thing really will compile successfully, and we never have to move the
! 328: * code and thus invalidate pointers into it. (Note that it has to be in
! 329: * one piece because vim_free() must be able to free it all.)
! 330: *
! 331: * Beware that the optimization-preparation code in here knows about some
! 332: * of the structure of the compiled regexp.
! 333: */
! 334: regexp *
! 335: vim_regcomp(exp)
! 336: char_u *exp;
! 337: {
! 338: register regexp *r;
! 339: register char_u *scan;
! 340: register char_u *longest;
! 341: register int len;
! 342: int flags;
! 343: /* extern char *malloc();*/
! 344:
! 345: if (exp == NULL)
! 346: EMSG_RETURN(e_null);
! 347:
! 348: #ifdef EMPTY_RE /* this is done outside of regexp */
! 349: if (*exp == '\0')
! 350: {
! 351: if (reg_prev_re)
! 352: exp = reg_prev_re;
! 353: else
! 354: EMSG_RETURN(e_noprevre);
! 355: }
! 356: #endif
! 357:
! 358: /* First pass: determine size, legality. */
! 359: initchr((char_u *)exp);
! 360: regnpar = 1;
! 361: regsize = 0L;
! 362: regcode = ®dummy;
! 363: regendp = NULL;
! 364: regc(MAGIC);
! 365: if (reg(0, &flags) == NULL)
! 366: return NULL;
! 367:
! 368: /* Small enough for pointer-storage convention? */
! 369: if (regsize >= 32767L) /* Probably could be 65535L. */
! 370: EMSG_RETURN(e_toolong);
! 371:
! 372: /* Allocate space. */
! 373: /* r = (regexp *) malloc((unsigned) (sizeof(regexp) + regsize));*/
! 374: r = (regexp *) alloc((unsigned) (sizeof(regexp) + regsize));
! 375: if (r == NULL)
! 376: return NULL;
! 377:
! 378: #ifdef EMPTY_RE /* this is done outside of regexp */
! 379: if (exp != reg_prev_re) {
! 380: vim_free(reg_prev_re);
! 381: if (reg_prev_re = alloc(STRLEN(exp) + 1))
! 382: STRCPY(reg_prev_re, exp);
! 383: }
! 384: #endif
! 385:
! 386: /* Second pass: emit code. */
! 387: initchr((char_u *)exp);
! 388: regnpar = 1;
! 389: regcode = r->program;
! 390: regendp = r->endp;
! 391: regc(MAGIC);
! 392: if (reg(0, &flags) == NULL) {
! 393: vim_free(r);
! 394: return NULL;
! 395: }
! 396:
! 397: /* Dig out information for optimizations. */
! 398: r->regstart = '\0'; /* Worst-case defaults. */
! 399: r->reganch = 0;
! 400: r->regmust = NULL;
! 401: r->regmlen = 0;
! 402: scan = r->program + 1; /* First BRANCH. */
! 403: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
! 404: scan = OPERAND(scan);
! 405:
! 406: /* Starting-point info. */
! 407: if (OP(scan) == EXACTLY)
! 408: r->regstart = *OPERAND(scan);
! 409: else if (OP(scan) == BOL)
! 410: r->reganch++;
! 411:
! 412: /*
! 413: * If there's something expensive in the r.e., find the longest
! 414: * literal string that must appear and make it the regmust. Resolve
! 415: * ties in favor of later strings, since the regstart check works
! 416: * with the beginning of the r.e. and avoiding duplication
! 417: * strengthens checking. Not a strong reason, but sufficient in the
! 418: * absence of others.
! 419: */
! 420: /*
! 421: * When the r.e. starts with BOW, it is faster to look for a regmust
! 422: * first. Used a lot for "#" and "*" commands. (Added by mool).
! 423: */
! 424: if (flags & SPSTART || OP(scan) == BOW) {
! 425: longest = NULL;
! 426: len = 0;
! 427: for (; scan != NULL; scan = regnext(scan))
! 428: if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len) {
! 429: longest = OPERAND(scan);
! 430: len = STRLEN(OPERAND(scan));
! 431: }
! 432: r->regmust = longest;
! 433: r->regmlen = len;
! 434: }
! 435: }
! 436: #ifdef DEBUG
! 437: regdump(r);
! 438: #endif
! 439: return r;
! 440: }
! 441:
! 442: /*
! 443: - reg - regular expression, i.e. main body or parenthesized thing
! 444: *
! 445: * Caller must absorb opening parenthesis.
! 446: *
! 447: * Combining parenthesis handling with the base level of regular expression
! 448: * is a trifle forced, but the need to tie the tails of the branches to what
! 449: * follows makes it hard to avoid.
! 450: */
! 451: static char_u *
! 452: reg(paren, flagp)
! 453: int paren; /* Parenthesized? */
! 454: int *flagp;
! 455: {
! 456: register char_u *ret;
! 457: register char_u *br;
! 458: register char_u *ender;
! 459: register int parno = 0;
! 460: int flags;
! 461:
! 462: *flagp = HASWIDTH; /* Tentatively. */
! 463:
! 464: /* Make an MOPEN node, if parenthesized. */
! 465: if (paren) {
! 466: if (regnpar >= NSUBEXP)
! 467: EMSG_RETURN(e_toombra);
! 468: parno = regnpar;
! 469: regnpar++;
! 470: ret = regnode(MOPEN + parno);
! 471: if (regendp)
! 472: regendp[parno] = NULL; /* haven't seen the close paren yet */
! 473: } else
! 474: ret = NULL;
! 475:
! 476: /* Pick up the branches, linking them together. */
! 477: br = regbranch(&flags);
! 478: if (br == NULL)
! 479: return NULL;
! 480: if (ret != NULL)
! 481: regtail(ret, br); /* MOPEN -> first. */
! 482: else
! 483: ret = br;
! 484: if (!(flags & HASWIDTH))
! 485: *flagp &= ~HASWIDTH;
! 486: *flagp |= flags & SPSTART;
! 487: while (peekchr() == Magic('|')) {
! 488: skipchr();
! 489: br = regbranch(&flags);
! 490: if (br == NULL)
! 491: return NULL;
! 492: regtail(ret, br); /* BRANCH -> BRANCH. */
! 493: if (!(flags & HASWIDTH))
! 494: *flagp &= ~HASWIDTH;
! 495: *flagp |= flags & SPSTART;
! 496: }
! 497:
! 498: /* Make a closing node, and hook it on the end. */
! 499: ender = regnode((paren) ? MCLOSE + parno : END);
! 500: regtail(ret, ender);
! 501:
! 502: /* Hook the tails of the branches to the closing node. */
! 503: for (br = ret; br != NULL; br = regnext(br))
! 504: regoptail(br, ender);
! 505:
! 506: /* Check for proper termination. */
! 507: if (paren && getchr() != Magic(')'))
! 508: EMSG_RETURN(e_toombra)
! 509: else if (!paren && peekchr() != '\0')
! 510: {
! 511: if (PeekChr() == Magic(')'))
! 512: EMSG_RETURN(e_toomket)
! 513: else
! 514: EMSG_RETURN(e_trailing) /* "Can't happen". */
! 515: /* NOTREACHED */
! 516: }
! 517: /*
! 518: * Here we set the flag allowing back references to this set of
! 519: * parentheses.
! 520: */
! 521: if (paren && regendp)
! 522: regendp[parno] = ender; /* have seen the close paren */
! 523: return ret;
! 524: }
! 525:
! 526: /*
! 527: - regbranch - one alternative of an | operator
! 528: *
! 529: * Implements the concatenation operator.
! 530: */
! 531: static char_u *
! 532: regbranch(flagp)
! 533: int *flagp;
! 534: {
! 535: register char_u *ret;
! 536: register char_u *chain;
! 537: register char_u *latest;
! 538: int flags;
! 539:
! 540: *flagp = WORST; /* Tentatively. */
! 541:
! 542: ret = regnode(BRANCH);
! 543: chain = NULL;
! 544: while (peekchr() != '\0' && PeekChr() != Magic('|') && PeekChr() != Magic(')')) {
! 545: latest = regpiece(&flags);
! 546: if (latest == NULL)
! 547: return NULL;
! 548: *flagp |= flags & HASWIDTH;
! 549: if (chain == NULL) /* First piece. */
! 550: *flagp |= flags & SPSTART;
! 551: else
! 552: regtail(chain, latest);
! 553: chain = latest;
! 554: }
! 555: if (chain == NULL) /* Loop ran zero times. */
! 556: (void) regnode(NOTHING);
! 557:
! 558: return ret;
! 559: }
! 560:
! 561: /*
! 562: - regpiece - something followed by possible [*+=]
! 563: *
! 564: * Note that the branching code sequences used for = and the general cases
! 565: * of * and + are somewhat optimized: they use the same NOTHING node as
! 566: * both the endmarker for their branch list and the body of the last branch.
! 567: * It might seem that this node could be dispensed with entirely, but the
! 568: * endmarker role is not redundant.
! 569: */
! 570: static char_u *
! 571: regpiece(flagp)
! 572: int *flagp;
! 573: {
! 574: register char_u *ret;
! 575: register int op;
! 576: register char_u *next;
! 577: int flags;
! 578:
! 579: ret = regatom(&flags);
! 580: if (ret == NULL)
! 581: return NULL;
! 582:
! 583: op = peekchr();
! 584: if (!ismult(op)) {
! 585: *flagp = flags;
! 586: return ret;
! 587: }
! 588: if (!(flags & HASWIDTH) && op != Magic('='))
! 589: EMSG_RETURN((char_u *)"*+ operand could be empty");
! 590: *flagp = (op != Magic('+')) ? (WORST | SPSTART) : (WORST | HASWIDTH);
! 591:
! 592: if (op == Magic('*') && (flags & SIMPLE))
! 593: reginsert(STAR, ret);
! 594: else if (op == Magic('*')) {
! 595: /* Emit x* as (x&|), where & means "self". */
! 596: reginsert(BRANCH, ret); /* Either x */
! 597: regoptail(ret, regnode(BACK)); /* and loop */
! 598: regoptail(ret, ret); /* back */
! 599: regtail(ret, regnode(BRANCH)); /* or */
! 600: regtail(ret, regnode(NOTHING)); /* null. */
! 601: } else if (op == Magic('+') && (flags & SIMPLE))
! 602: reginsert(PLUS, ret);
! 603: else if (op == Magic('+')) {
! 604: /* Emit x+ as x(&|), where & means "self". */
! 605: next = regnode(BRANCH); /* Either */
! 606: regtail(ret, next);
! 607: regtail(regnode(BACK), ret); /* loop back */
! 608: regtail(next, regnode(BRANCH)); /* or */
! 609: regtail(ret, regnode(NOTHING)); /* null. */
! 610: } else if (op == Magic('=')) {
! 611: /* Emit x= as (x|) */
! 612: reginsert(BRANCH, ret); /* Either x */
! 613: regtail(ret, regnode(BRANCH)); /* or */
! 614: next = regnode(NOTHING);/* null. */
! 615: regtail(ret, next);
! 616: regoptail(ret, next);
! 617: }
! 618: skipchr();
! 619: if (ismult(peekchr()))
! 620: EMSG_RETURN((char_u *)"Nested *=+");
! 621:
! 622: return ret;
! 623: }
! 624:
! 625: /*
! 626: - regatom - the lowest level
! 627: *
! 628: * Optimization: gobbles an entire sequence of ordinary characters so that
! 629: * it can turn them into a single node, which is smaller to store and
! 630: * faster to run.
! 631: */
! 632: static char_u *
! 633: regatom(flagp)
! 634: int *flagp;
! 635: {
! 636: register char_u *ret;
! 637: int flags;
! 638:
! 639: *flagp = WORST; /* Tentatively. */
! 640:
! 641: switch (getchr()) {
! 642: case Magic('^'):
! 643: ret = regnode(BOL);
! 644: break;
! 645: case Magic('$'):
! 646: ret = regnode(EOL);
! 647: break;
! 648: case Magic('<'):
! 649: ret = regnode(BOW);
! 650: break;
! 651: case Magic('>'):
! 652: ret = regnode(EOW);
! 653: break;
! 654: case Magic('.'):
! 655: ret = regnode(ANY);
! 656: *flagp |= HASWIDTH | SIMPLE;
! 657: break;
! 658: case Magic('i'):
! 659: ret = regnode(IDENT);
! 660: *flagp |= HASWIDTH | SIMPLE;
! 661: break;
! 662: case Magic('k'):
! 663: ret = regnode(WORD);
! 664: *flagp |= HASWIDTH | SIMPLE;
! 665: break;
! 666: case Magic('I'):
! 667: ret = regnode(SIDENT);
! 668: *flagp |= HASWIDTH | SIMPLE;
! 669: break;
! 670: case Magic('K'):
! 671: ret = regnode(SWORD);
! 672: *flagp |= HASWIDTH | SIMPLE;
! 673: break;
! 674: case Magic('f'):
! 675: ret = regnode(FNAME);
! 676: *flagp |= HASWIDTH | SIMPLE;
! 677: break;
! 678: case Magic('F'):
! 679: ret = regnode(SFNAME);
! 680: *flagp |= HASWIDTH | SIMPLE;
! 681: break;
! 682: case Magic('p'):
! 683: ret = regnode(PRINT);
! 684: *flagp |= HASWIDTH | SIMPLE;
! 685: break;
! 686: case Magic('P'):
! 687: ret = regnode(SPRINT);
! 688: *flagp |= HASWIDTH | SIMPLE;
! 689: break;
! 690: case Magic('('):
! 691: ret = reg(1, &flags);
! 692: if (ret == NULL)
! 693: return NULL;
! 694: *flagp |= flags & (HASWIDTH | SPSTART);
! 695: break;
! 696: case '\0':
! 697: case Magic('|'):
! 698: case Magic(')'):
! 699: EMSG_RETURN(e_internal) /* Supposed to be caught earlier. */
! 700: /* NOTREACHED */
! 701: case Magic('='):
! 702: EMSG_RETURN((char_u *)"\\= follows nothing")
! 703: /* NOTREACHED */
! 704: case Magic('+'):
! 705: EMSG_RETURN((char_u *)"\\+ follows nothing")
! 706: /* NOTREACHED */
! 707: case Magic('*'):
! 708: if (reg_magic)
! 709: EMSG_RETURN((char_u *)"* follows nothing")
! 710: else
! 711: EMSG_RETURN((char_u *)"\\* follows nothing")
! 712: /* break; Not Reached */
! 713: #ifdef TILDE
! 714: case Magic('~'): /* previous substitute pattern */
! 715: if (reg_prev_sub) {
! 716: register char_u *p;
! 717:
! 718: ret = regnode(EXACTLY);
! 719: p = reg_prev_sub;
! 720: while (*p) {
! 721: regc(*p++);
! 722: }
! 723: regc('\0');
! 724: if (p - reg_prev_sub) {
! 725: *flagp |= HASWIDTH;
! 726: if ((p - reg_prev_sub) == 1)
! 727: *flagp |= SIMPLE;
! 728: }
! 729: } else
! 730: EMSG_RETURN(e_nopresub);
! 731: break;
! 732: #endif
! 733: #ifdef BACKREF
! 734: case Magic('1'):
! 735: case Magic('2'):
! 736: case Magic('3'):
! 737: case Magic('4'):
! 738: case Magic('5'):
! 739: case Magic('6'):
! 740: case Magic('7'):
! 741: case Magic('8'):
! 742: case Magic('9'): {
! 743: int refnum;
! 744:
! 745: ungetchr();
! 746: refnum = getchr() - Magic('0');
! 747: /*
! 748: * Check if the back reference is legal. We use the parentheses
! 749: * pointers to mark encountered close parentheses, but this
! 750: * is only available in the second pass. Checking opens is
! 751: * always possible.
! 752: * Should also check that we don't refer to something that
! 753: * is repeated (+*=): what instance of the repetition should
! 754: * we match? TODO.
! 755: */
! 756: if (refnum < regnpar &&
! 757: (regendp == NULL || regendp[refnum] != NULL))
! 758: ret = regnode(BACKREF + refnum);
! 759: else
! 760: EMSG_RETURN((char_u *)"Illegal back reference");
! 761: }
! 762: break;
! 763: #endif
! 764: case Magic('['):
! 765: {
! 766: char_u *p;
! 767:
! 768: /*
! 769: * If there is no matching ']', we assume the '[' is a normal
! 770: * character. This makes ":help [" work.
! 771: */
! 772: p = regparse;
! 773: if (*p == '^') /* Complement of range. */
! 774: ++p;
! 775: if (*p == ']' || *p == '-')
! 776: ++p;
! 777: while (*p != '\0' && *p != ']')
! 778: {
! 779: if (*p == '-')
! 780: {
! 781: ++p;
! 782: if (*p != ']' && *p != '\0')
! 783: ++p;
! 784: }
! 785: else if (*p == '\\' && p[1] != '\0')
! 786: p += 2;
! 787: else
! 788: ++p;
! 789: }
! 790: if (*p == ']') /* there is a matching ']' */
! 791: {
! 792: /*
! 793: * In a character class, different parsing rules apply.
! 794: * Not even \ is special anymore, nothing is.
! 795: */
! 796: if (*regparse == '^') { /* Complement of range. */
! 797: ret = regnode(ANYBUT);
! 798: regparse++;
! 799: } else
! 800: ret = regnode(ANYOF);
! 801: if (*regparse == ']' || *regparse == '-')
! 802: regc(*regparse++);
! 803: while (*regparse != '\0' && *regparse != ']') {
! 804: if (*regparse == '-') {
! 805: regparse++;
! 806: if (*regparse == ']' || *regparse == '\0')
! 807: regc('-');
! 808: else {
! 809: register int cclass;
! 810: register int cclassend;
! 811:
! 812: cclass = UCHARAT(regparse - 2) + 1;
! 813: cclassend = UCHARAT(regparse);
! 814: if (cclass > cclassend + 1)
! 815: EMSG_RETURN(e_invrange);
! 816: for (; cclass <= cclassend; cclass++)
! 817: regc(cclass);
! 818: regparse++;
! 819: }
! 820: } else if (*regparse == '\\' && regparse[1]) {
! 821: regparse++;
! 822: regc(*regparse++);
! 823: } else
! 824: regc(*regparse++);
! 825: }
! 826: regc('\0');
! 827: if (*regparse != ']')
! 828: EMSG_RETURN(e_toomsbra);
! 829: skipchr(); /* let's be friends with the lexer again */
! 830: *flagp |= HASWIDTH | SIMPLE;
! 831: break;
! 832: }
! 833: }
! 834: /* FALLTHROUGH */
! 835:
! 836: default:
! 837: {
! 838: register int len;
! 839: int chr;
! 840:
! 841: ungetchr();
! 842: len = 0;
! 843: ret = regnode(EXACTLY);
! 844: /*
! 845: * Always take at least one character, for '[' without matching
! 846: * ']'.
! 847: */
! 848: while ((chr = peekchr()) != '\0' && (chr < Magic(0) || len == 0))
! 849: {
! 850: regc(chr);
! 851: skipchr();
! 852: len++;
! 853: }
! 854: #ifdef DEBUG
! 855: if (len == 0)
! 856: EMSG_RETURN((char_u *)"Unexpected magic character; check META.");
! 857: #endif
! 858: /*
! 859: * If there is a following *, \+ or \= we need the character
! 860: * in front of it as a single character operand
! 861: */
! 862: if (len > 1 && ismult(chr))
! 863: {
! 864: unregc(); /* Back off of *+= operand */
! 865: ungetchr(); /* and put it back for next time */
! 866: --len;
! 867: }
! 868: regc('\0');
! 869: *flagp |= HASWIDTH;
! 870: if (len == 1)
! 871: *flagp |= SIMPLE;
! 872: }
! 873: break;
! 874: }
! 875:
! 876: return ret;
! 877: }
! 878:
! 879: /*
! 880: - regnode - emit a node
! 881: */
! 882: static char_u * /* Location. */
! 883: regnode(op)
! 884: int op;
! 885: {
! 886: register char_u *ret;
! 887: register char_u *ptr;
! 888:
! 889: ret = regcode;
! 890: if (ret == ®dummy) {
! 891: regsize += 3;
! 892: return ret;
! 893: }
! 894: ptr = ret;
! 895: *ptr++ = op;
! 896: *ptr++ = '\0'; /* Null "next" pointer. */
! 897: *ptr++ = '\0';
! 898: regcode = ptr;
! 899:
! 900: return ret;
! 901: }
! 902:
! 903: /*
! 904: - regc - emit (if appropriate) a byte of code
! 905: */
! 906: static void
! 907: regc(b)
! 908: int b;
! 909: {
! 910: if (regcode != ®dummy)
! 911: *regcode++ = b;
! 912: else
! 913: regsize++;
! 914: }
! 915:
! 916: /*
! 917: - unregc - take back (if appropriate) a byte of code
! 918: */
! 919: static void
! 920: unregc()
! 921: {
! 922: if (regcode != ®dummy)
! 923: regcode--;
! 924: else
! 925: regsize--;
! 926: }
! 927:
! 928: /*
! 929: - reginsert - insert an operator in front of already-emitted operand
! 930: *
! 931: * Means relocating the operand.
! 932: */
! 933: static void
! 934: reginsert(op, opnd)
! 935: int op;
! 936: char_u *opnd;
! 937: {
! 938: register char_u *src;
! 939: register char_u *dst;
! 940: register char_u *place;
! 941:
! 942: if (regcode == ®dummy) {
! 943: regsize += 3;
! 944: return;
! 945: }
! 946: src = regcode;
! 947: regcode += 3;
! 948: dst = regcode;
! 949: while (src > opnd)
! 950: *--dst = *--src;
! 951:
! 952: place = opnd; /* Op node, where operand used to be. */
! 953: *place++ = op;
! 954: *place++ = '\0';
! 955: *place = '\0';
! 956: }
! 957:
! 958: /*
! 959: - regtail - set the next-pointer at the end of a node chain
! 960: */
! 961: static void
! 962: regtail(p, val)
! 963: char_u *p;
! 964: char_u *val;
! 965: {
! 966: register char_u *scan;
! 967: register char_u *temp;
! 968: register int offset;
! 969:
! 970: if (p == ®dummy)
! 971: return;
! 972:
! 973: /* Find last node. */
! 974: scan = p;
! 975: for (;;) {
! 976: temp = regnext(scan);
! 977: if (temp == NULL)
! 978: break;
! 979: scan = temp;
! 980: }
! 981:
! 982: if (OP(scan) == BACK)
! 983: offset = (int)(scan - val);
! 984: else
! 985: offset = (int)(val - scan);
! 986: *(scan + 1) = (char_u) ((offset >> 8) & 0377);
! 987: *(scan + 2) = (char_u) (offset & 0377);
! 988: }
! 989:
! 990: /*
! 991: - regoptail - regtail on operand of first argument; nop if operandless
! 992: */
! 993: static void
! 994: regoptail(p, val)
! 995: char_u *p;
! 996: char_u *val;
! 997: {
! 998: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
! 999: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
! 1000: return;
! 1001: regtail(OPERAND(p), val);
! 1002: }
! 1003:
! 1004: /*
! 1005: - getchr() - get the next character from the pattern. We know about
! 1006: * magic and such, so therefore we need a lexical analyzer.
! 1007: */
! 1008:
! 1009: /* static int curchr; */
! 1010: static int prevchr;
! 1011: static int nextchr; /* used for ungetchr() */
! 1012: /*
! 1013: * Note: prevchr is sometimes -1 when we are not at the start,
! 1014: * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
! 1015: * taken to be magic -- webb
! 1016: */
! 1017: static int at_start; /* True when we are on the first character */
! 1018:
! 1019: static void
! 1020: initchr(str)
! 1021: char_u *str;
! 1022: {
! 1023: regparse = str;
! 1024: curchr = prevchr = nextchr = -1;
! 1025: at_start = TRUE;
! 1026: }
! 1027:
! 1028: static int
! 1029: peekchr()
! 1030: {
! 1031: if (curchr < 0) {
! 1032: switch (curchr = regparse[0]) {
! 1033: case '.':
! 1034: /* case '+':*/
! 1035: /* case '=':*/
! 1036: case '[':
! 1037: case '~':
! 1038: if (reg_magic)
! 1039: curchr = Magic(curchr);
! 1040: break;
! 1041: case '*':
! 1042: /* * is not magic as the very first character, eg "?*ptr" */
! 1043: if (reg_magic && !at_start)
! 1044: curchr = Magic('*');
! 1045: break;
! 1046: case '^':
! 1047: /* ^ is only magic as the very first character */
! 1048: if (at_start)
! 1049: curchr = Magic('^');
! 1050: break;
! 1051: case '$':
! 1052: /* $ is only magic as the very last character and in front of '\|' */
! 1053: if (regparse[1] == NUL || (regparse[1] == '\\' && regparse[2] == '|'))
! 1054: curchr = Magic('$');
! 1055: break;
! 1056: case '\\':
! 1057: regparse++;
! 1058: if (regparse[0] == NUL)
! 1059: curchr = '\\'; /* trailing '\' */
! 1060: else if (vim_strchr(META, regparse[0]))
! 1061: {
! 1062: /*
! 1063: * META contains everything that may be magic sometimes, except
! 1064: * ^ and $ ("\^" and "\$" are never magic).
! 1065: * We now fetch the next character and toggle its magicness.
! 1066: * Therefore, \ is so meta-magic that it is not in META.
! 1067: */
! 1068: curchr = -1;
! 1069: at_start = FALSE; /* We still want to be able to say "/\*ptr" */
! 1070: peekchr();
! 1071: curchr ^= Magic(0);
! 1072: }
! 1073: else if (vim_strchr(REGEXP_ABBR, regparse[0]))
! 1074: {
! 1075: /*
! 1076: * Handle abbreviations, like "\t" for TAB -- webb
! 1077: */
! 1078: switch (regparse[0])
! 1079: {
! 1080: case 'r': curchr = CR; break;
! 1081: case 't': curchr = TAB; break;
! 1082: case 'e': curchr = ESC; break;
! 1083: case 'b': curchr = Ctrl('H'); break;
! 1084: }
! 1085: }
! 1086: else
! 1087: {
! 1088: /*
! 1089: * Next character can never be (made) magic?
! 1090: * Then backslashing it won't do anything.
! 1091: */
! 1092: curchr = regparse[0];
! 1093: }
! 1094: break;
! 1095: }
! 1096: }
! 1097:
! 1098: return curchr;
! 1099: }
! 1100:
! 1101: static void
! 1102: skipchr()
! 1103: {
! 1104: regparse++;
! 1105: at_start = FALSE;
! 1106: prevchr = curchr;
! 1107: curchr = nextchr; /* use previously unget char, or -1 */
! 1108: nextchr = -1;
! 1109: }
! 1110:
! 1111: static int
! 1112: getchr()
! 1113: {
! 1114: int chr;
! 1115:
! 1116: chr = peekchr();
! 1117: skipchr();
! 1118:
! 1119: return chr;
! 1120: }
! 1121:
! 1122: /*
! 1123: * put character back. Works only once!
! 1124: */
! 1125: static void
! 1126: ungetchr()
! 1127: {
! 1128: nextchr = curchr;
! 1129: curchr = prevchr;
! 1130: /*
! 1131: * Backup regparse as well; not because we will use what it points at,
! 1132: * but because skipchr() will bump it again.
! 1133: */
! 1134: regparse--;
! 1135: }
! 1136:
! 1137: /*
! 1138: * vim_regexec and friends
! 1139: */
! 1140:
! 1141: /*
! 1142: * Global work variables for vim_regexec().
! 1143: */
! 1144: static char_u *reginput; /* String-input pointer. */
! 1145: static char_u *regbol; /* Beginning of input, for ^ check. */
! 1146: static char_u **regstartp; /* Pointer to startp array. */
! 1147: /* static char_u **regendp; */ /* Ditto for endp. */
! 1148:
! 1149: /*
! 1150: * Forwards.
! 1151: */
! 1152: static int regtry __ARGS((regexp *, char_u *));
! 1153: static int regmatch __ARGS((char_u *));
! 1154: static int regrepeat __ARGS((char_u *));
! 1155:
! 1156: #ifdef DEBUG
! 1157: int regnarrate = 1;
! 1158: void regdump __ARGS((regexp *));
! 1159: static char_u *regprop __ARGS((char_u *));
! 1160: #endif
! 1161:
! 1162: /*
! 1163: * vim_regexec - match a regexp against a string
! 1164: * Return non-zero if there is a match.
! 1165: */
! 1166: int
! 1167: vim_regexec(prog, string, at_bol)
! 1168: register regexp *prog;
! 1169: register char_u *string;
! 1170: int at_bol;
! 1171: {
! 1172: register char_u *s;
! 1173:
! 1174: /* Be paranoid... */
! 1175: if (prog == NULL || string == NULL) {
! 1176: emsg(e_null);
! 1177: rc_did_emsg = TRUE;
! 1178: return 0;
! 1179: }
! 1180: /* Check validity of program. */
! 1181: if (UCHARAT(prog->program) != MAGIC) {
! 1182: emsg(e_re_corr);
! 1183: rc_did_emsg = TRUE;
! 1184: return 0;
! 1185: }
! 1186: /* If there is a "must appear" string, look for it. */
! 1187: if (prog->regmust != NULL) {
! 1188: s = string;
! 1189: while ((s = cstrchr(s, prog->regmust[0])) != NULL) {
! 1190: if (cstrncmp(s, prog->regmust, prog->regmlen) == 0)
! 1191: break; /* Found it. */
! 1192: s++;
! 1193: }
! 1194: if (s == NULL) /* Not present. */
! 1195: return 0;
! 1196: }
! 1197: /* Mark beginning of line for ^ . */
! 1198: if (at_bol)
! 1199: regbol = string; /* is possible to match bol */
! 1200: else
! 1201: regbol = NULL; /* we aren't there, so don't match it */
! 1202:
! 1203: /* Simplest case: anchored match need be tried only once. */
! 1204: if (prog->reganch)
! 1205: return regtry(prog, string);
! 1206:
! 1207: /* Messy cases: unanchored match. */
! 1208: s = string;
! 1209: if (prog->regstart != '\0')
! 1210: /* We know what char it must start with. */
! 1211: while ((s = cstrchr(s, prog->regstart)) != NULL) {
! 1212: if (regtry(prog, s))
! 1213: return 1;
! 1214: s++;
! 1215: }
! 1216: else
! 1217: /* We don't -- general case. */
! 1218: do {
! 1219: if (regtry(prog, s))
! 1220: return 1;
! 1221: } while (*s++ != '\0');
! 1222:
! 1223: /* Failure. */
! 1224: return 0;
! 1225: }
! 1226:
! 1227: /*
! 1228: - regtry - try match at specific point
! 1229: */
! 1230: static int /* 0 failure, 1 success */
! 1231: regtry(prog, string)
! 1232: regexp *prog;
! 1233: char_u *string;
! 1234: {
! 1235: register int i;
! 1236: register char_u **sp;
! 1237: register char_u **ep;
! 1238:
! 1239: reginput = string;
! 1240: regstartp = prog->startp;
! 1241: regendp = prog->endp;
! 1242:
! 1243: sp = prog->startp;
! 1244: ep = prog->endp;
! 1245: for (i = NSUBEXP; i > 0; i--) {
! 1246: *sp++ = NULL;
! 1247: *ep++ = NULL;
! 1248: }
! 1249: if (regmatch(prog->program + 1)) {
! 1250: prog->startp[0] = string;
! 1251: prog->endp[0] = reginput;
! 1252: return 1;
! 1253: } else
! 1254: return 0;
! 1255: }
! 1256:
! 1257: /*
! 1258: - regmatch - main matching routine
! 1259: *
! 1260: * Conceptually the strategy is simple: check to see whether the current
! 1261: * node matches, call self recursively to see whether the rest matches,
! 1262: * and then act accordingly. In practice we make some effort to avoid
! 1263: * recursion, in particular by going through "ordinary" nodes (that don't
! 1264: * need to know whether the rest of the match failed) by a loop instead of
! 1265: * by recursion.
! 1266: */
! 1267: static int /* 0 failure, 1 success */
! 1268: regmatch(prog)
! 1269: char_u *prog;
! 1270: {
! 1271: register char_u *scan; /* Current node. */
! 1272: char_u *next; /* Next node. */
! 1273:
! 1274: scan = prog;
! 1275: #ifdef DEBUG
! 1276: if (scan != NULL && regnarrate)
! 1277: fprintf(stderr, "%s(\n", regprop(scan));
! 1278: #endif
! 1279: while (scan != NULL) {
! 1280: #ifdef DEBUG
! 1281: if (regnarrate) {
! 1282: fprintf(stderr, "%s...\n", regprop(scan));
! 1283: }
! 1284: #endif
! 1285: next = regnext(scan);
! 1286: switch (OP(scan)) {
! 1287: case BOL:
! 1288: if (reginput != regbol)
! 1289: return 0;
! 1290: break;
! 1291: case EOL:
! 1292: if (*reginput != '\0')
! 1293: return 0;
! 1294: break;
! 1295: case BOW: /* \<word; reginput points to w */
! 1296: if (reginput != regbol && iswordchar(reginput[-1]))
! 1297: return 0;
! 1298: if (!reginput[0] || !iswordchar(reginput[0]))
! 1299: return 0;
! 1300: break;
! 1301: case EOW: /* word\>; reginput points after d */
! 1302: if (reginput == regbol || !iswordchar(reginput[-1]))
! 1303: return 0;
! 1304: if (reginput[0] && iswordchar(reginput[0]))
! 1305: return 0;
! 1306: break;
! 1307: case ANY:
! 1308: if (*reginput == '\0')
! 1309: return 0;
! 1310: reginput++;
! 1311: break;
! 1312: case IDENT:
! 1313: if (!isidchar(*reginput))
! 1314: return 0;
! 1315: reginput++;
! 1316: break;
! 1317: case WORD:
! 1318: if (!iswordchar(*reginput))
! 1319: return 0;
! 1320: reginput++;
! 1321: break;
! 1322: case FNAME:
! 1323: if (!isfilechar(*reginput))
! 1324: return 0;
! 1325: reginput++;
! 1326: break;
! 1327: case PRINT:
! 1328: if (charsize(*reginput) != 1)
! 1329: return 0;
! 1330: reginput++;
! 1331: break;
! 1332: case SIDENT:
! 1333: if (isdigit(*reginput) || !isidchar(*reginput))
! 1334: return 0;
! 1335: reginput++;
! 1336: break;
! 1337: case SWORD:
! 1338: if (isdigit(*reginput) || !iswordchar(*reginput))
! 1339: return 0;
! 1340: reginput++;
! 1341: break;
! 1342: case SFNAME:
! 1343: if (isdigit(*reginput) || !isfilechar(*reginput))
! 1344: return 0;
! 1345: reginput++;
! 1346: break;
! 1347: case SPRINT:
! 1348: if (isdigit(*reginput) || charsize(*reginput) != 1)
! 1349: return 0;
! 1350: reginput++;
! 1351: break;
! 1352: case EXACTLY:{
! 1353: register int len;
! 1354: register char_u *opnd;
! 1355:
! 1356: opnd = OPERAND(scan);
! 1357: /* Inline the first character, for speed. */
! 1358: if (*opnd != *reginput && (!reg_ic || TO_UPPER(*opnd) != TO_UPPER(*reginput)))
! 1359: return 0;
! 1360: len = STRLEN(opnd);
! 1361: if (len > 1 && cstrncmp(opnd, reginput, len) != 0)
! 1362: return 0;
! 1363: reginput += len;
! 1364: }
! 1365: break;
! 1366: case ANYOF:
! 1367: if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) == NULL)
! 1368: return 0;
! 1369: reginput++;
! 1370: break;
! 1371: case ANYBUT:
! 1372: if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) != NULL)
! 1373: return 0;
! 1374: reginput++;
! 1375: break;
! 1376: case NOTHING:
! 1377: break;
! 1378: case BACK:
! 1379: break;
! 1380: case MOPEN + 1:
! 1381: case MOPEN + 2:
! 1382: case MOPEN + 3:
! 1383: case MOPEN + 4:
! 1384: case MOPEN + 5:
! 1385: case MOPEN + 6:
! 1386: case MOPEN + 7:
! 1387: case MOPEN + 8:
! 1388: case MOPEN + 9:{
! 1389: register int no;
! 1390: register char_u *save;
! 1391:
! 1392: no = OP(scan) - MOPEN;
! 1393: save = regstartp[no];
! 1394: regstartp[no] = reginput; /* Tentatively */
! 1395: #ifdef DEBUG
! 1396: printf("MOPEN %d pre @'%s' ('%s' )'%s'\n",
! 1397: no, save,
! 1398: regstartp[no] ? regstartp[no] : "NULL",
! 1399: regendp[no] ? regendp[no] : "NULL");
! 1400: #endif
! 1401:
! 1402: if (regmatch(next)) {
! 1403: #ifdef DEBUG
! 1404: printf("MOPEN %d post @'%s' ('%s' )'%s'\n",
! 1405: no, save,
! 1406: regstartp[no] ? regstartp[no] : "NULL",
! 1407: regendp[no] ? regendp[no] : "NULL");
! 1408: #endif
! 1409: return 1;
! 1410: }
! 1411: regstartp[no] = save; /* We were wrong... */
! 1412: return 0;
! 1413: }
! 1414: /* break; Not Reached */
! 1415: case MCLOSE + 1:
! 1416: case MCLOSE + 2:
! 1417: case MCLOSE + 3:
! 1418: case MCLOSE + 4:
! 1419: case MCLOSE + 5:
! 1420: case MCLOSE + 6:
! 1421: case MCLOSE + 7:
! 1422: case MCLOSE + 8:
! 1423: case MCLOSE + 9:{
! 1424: register int no;
! 1425: register char_u *save;
! 1426:
! 1427: no = OP(scan) - MCLOSE;
! 1428: save = regendp[no];
! 1429: regendp[no] = reginput; /* Tentatively */
! 1430: #ifdef DEBUG
! 1431: printf("MCLOSE %d pre @'%s' ('%s' )'%s'\n",
! 1432: no, save,
! 1433: regstartp[no] ? regstartp[no] : "NULL",
! 1434: regendp[no] ? regendp[no] : "NULL");
! 1435: #endif
! 1436:
! 1437: if (regmatch(next)) {
! 1438: #ifdef DEBUG
! 1439: printf("MCLOSE %d post @'%s' ('%s' )'%s'\n",
! 1440: no, save,
! 1441: regstartp[no] ? regstartp[no] : "NULL",
! 1442: regendp[no] ? regendp[no] : "NULL");
! 1443: #endif
! 1444: return 1;
! 1445: }
! 1446: regendp[no] = save; /* We were wrong... */
! 1447: return 0;
! 1448: }
! 1449: /* break; Not Reached */
! 1450: #ifdef BACKREF
! 1451: case BACKREF + 1:
! 1452: case BACKREF + 2:
! 1453: case BACKREF + 3:
! 1454: case BACKREF + 4:
! 1455: case BACKREF + 5:
! 1456: case BACKREF + 6:
! 1457: case BACKREF + 7:
! 1458: case BACKREF + 8:
! 1459: case BACKREF + 9:{
! 1460: register int no;
! 1461: int len;
! 1462:
! 1463: no = OP(scan) - BACKREF;
! 1464: if (regendp[no] != NULL) {
! 1465: len = (int)(regendp[no] - regstartp[no]);
! 1466: if (cstrncmp(regstartp[no], reginput, len) != 0)
! 1467: return 0;
! 1468: reginput += len;
! 1469: } else {
! 1470: /*emsg("backref to 0-repeat");*/
! 1471: /*return 0;*/
! 1472: }
! 1473: }
! 1474: break;
! 1475: #endif
! 1476: case BRANCH:{
! 1477: register char_u *save;
! 1478:
! 1479: if (OP(next) != BRANCH) /* No choice. */
! 1480: next = OPERAND(scan); /* Avoid recursion. */
! 1481: else {
! 1482: do {
! 1483: save = reginput;
! 1484: if (regmatch(OPERAND(scan)))
! 1485: return 1;
! 1486: reginput = save;
! 1487: scan = regnext(scan);
! 1488: } while (scan != NULL && OP(scan) == BRANCH);
! 1489: return 0;
! 1490: /* NOTREACHED */
! 1491: }
! 1492: }
! 1493: break;
! 1494: case STAR:
! 1495: case PLUS:{
! 1496: register int nextch;
! 1497: register int no;
! 1498: register char_u *save;
! 1499: register int min;
! 1500:
! 1501: /*
! 1502: * Lookahead to avoid useless match attempts when we know
! 1503: * what character comes next.
! 1504: */
! 1505: nextch = '\0';
! 1506: if (OP(next) == EXACTLY)
! 1507: {
! 1508: nextch = *OPERAND(next);
! 1509: if (reg_ic)
! 1510: nextch = TO_UPPER(nextch);
! 1511: }
! 1512: min = (OP(scan) == STAR) ? 0 : 1;
! 1513: save = reginput;
! 1514: no = regrepeat(OPERAND(scan));
! 1515: while (no >= min)
! 1516: {
! 1517: /* If it could work, try it. */
! 1518: if (nextch == '\0' || (*reginput == nextch ||
! 1519: (reg_ic && TO_UPPER(*reginput) == nextch)))
! 1520: if (regmatch(next))
! 1521: return 1;
! 1522: /* Couldn't or didn't -- back up. */
! 1523: no--;
! 1524: reginput = save + no;
! 1525: }
! 1526: return 0;
! 1527: }
! 1528: /* break; Not Reached */
! 1529: case END:
! 1530: return 1; /* Success! */
! 1531: /* break; Not Reached */
! 1532: default:
! 1533: emsg(e_re_corr);
! 1534: return 0;
! 1535: /* break; Not Reached */
! 1536: }
! 1537:
! 1538: scan = next;
! 1539: }
! 1540:
! 1541: /*
! 1542: * We get here only if there's trouble -- normally "case END" is the
! 1543: * terminating point.
! 1544: */
! 1545: emsg(e_re_corr);
! 1546: return 0;
! 1547: }
! 1548:
! 1549: /*
! 1550: - regrepeat - repeatedly match something simple, report how many
! 1551: */
! 1552: static int
! 1553: regrepeat(p)
! 1554: char_u *p;
! 1555: {
! 1556: register int count = 0;
! 1557: register char_u *scan;
! 1558: register char_u *opnd;
! 1559:
! 1560: scan = reginput;
! 1561: opnd = OPERAND(p);
! 1562: switch (OP(p)) {
! 1563: case ANY:
! 1564: count = STRLEN(scan);
! 1565: scan += count;
! 1566: break;
! 1567: case IDENT:
! 1568: for (count = 0; isidchar(*scan); ++count)
! 1569: ++scan;
! 1570: break;
! 1571: case WORD:
! 1572: for (count = 0; iswordchar(*scan); ++count)
! 1573: ++scan;
! 1574: break;
! 1575: case FNAME:
! 1576: for (count = 0; isfilechar(*scan); ++count)
! 1577: ++scan;
! 1578: break;
! 1579: case PRINT:
! 1580: for (count = 0; charsize(*scan) == 1; ++count)
! 1581: ++scan;
! 1582: break;
! 1583: case SIDENT:
! 1584: for (count = 0; !isdigit(*scan) && isidchar(*scan); ++count)
! 1585: ++scan;
! 1586: break;
! 1587: case SWORD:
! 1588: for (count = 0; !isdigit(*scan) && iswordchar(*scan); ++count)
! 1589: ++scan;
! 1590: break;
! 1591: case SFNAME:
! 1592: for (count = 0; !isdigit(*scan) && isfilechar(*scan); ++count)
! 1593: ++scan;
! 1594: break;
! 1595: case SPRINT:
! 1596: for (count = 0; !isdigit(*scan) && charsize(*scan) == 1; ++count)
! 1597: ++scan;
! 1598: break;
! 1599: case EXACTLY:
! 1600: while (*opnd == *scan || (reg_ic && TO_UPPER(*opnd) == TO_UPPER(*scan)))
! 1601: {
! 1602: count++;
! 1603: scan++;
! 1604: }
! 1605: break;
! 1606: case ANYOF:
! 1607: while (*scan != '\0' && cstrchr(opnd, *scan) != NULL)
! 1608: {
! 1609: count++;
! 1610: scan++;
! 1611: }
! 1612: break;
! 1613: case ANYBUT:
! 1614: while (*scan != '\0' && cstrchr(opnd, *scan) == NULL) {
! 1615: count++;
! 1616: scan++;
! 1617: }
! 1618: break;
! 1619: default: /* Oh dear. Called inappropriately. */
! 1620: emsg(e_re_corr);
! 1621: count = 0; /* Best compromise. */
! 1622: break;
! 1623: }
! 1624: reginput = scan;
! 1625:
! 1626: return count;
! 1627: }
! 1628:
! 1629: /*
! 1630: - regnext - dig the "next" pointer out of a node
! 1631: */
! 1632: static char_u *
! 1633: regnext(p)
! 1634: register char_u *p;
! 1635: {
! 1636: register int offset;
! 1637:
! 1638: if (p == ®dummy)
! 1639: return NULL;
! 1640:
! 1641: offset = NEXT(p);
! 1642: if (offset == 0)
! 1643: return NULL;
! 1644:
! 1645: if (OP(p) == BACK)
! 1646: return p - offset;
! 1647: else
! 1648: return p + offset;
! 1649: }
! 1650:
! 1651: #ifdef DEBUG
! 1652:
! 1653: /*
! 1654: - regdump - dump a regexp onto stdout in vaguely comprehensible form
! 1655: */
! 1656: void
! 1657: regdump(r)
! 1658: regexp *r;
! 1659: {
! 1660: register char_u *s;
! 1661: register int op = EXACTLY; /* Arbitrary non-END op. */
! 1662: register char_u *next;
! 1663:
! 1664:
! 1665: s = r->program + 1;
! 1666: while (op != END) { /* While that wasn't END last time... */
! 1667: op = OP(s);
! 1668: printf("%2d%s", s - r->program, regprop(s)); /* Where, what. */
! 1669: next = regnext(s);
! 1670: if (next == NULL) /* Next ptr. */
! 1671: printf("(0)");
! 1672: else
! 1673: printf("(%d)", (s - r->program) + (next - s));
! 1674: s += 3;
! 1675: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
! 1676: /* Literal string, where present. */
! 1677: while (*s != '\0') {
! 1678: putchar(*s);
! 1679: s++;
! 1680: }
! 1681: s++;
! 1682: }
! 1683: putchar('\n');
! 1684: }
! 1685:
! 1686: /* Header fields of interest. */
! 1687: if (r->regstart != '\0')
! 1688: printf("start `%c' ", r->regstart);
! 1689: if (r->reganch)
! 1690: printf("anchored ");
! 1691: if (r->regmust != NULL)
! 1692: printf("must have \"%s\"", r->regmust);
! 1693: printf("\n");
! 1694: }
! 1695:
! 1696: /*
! 1697: - regprop - printable representation of opcode
! 1698: */
! 1699: static char_u *
! 1700: regprop(op)
! 1701: char_u *op;
! 1702: {
! 1703: register char_u *p;
! 1704: static char_u buf[50];
! 1705:
! 1706: (void) strcpy(buf, ":");
! 1707:
! 1708: switch (OP(op)) {
! 1709: case BOL:
! 1710: p = "BOL";
! 1711: break;
! 1712: case EOL:
! 1713: p = "EOL";
! 1714: break;
! 1715: case ANY:
! 1716: p = "ANY";
! 1717: break;
! 1718: case IDENT:
! 1719: p = "IDENT";
! 1720: break;
! 1721: case WORD:
! 1722: p = "WORD";
! 1723: break;
! 1724: case FNAME:
! 1725: p = "FNAME";
! 1726: break;
! 1727: case PRINT:
! 1728: p = "PRINT";
! 1729: break;
! 1730: case SIDENT:
! 1731: p = "SIDENT";
! 1732: break;
! 1733: case SWORD:
! 1734: p = "SWORD";
! 1735: break;
! 1736: case SFNAME:
! 1737: p = "SFNAME";
! 1738: break;
! 1739: case SPRINT:
! 1740: p = "SPRINT";
! 1741: break;
! 1742: case ANYOF:
! 1743: p = "ANYOF";
! 1744: break;
! 1745: case ANYBUT:
! 1746: p = "ANYBUT";
! 1747: break;
! 1748: case BRANCH:
! 1749: p = "BRANCH";
! 1750: break;
! 1751: case EXACTLY:
! 1752: p = "EXACTLY";
! 1753: break;
! 1754: case NOTHING:
! 1755: p = "NOTHING";
! 1756: break;
! 1757: case BACK:
! 1758: p = "BACK";
! 1759: break;
! 1760: case END:
! 1761: p = "END";
! 1762: break;
! 1763: case MOPEN + 1:
! 1764: case MOPEN + 2:
! 1765: case MOPEN + 3:
! 1766: case MOPEN + 4:
! 1767: case MOPEN + 5:
! 1768: case MOPEN + 6:
! 1769: case MOPEN + 7:
! 1770: case MOPEN + 8:
! 1771: case MOPEN + 9:
! 1772: sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
! 1773: p = NULL;
! 1774: break;
! 1775: case MCLOSE + 1:
! 1776: case MCLOSE + 2:
! 1777: case MCLOSE + 3:
! 1778: case MCLOSE + 4:
! 1779: case MCLOSE + 5:
! 1780: case MCLOSE + 6:
! 1781: case MCLOSE + 7:
! 1782: case MCLOSE + 8:
! 1783: case MCLOSE + 9:
! 1784: sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
! 1785: p = NULL;
! 1786: break;
! 1787: case BACKREF + 1:
! 1788: case BACKREF + 2:
! 1789: case BACKREF + 3:
! 1790: case BACKREF + 4:
! 1791: case BACKREF + 5:
! 1792: case BACKREF + 6:
! 1793: case BACKREF + 7:
! 1794: case BACKREF + 8:
! 1795: case BACKREF + 9:
! 1796: sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
! 1797: p = NULL;
! 1798: break;
! 1799: case STAR:
! 1800: p = "STAR";
! 1801: break;
! 1802: case PLUS:
! 1803: p = "PLUS";
! 1804: break;
! 1805: default:
! 1806: sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
! 1807: p = NULL;
! 1808: break;
! 1809: }
! 1810: if (p != NULL)
! 1811: (void) strcat(buf, p);
! 1812: return buf;
! 1813: }
! 1814: #endif
! 1815:
! 1816: /*
! 1817: * The following is provided for those people who do not have strcspn() in
! 1818: * their C libraries. They should get off their butts and do something
! 1819: * about it; at least one public-domain implementation of those (highly
! 1820: * useful) string routines has been published on Usenet.
! 1821: */
! 1822: #ifndef HAVE_STRCSPN
! 1823: /*
! 1824: * strcspn - find length of initial segment of s1 consisting entirely
! 1825: * of characters not from s2
! 1826: */
! 1827:
! 1828: static size_t
! 1829: strcspn(s1, s2)
! 1830: const char_u *s1;
! 1831: const char_u *s2;
! 1832: {
! 1833: register char_u *scan1;
! 1834: register char_u *scan2;
! 1835: register int count;
! 1836:
! 1837: count = 0;
! 1838: for (scan1 = s1; *scan1 != '\0'; scan1++) {
! 1839: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
! 1840: if (*scan1 == *scan2++)
! 1841: return count;
! 1842: count++;
! 1843: }
! 1844: return count;
! 1845: }
! 1846: #endif
! 1847:
! 1848: /*
! 1849: * Compare two strings, ignore case if reg_ic set.
! 1850: * Return 0 if strings match, non-zero otherwise.
! 1851: */
! 1852: int
! 1853: cstrncmp(s1, s2, n)
! 1854: char_u *s1, *s2;
! 1855: int n;
! 1856: {
! 1857: if (!reg_ic)
! 1858: return STRNCMP(s1, s2, (size_t)n);
! 1859:
! 1860: return vim_strnicmp(s1, s2, (size_t)n);
! 1861: }
! 1862:
! 1863: /*
! 1864: * cstrchr: This function is used a lot for simple searches, keep it fast!
! 1865: */
! 1866: static char_u *
! 1867: cstrchr(s, c)
! 1868: char_u *s;
! 1869: register int c;
! 1870: {
! 1871: register char_u *p;
! 1872:
! 1873: if (!reg_ic)
! 1874: return vim_strchr(s, c);
! 1875:
! 1876: c = TO_UPPER(c);
! 1877:
! 1878: for (p = s; *p; p++)
! 1879: {
! 1880: if (TO_UPPER(*p) == c)
! 1881: return p;
! 1882: }
! 1883: return NULL;
! 1884: }