Annotation of src/usr.bin/vim/regexp.c, Revision 1.1.1.1
1.1 downsj 1: /* $OpenBSD$ */
2: /* vi:set ts=4 sw=4:
3: * NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE
4: *
5: * This is NOT the original regular expression code as written by
6: * Henry Spencer. This code has been modified specifically for use
7: * with the VIM editor, and should not be used apart from compiling
8: * VIM. If you want a good regular expression library, get the
9: * original code. The copyright notice that follows is from the
10: * original.
11: *
12: * NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE
13: *
14: *
15: * vim_regcomp and vim_regexec -- vim_regsub and regerror are elsewhere
16: *
17: * Copyright (c) 1986 by University of Toronto.
18: * Written by Henry Spencer. Not derived from licensed software.
19: *
20: * Permission is granted to anyone to use this software for any
21: * purpose on any computer system, and to redistribute it freely,
22: * subject to the following restrictions:
23: *
24: * 1. The author is not responsible for the consequences of use of
25: * this software, no matter how awful, even if they arise
26: * from defects in it.
27: *
28: * 2. The origin of this software must not be misrepresented, either
29: * by explicit claim or by omission.
30: *
31: * 3. Altered versions must be plainly marked as such, and must not
32: * be misrepresented as being the original software.
33: *
34: * Beware that some of this code is subtly aware of the way operator
35: * precedence is structured in regular expressions. Serious changes in
36: * regular-expression syntax might require a total rethink.
37: *
38: * $Log: regexp.c,v $
39: * Revision 1.2 88/04/28 08:09:45 tony
40: * First modification of the regexp library. Added an external variable
41: * 'reg_ic' which can be set to indicate that case should be ignored.
42: * Added a new parameter to vim_regexec() to indicate that the given string
43: * comes from the beginning of a line and is thus eligible to match
44: * 'beginning-of-line'.
45: *
46: * Revisions by Olaf 'Rhialto' Seibert, rhialto@mbfys.kun.nl:
47: * Changes for vi: (the semantics of several things were rather different)
48: * - Added lexical analyzer, because in vi magicness of characters
49: * is rather difficult, and may change over time.
50: * - Added support for \< \> \1-\9 and ~
51: * - Left some magic stuff in, but only backslashed: \| \+
52: * - * and \+ still work after \) even though they shouldn't.
53: */
54: #include "vim.h"
55: #include "globals.h"
56: #include "proto.h"
57: #undef DEBUG
58:
59: #include <stdio.h>
60: #include "regexp.h"
61: #include "option.h"
62:
63: /*
64: * The "internal use only" fields in regexp.h are present to pass info from
65: * compile to execute that permits the execute phase to run lots faster on
66: * simple cases. They are:
67: *
68: * regstart char that must begin a match; '\0' if none obvious
69: * reganch is the match anchored (at beginning-of-line only)?
70: * regmust string (pointer into program) that match must include, or NULL
71: * regmlen length of regmust string
72: *
73: * Regstart and reganch permit very fast decisions on suitable starting points
74: * for a match, cutting down the work a lot. Regmust permits fast rejection
75: * of lines that cannot possibly match. The regmust tests are costly enough
76: * that vim_regcomp() supplies a regmust only if the r.e. contains something
77: * potentially expensive (at present, the only such thing detected is * or +
78: * at the start of the r.e., which can involve a lot of backup). Regmlen is
79: * supplied because the test in vim_regexec() needs it and vim_regcomp() is
80: * computing it anyway.
81: */
82:
83: /*
84: * Structure for regexp "program". This is essentially a linear encoding
85: * of a nondeterministic finite-state machine (aka syntax charts or
86: * "railroad normal form" in parsing technology). Each node is an opcode
87: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
88: * all nodes except BRANCH implement concatenation; a "next" pointer with
89: * a BRANCH on both ends of it is connecting two alternatives. (Here we
90: * have one of the subtle syntax dependencies: an individual BRANCH (as
91: * opposed to a collection of them) is never concatenated with anything
92: * because of operator precedence.) The operand of some types of node is
93: * a literal string; for others, it is a node leading into a sub-FSM. In
94: * particular, the operand of a BRANCH node is the first node of the branch.
95: * (NB this is *not* a tree structure: the tail of the branch connects
96: * to the thing following the set of BRANCHes.) The opcodes are:
97: */
98:
99: /* definition number opnd? meaning */
100: #define END 0 /* no End of program. */
101: #define BOL 1 /* no Match "" at beginning of line. */
102: #define EOL 2 /* no Match "" at end of line. */
103: #define ANY 3 /* no Match any one character. */
104: #define ANYOF 4 /* str Match any character in this string. */
105: #define ANYBUT 5 /* str Match any character not in this
106: * string. */
107: #define BRANCH 6 /* node Match this alternative, or the
108: * next... */
109: #define BACK 7 /* no Match "", "next" ptr points backward. */
110: #define EXACTLY 8 /* str Match this string. */
111: #define NOTHING 9 /* no Match empty string. */
112: #define STAR 10 /* node Match this (simple) thing 0 or more
113: * times. */
114: #define PLUS 11 /* node Match this (simple) thing 1 or more
115: * times. */
116: #define BOW 12 /* no Match "" after [^a-zA-Z0-9_] */
117: #define EOW 13 /* no Match "" at [^a-zA-Z0-9_] */
118: #define IDENT 14 /* no Match identifier char */
119: #define WORD 15 /* no Match keyword char */
120: #define FNAME 16 /* no Match file name char */
121: #define PRINT 17 /* no Match printable char */
122: #define SIDENT 18 /* no Match identifier char but no digit */
123: #define SWORD 19 /* no Match word char but no digit */
124: #define SFNAME 20 /* no Match file name char but no digit */
125: #define SPRINT 21 /* no Match printable char but no digit */
126: #define MOPEN 30 /* no Mark this point in input as start of
127: * #n. */
128: /* MOPEN+1 is number 1, etc. */
129: #define MCLOSE 40 /* no Analogous to MOPEN. */
130: #define BACKREF 50 /* node Match same string again \1-\9 */
131:
132: #define Magic(x) ((x)|('\\'<<8))
133:
134: /*
135: * Opcode notes:
136: *
137: * BRANCH The set of branches constituting a single choice are hooked
138: * together with their "next" pointers, since precedence prevents
139: * anything being concatenated to any individual branch. The
140: * "next" pointer of the last BRANCH in a choice points to the
141: * thing following the whole choice. This is also where the
142: * final "next" pointer of each individual branch points; each
143: * branch starts with the operand node of a BRANCH node.
144: *
145: * BACK Normal "next" pointers all implicitly point forward; BACK
146: * exists to make loop structures possible.
147: *
148: * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
149: * BRANCH structures using BACK. Simple cases (one character
150: * per match) are implemented with STAR and PLUS for speed
151: * and to minimize recursive plunges.
152: * Note: We would like to use "\?" instead of "\=", but a "\?"
153: * can be part of a pattern to escape the special meaning of '?'
154: * at the end of the pattern in "?pattern?e".
155: *
156: * MOPEN,MCLOSE ...are numbered at compile time.
157: */
158:
159: /*
160: * A node is one char of opcode followed by two chars of "next" pointer.
161: * "Next" pointers are stored as two 8-bit pieces, high order first. The
162: * value is a positive offset from the opcode of the node containing it.
163: * An operand, if any, simply follows the node. (Note that much of the
164: * code generation knows about this implicit relationship.)
165: *
166: * Using two bytes for the "next" pointer is vast overkill for most things,
167: * but allows patterns to get big without disasters.
168: */
169: #define OP(p) (*(p))
170: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
171: #define OPERAND(p) ((p) + 3)
172:
173: /*
174: * Utility definitions.
175: */
176: #ifndef CHARBITS
177: #define UCHARAT(p) ((int)*(unsigned char *)(p))
178: #else
179: #define UCHARAT(p) ((int)*(p)&CHARBITS)
180: #endif
181:
182: #define EMSG_RETURN(m) { emsg(m); rc_did_emsg = TRUE; return NULL; }
183:
184: static int ismult __ARGS((int));
185: static char_u *cstrchr __ARGS((char_u *, int));
186:
187: static int
188: ismult(c)
189: int c;
190: {
191: return (c == Magic('*') || c == Magic('+') || c == Magic('='));
192: }
193:
194: /*
195: * Flags to be passed up and down.
196: */
197: #define HASWIDTH 01 /* Known never to match null string. */
198: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
199: #define SPSTART 04 /* Starts with * or +. */
200: #define WORST 0 /* Worst case. */
201:
202: /*
203: * The following allows empty REs, meaning "the same as the previous RE".
204: * per the ed(1) manual.
205: */
206: /* #define EMPTY_RE */ /* this is done outside of regexp */
207: #ifdef EMPTY_RE
208: char_u *reg_prev_re;
209: #endif
210:
211: #define TILDE
212: #ifdef TILDE
213: char_u *reg_prev_sub;
214: #endif
215:
216: /*
217: * Global work variables for vim_regcomp().
218: */
219:
220: static char_u *regparse; /* Input-scan pointer. */
221: static int regnpar; /* () count. */
222: static char_u regdummy;
223: static char_u *regcode; /* Code-emit pointer; ®dummy = don't. */
224: static long regsize; /* Code size. */
225: static char_u **regendp; /* Ditto for endp. */
226:
227: /*
228: * META contains all characters that may be magic, except '^' and '$'.
229: * This depends on the configuration options TILDE, BACKREF.
230: * (could be done simpler for compilers that know string concatenation)
231: */
232:
233: #ifdef TILDE
234: # ifdef BACKREF
235: static char_u META[] = ".[()|=+*<>iIkKfFpP~123456789";
236: # else
237: static char_u META[] = ".[()|=+*<>iIkKfFpP~";
238: # endif
239: #else
240: # ifdef BACKREF
241: static char_u META[] = ".[()|=+*<>iIkKfFpP123456789";
242: # else
243: static char_u META[] = ".[()|=+*<>iIkKfFpP";
244: # endif
245: #endif
246:
247: /*
248: * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
249: * These are:
250: * \r - New line (CR).
251: * \t - Tab (TAB).
252: * \e - Escape (ESC).
253: * \b - Backspace (Ctrl('H')).
254: */
255: static char_u REGEXP_ABBR[] = "rteb";
256:
257: /*
258: * Forward declarations for vim_regcomp()'s friends.
259: */
260: static void initchr __ARGS((char_u *));
261: static int getchr __ARGS((void));
262: static int peekchr __ARGS((void));
263: #define PeekChr() curchr /* shortcut only when last action was peekchr() */
264: static int curchr;
265: static void skipchr __ARGS((void));
266: static void ungetchr __ARGS((void));
267: static char_u *reg __ARGS((int, int *));
268: static char_u *regbranch __ARGS((int *));
269: static char_u *regpiece __ARGS((int *));
270: static char_u *regatom __ARGS((int *));
271: static char_u *regnode __ARGS((int));
272: static char_u *regnext __ARGS((char_u *));
273: static void regc __ARGS((int));
274: static void unregc __ARGS((void));
275: static void reginsert __ARGS((int, char_u *));
276: static void regtail __ARGS((char_u *, char_u *));
277: static void regoptail __ARGS((char_u *, char_u *));
278:
279: #ifndef HAVE_STRCSPN
280: static size_t strcspn __ARGS((const char_u *, const char_u *));
281: #endif
282:
283: /*
284: * skip past regular expression
285: * stop at end of 'p' of where 'dirc' is found ('/', '?', etc)
286: * take care of characters with a backslash in front of it
287: * skip strings inside [ and ].
288: */
289: char_u *
290: skip_regexp(p, dirc)
291: char_u *p;
292: int dirc;
293: {
294: int in_range = FALSE;
295:
296: for (; p[0] != NUL; ++p)
297: {
298: if (p[0] == dirc && !in_range) /* found end of regexp */
299: break;
300: if ((p[0] == '[' && p_magic) || (p[0] == '\\' && p[1] == '[' && !p_magic))
301: {
302: in_range = TRUE;
303: if (p[0] == '\\')
304: ++p;
305: /* "[^]" and "[]" are not the end of a range */
306: if (p[1] == '^')
307: ++p;
308: if (p[1] == ']')
309: ++p;
310: }
311: else if (p[0] == '\\' && p[1] != NUL)
312: ++p; /* skip next character */
313: else if (p[0] == ']')
314: in_range = FALSE;
315: }
316: return p;
317: }
318:
319: /*
320: - vim_regcomp - compile a regular expression into internal code
321: *
322: * We can't allocate space until we know how big the compiled form will be,
323: * but we can't compile it (and thus know how big it is) until we've got a
324: * place to put the code. So we cheat: we compile it twice, once with code
325: * generation turned off and size counting turned on, and once "for real".
326: * This also means that we don't allocate space until we are sure that the
327: * thing really will compile successfully, and we never have to move the
328: * code and thus invalidate pointers into it. (Note that it has to be in
329: * one piece because vim_free() must be able to free it all.)
330: *
331: * Beware that the optimization-preparation code in here knows about some
332: * of the structure of the compiled regexp.
333: */
334: regexp *
335: vim_regcomp(exp)
336: char_u *exp;
337: {
338: register regexp *r;
339: register char_u *scan;
340: register char_u *longest;
341: register int len;
342: int flags;
343: /* extern char *malloc();*/
344:
345: if (exp == NULL)
346: EMSG_RETURN(e_null);
347:
348: #ifdef EMPTY_RE /* this is done outside of regexp */
349: if (*exp == '\0')
350: {
351: if (reg_prev_re)
352: exp = reg_prev_re;
353: else
354: EMSG_RETURN(e_noprevre);
355: }
356: #endif
357:
358: /* First pass: determine size, legality. */
359: initchr((char_u *)exp);
360: regnpar = 1;
361: regsize = 0L;
362: regcode = ®dummy;
363: regendp = NULL;
364: regc(MAGIC);
365: if (reg(0, &flags) == NULL)
366: return NULL;
367:
368: /* Small enough for pointer-storage convention? */
369: if (regsize >= 32767L) /* Probably could be 65535L. */
370: EMSG_RETURN(e_toolong);
371:
372: /* Allocate space. */
373: /* r = (regexp *) malloc((unsigned) (sizeof(regexp) + regsize));*/
374: r = (regexp *) alloc((unsigned) (sizeof(regexp) + regsize));
375: if (r == NULL)
376: return NULL;
377:
378: #ifdef EMPTY_RE /* this is done outside of regexp */
379: if (exp != reg_prev_re) {
380: vim_free(reg_prev_re);
381: if (reg_prev_re = alloc(STRLEN(exp) + 1))
382: STRCPY(reg_prev_re, exp);
383: }
384: #endif
385:
386: /* Second pass: emit code. */
387: initchr((char_u *)exp);
388: regnpar = 1;
389: regcode = r->program;
390: regendp = r->endp;
391: regc(MAGIC);
392: if (reg(0, &flags) == NULL) {
393: vim_free(r);
394: return NULL;
395: }
396:
397: /* Dig out information for optimizations. */
398: r->regstart = '\0'; /* Worst-case defaults. */
399: r->reganch = 0;
400: r->regmust = NULL;
401: r->regmlen = 0;
402: scan = r->program + 1; /* First BRANCH. */
403: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
404: scan = OPERAND(scan);
405:
406: /* Starting-point info. */
407: if (OP(scan) == EXACTLY)
408: r->regstart = *OPERAND(scan);
409: else if (OP(scan) == BOL)
410: r->reganch++;
411:
412: /*
413: * If there's something expensive in the r.e., find the longest
414: * literal string that must appear and make it the regmust. Resolve
415: * ties in favor of later strings, since the regstart check works
416: * with the beginning of the r.e. and avoiding duplication
417: * strengthens checking. Not a strong reason, but sufficient in the
418: * absence of others.
419: */
420: /*
421: * When the r.e. starts with BOW, it is faster to look for a regmust
422: * first. Used a lot for "#" and "*" commands. (Added by mool).
423: */
424: if (flags & SPSTART || OP(scan) == BOW) {
425: longest = NULL;
426: len = 0;
427: for (; scan != NULL; scan = regnext(scan))
428: if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len) {
429: longest = OPERAND(scan);
430: len = STRLEN(OPERAND(scan));
431: }
432: r->regmust = longest;
433: r->regmlen = len;
434: }
435: }
436: #ifdef DEBUG
437: regdump(r);
438: #endif
439: return r;
440: }
441:
442: /*
443: - reg - regular expression, i.e. main body or parenthesized thing
444: *
445: * Caller must absorb opening parenthesis.
446: *
447: * Combining parenthesis handling with the base level of regular expression
448: * is a trifle forced, but the need to tie the tails of the branches to what
449: * follows makes it hard to avoid.
450: */
451: static char_u *
452: reg(paren, flagp)
453: int paren; /* Parenthesized? */
454: int *flagp;
455: {
456: register char_u *ret;
457: register char_u *br;
458: register char_u *ender;
459: register int parno = 0;
460: int flags;
461:
462: *flagp = HASWIDTH; /* Tentatively. */
463:
464: /* Make an MOPEN node, if parenthesized. */
465: if (paren) {
466: if (regnpar >= NSUBEXP)
467: EMSG_RETURN(e_toombra);
468: parno = regnpar;
469: regnpar++;
470: ret = regnode(MOPEN + parno);
471: if (regendp)
472: regendp[parno] = NULL; /* haven't seen the close paren yet */
473: } else
474: ret = NULL;
475:
476: /* Pick up the branches, linking them together. */
477: br = regbranch(&flags);
478: if (br == NULL)
479: return NULL;
480: if (ret != NULL)
481: regtail(ret, br); /* MOPEN -> first. */
482: else
483: ret = br;
484: if (!(flags & HASWIDTH))
485: *flagp &= ~HASWIDTH;
486: *flagp |= flags & SPSTART;
487: while (peekchr() == Magic('|')) {
488: skipchr();
489: br = regbranch(&flags);
490: if (br == NULL)
491: return NULL;
492: regtail(ret, br); /* BRANCH -> BRANCH. */
493: if (!(flags & HASWIDTH))
494: *flagp &= ~HASWIDTH;
495: *flagp |= flags & SPSTART;
496: }
497:
498: /* Make a closing node, and hook it on the end. */
499: ender = regnode((paren) ? MCLOSE + parno : END);
500: regtail(ret, ender);
501:
502: /* Hook the tails of the branches to the closing node. */
503: for (br = ret; br != NULL; br = regnext(br))
504: regoptail(br, ender);
505:
506: /* Check for proper termination. */
507: if (paren && getchr() != Magic(')'))
508: EMSG_RETURN(e_toombra)
509: else if (!paren && peekchr() != '\0')
510: {
511: if (PeekChr() == Magic(')'))
512: EMSG_RETURN(e_toomket)
513: else
514: EMSG_RETURN(e_trailing) /* "Can't happen". */
515: /* NOTREACHED */
516: }
517: /*
518: * Here we set the flag allowing back references to this set of
519: * parentheses.
520: */
521: if (paren && regendp)
522: regendp[parno] = ender; /* have seen the close paren */
523: return ret;
524: }
525:
526: /*
527: - regbranch - one alternative of an | operator
528: *
529: * Implements the concatenation operator.
530: */
531: static char_u *
532: regbranch(flagp)
533: int *flagp;
534: {
535: register char_u *ret;
536: register char_u *chain;
537: register char_u *latest;
538: int flags;
539:
540: *flagp = WORST; /* Tentatively. */
541:
542: ret = regnode(BRANCH);
543: chain = NULL;
544: while (peekchr() != '\0' && PeekChr() != Magic('|') && PeekChr() != Magic(')')) {
545: latest = regpiece(&flags);
546: if (latest == NULL)
547: return NULL;
548: *flagp |= flags & HASWIDTH;
549: if (chain == NULL) /* First piece. */
550: *flagp |= flags & SPSTART;
551: else
552: regtail(chain, latest);
553: chain = latest;
554: }
555: if (chain == NULL) /* Loop ran zero times. */
556: (void) regnode(NOTHING);
557:
558: return ret;
559: }
560:
561: /*
562: - regpiece - something followed by possible [*+=]
563: *
564: * Note that the branching code sequences used for = and the general cases
565: * of * and + are somewhat optimized: they use the same NOTHING node as
566: * both the endmarker for their branch list and the body of the last branch.
567: * It might seem that this node could be dispensed with entirely, but the
568: * endmarker role is not redundant.
569: */
570: static char_u *
571: regpiece(flagp)
572: int *flagp;
573: {
574: register char_u *ret;
575: register int op;
576: register char_u *next;
577: int flags;
578:
579: ret = regatom(&flags);
580: if (ret == NULL)
581: return NULL;
582:
583: op = peekchr();
584: if (!ismult(op)) {
585: *flagp = flags;
586: return ret;
587: }
588: if (!(flags & HASWIDTH) && op != Magic('='))
589: EMSG_RETURN((char_u *)"*+ operand could be empty");
590: *flagp = (op != Magic('+')) ? (WORST | SPSTART) : (WORST | HASWIDTH);
591:
592: if (op == Magic('*') && (flags & SIMPLE))
593: reginsert(STAR, ret);
594: else if (op == Magic('*')) {
595: /* Emit x* as (x&|), where & means "self". */
596: reginsert(BRANCH, ret); /* Either x */
597: regoptail(ret, regnode(BACK)); /* and loop */
598: regoptail(ret, ret); /* back */
599: regtail(ret, regnode(BRANCH)); /* or */
600: regtail(ret, regnode(NOTHING)); /* null. */
601: } else if (op == Magic('+') && (flags & SIMPLE))
602: reginsert(PLUS, ret);
603: else if (op == Magic('+')) {
604: /* Emit x+ as x(&|), where & means "self". */
605: next = regnode(BRANCH); /* Either */
606: regtail(ret, next);
607: regtail(regnode(BACK), ret); /* loop back */
608: regtail(next, regnode(BRANCH)); /* or */
609: regtail(ret, regnode(NOTHING)); /* null. */
610: } else if (op == Magic('=')) {
611: /* Emit x= as (x|) */
612: reginsert(BRANCH, ret); /* Either x */
613: regtail(ret, regnode(BRANCH)); /* or */
614: next = regnode(NOTHING);/* null. */
615: regtail(ret, next);
616: regoptail(ret, next);
617: }
618: skipchr();
619: if (ismult(peekchr()))
620: EMSG_RETURN((char_u *)"Nested *=+");
621:
622: return ret;
623: }
624:
625: /*
626: - regatom - the lowest level
627: *
628: * Optimization: gobbles an entire sequence of ordinary characters so that
629: * it can turn them into a single node, which is smaller to store and
630: * faster to run.
631: */
632: static char_u *
633: regatom(flagp)
634: int *flagp;
635: {
636: register char_u *ret;
637: int flags;
638:
639: *flagp = WORST; /* Tentatively. */
640:
641: switch (getchr()) {
642: case Magic('^'):
643: ret = regnode(BOL);
644: break;
645: case Magic('$'):
646: ret = regnode(EOL);
647: break;
648: case Magic('<'):
649: ret = regnode(BOW);
650: break;
651: case Magic('>'):
652: ret = regnode(EOW);
653: break;
654: case Magic('.'):
655: ret = regnode(ANY);
656: *flagp |= HASWIDTH | SIMPLE;
657: break;
658: case Magic('i'):
659: ret = regnode(IDENT);
660: *flagp |= HASWIDTH | SIMPLE;
661: break;
662: case Magic('k'):
663: ret = regnode(WORD);
664: *flagp |= HASWIDTH | SIMPLE;
665: break;
666: case Magic('I'):
667: ret = regnode(SIDENT);
668: *flagp |= HASWIDTH | SIMPLE;
669: break;
670: case Magic('K'):
671: ret = regnode(SWORD);
672: *flagp |= HASWIDTH | SIMPLE;
673: break;
674: case Magic('f'):
675: ret = regnode(FNAME);
676: *flagp |= HASWIDTH | SIMPLE;
677: break;
678: case Magic('F'):
679: ret = regnode(SFNAME);
680: *flagp |= HASWIDTH | SIMPLE;
681: break;
682: case Magic('p'):
683: ret = regnode(PRINT);
684: *flagp |= HASWIDTH | SIMPLE;
685: break;
686: case Magic('P'):
687: ret = regnode(SPRINT);
688: *flagp |= HASWIDTH | SIMPLE;
689: break;
690: case Magic('('):
691: ret = reg(1, &flags);
692: if (ret == NULL)
693: return NULL;
694: *flagp |= flags & (HASWIDTH | SPSTART);
695: break;
696: case '\0':
697: case Magic('|'):
698: case Magic(')'):
699: EMSG_RETURN(e_internal) /* Supposed to be caught earlier. */
700: /* NOTREACHED */
701: case Magic('='):
702: EMSG_RETURN((char_u *)"\\= follows nothing")
703: /* NOTREACHED */
704: case Magic('+'):
705: EMSG_RETURN((char_u *)"\\+ follows nothing")
706: /* NOTREACHED */
707: case Magic('*'):
708: if (reg_magic)
709: EMSG_RETURN((char_u *)"* follows nothing")
710: else
711: EMSG_RETURN((char_u *)"\\* follows nothing")
712: /* break; Not Reached */
713: #ifdef TILDE
714: case Magic('~'): /* previous substitute pattern */
715: if (reg_prev_sub) {
716: register char_u *p;
717:
718: ret = regnode(EXACTLY);
719: p = reg_prev_sub;
720: while (*p) {
721: regc(*p++);
722: }
723: regc('\0');
724: if (p - reg_prev_sub) {
725: *flagp |= HASWIDTH;
726: if ((p - reg_prev_sub) == 1)
727: *flagp |= SIMPLE;
728: }
729: } else
730: EMSG_RETURN(e_nopresub);
731: break;
732: #endif
733: #ifdef BACKREF
734: case Magic('1'):
735: case Magic('2'):
736: case Magic('3'):
737: case Magic('4'):
738: case Magic('5'):
739: case Magic('6'):
740: case Magic('7'):
741: case Magic('8'):
742: case Magic('9'): {
743: int refnum;
744:
745: ungetchr();
746: refnum = getchr() - Magic('0');
747: /*
748: * Check if the back reference is legal. We use the parentheses
749: * pointers to mark encountered close parentheses, but this
750: * is only available in the second pass. Checking opens is
751: * always possible.
752: * Should also check that we don't refer to something that
753: * is repeated (+*=): what instance of the repetition should
754: * we match? TODO.
755: */
756: if (refnum < regnpar &&
757: (regendp == NULL || regendp[refnum] != NULL))
758: ret = regnode(BACKREF + refnum);
759: else
760: EMSG_RETURN((char_u *)"Illegal back reference");
761: }
762: break;
763: #endif
764: case Magic('['):
765: {
766: char_u *p;
767:
768: /*
769: * If there is no matching ']', we assume the '[' is a normal
770: * character. This makes ":help [" work.
771: */
772: p = regparse;
773: if (*p == '^') /* Complement of range. */
774: ++p;
775: if (*p == ']' || *p == '-')
776: ++p;
777: while (*p != '\0' && *p != ']')
778: {
779: if (*p == '-')
780: {
781: ++p;
782: if (*p != ']' && *p != '\0')
783: ++p;
784: }
785: else if (*p == '\\' && p[1] != '\0')
786: p += 2;
787: else
788: ++p;
789: }
790: if (*p == ']') /* there is a matching ']' */
791: {
792: /*
793: * In a character class, different parsing rules apply.
794: * Not even \ is special anymore, nothing is.
795: */
796: if (*regparse == '^') { /* Complement of range. */
797: ret = regnode(ANYBUT);
798: regparse++;
799: } else
800: ret = regnode(ANYOF);
801: if (*regparse == ']' || *regparse == '-')
802: regc(*regparse++);
803: while (*regparse != '\0' && *regparse != ']') {
804: if (*regparse == '-') {
805: regparse++;
806: if (*regparse == ']' || *regparse == '\0')
807: regc('-');
808: else {
809: register int cclass;
810: register int cclassend;
811:
812: cclass = UCHARAT(regparse - 2) + 1;
813: cclassend = UCHARAT(regparse);
814: if (cclass > cclassend + 1)
815: EMSG_RETURN(e_invrange);
816: for (; cclass <= cclassend; cclass++)
817: regc(cclass);
818: regparse++;
819: }
820: } else if (*regparse == '\\' && regparse[1]) {
821: regparse++;
822: regc(*regparse++);
823: } else
824: regc(*regparse++);
825: }
826: regc('\0');
827: if (*regparse != ']')
828: EMSG_RETURN(e_toomsbra);
829: skipchr(); /* let's be friends with the lexer again */
830: *flagp |= HASWIDTH | SIMPLE;
831: break;
832: }
833: }
834: /* FALLTHROUGH */
835:
836: default:
837: {
838: register int len;
839: int chr;
840:
841: ungetchr();
842: len = 0;
843: ret = regnode(EXACTLY);
844: /*
845: * Always take at least one character, for '[' without matching
846: * ']'.
847: */
848: while ((chr = peekchr()) != '\0' && (chr < Magic(0) || len == 0))
849: {
850: regc(chr);
851: skipchr();
852: len++;
853: }
854: #ifdef DEBUG
855: if (len == 0)
856: EMSG_RETURN((char_u *)"Unexpected magic character; check META.");
857: #endif
858: /*
859: * If there is a following *, \+ or \= we need the character
860: * in front of it as a single character operand
861: */
862: if (len > 1 && ismult(chr))
863: {
864: unregc(); /* Back off of *+= operand */
865: ungetchr(); /* and put it back for next time */
866: --len;
867: }
868: regc('\0');
869: *flagp |= HASWIDTH;
870: if (len == 1)
871: *flagp |= SIMPLE;
872: }
873: break;
874: }
875:
876: return ret;
877: }
878:
879: /*
880: - regnode - emit a node
881: */
882: static char_u * /* Location. */
883: regnode(op)
884: int op;
885: {
886: register char_u *ret;
887: register char_u *ptr;
888:
889: ret = regcode;
890: if (ret == ®dummy) {
891: regsize += 3;
892: return ret;
893: }
894: ptr = ret;
895: *ptr++ = op;
896: *ptr++ = '\0'; /* Null "next" pointer. */
897: *ptr++ = '\0';
898: regcode = ptr;
899:
900: return ret;
901: }
902:
903: /*
904: - regc - emit (if appropriate) a byte of code
905: */
906: static void
907: regc(b)
908: int b;
909: {
910: if (regcode != ®dummy)
911: *regcode++ = b;
912: else
913: regsize++;
914: }
915:
916: /*
917: - unregc - take back (if appropriate) a byte of code
918: */
919: static void
920: unregc()
921: {
922: if (regcode != ®dummy)
923: regcode--;
924: else
925: regsize--;
926: }
927:
928: /*
929: - reginsert - insert an operator in front of already-emitted operand
930: *
931: * Means relocating the operand.
932: */
933: static void
934: reginsert(op, opnd)
935: int op;
936: char_u *opnd;
937: {
938: register char_u *src;
939: register char_u *dst;
940: register char_u *place;
941:
942: if (regcode == ®dummy) {
943: regsize += 3;
944: return;
945: }
946: src = regcode;
947: regcode += 3;
948: dst = regcode;
949: while (src > opnd)
950: *--dst = *--src;
951:
952: place = opnd; /* Op node, where operand used to be. */
953: *place++ = op;
954: *place++ = '\0';
955: *place = '\0';
956: }
957:
958: /*
959: - regtail - set the next-pointer at the end of a node chain
960: */
961: static void
962: regtail(p, val)
963: char_u *p;
964: char_u *val;
965: {
966: register char_u *scan;
967: register char_u *temp;
968: register int offset;
969:
970: if (p == ®dummy)
971: return;
972:
973: /* Find last node. */
974: scan = p;
975: for (;;) {
976: temp = regnext(scan);
977: if (temp == NULL)
978: break;
979: scan = temp;
980: }
981:
982: if (OP(scan) == BACK)
983: offset = (int)(scan - val);
984: else
985: offset = (int)(val - scan);
986: *(scan + 1) = (char_u) ((offset >> 8) & 0377);
987: *(scan + 2) = (char_u) (offset & 0377);
988: }
989:
990: /*
991: - regoptail - regtail on operand of first argument; nop if operandless
992: */
993: static void
994: regoptail(p, val)
995: char_u *p;
996: char_u *val;
997: {
998: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
999: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
1000: return;
1001: regtail(OPERAND(p), val);
1002: }
1003:
1004: /*
1005: - getchr() - get the next character from the pattern. We know about
1006: * magic and such, so therefore we need a lexical analyzer.
1007: */
1008:
1009: /* static int curchr; */
1010: static int prevchr;
1011: static int nextchr; /* used for ungetchr() */
1012: /*
1013: * Note: prevchr is sometimes -1 when we are not at the start,
1014: * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
1015: * taken to be magic -- webb
1016: */
1017: static int at_start; /* True when we are on the first character */
1018:
1019: static void
1020: initchr(str)
1021: char_u *str;
1022: {
1023: regparse = str;
1024: curchr = prevchr = nextchr = -1;
1025: at_start = TRUE;
1026: }
1027:
1028: static int
1029: peekchr()
1030: {
1031: if (curchr < 0) {
1032: switch (curchr = regparse[0]) {
1033: case '.':
1034: /* case '+':*/
1035: /* case '=':*/
1036: case '[':
1037: case '~':
1038: if (reg_magic)
1039: curchr = Magic(curchr);
1040: break;
1041: case '*':
1042: /* * is not magic as the very first character, eg "?*ptr" */
1043: if (reg_magic && !at_start)
1044: curchr = Magic('*');
1045: break;
1046: case '^':
1047: /* ^ is only magic as the very first character */
1048: if (at_start)
1049: curchr = Magic('^');
1050: break;
1051: case '$':
1052: /* $ is only magic as the very last character and in front of '\|' */
1053: if (regparse[1] == NUL || (regparse[1] == '\\' && regparse[2] == '|'))
1054: curchr = Magic('$');
1055: break;
1056: case '\\':
1057: regparse++;
1058: if (regparse[0] == NUL)
1059: curchr = '\\'; /* trailing '\' */
1060: else if (vim_strchr(META, regparse[0]))
1061: {
1062: /*
1063: * META contains everything that may be magic sometimes, except
1064: * ^ and $ ("\^" and "\$" are never magic).
1065: * We now fetch the next character and toggle its magicness.
1066: * Therefore, \ is so meta-magic that it is not in META.
1067: */
1068: curchr = -1;
1069: at_start = FALSE; /* We still want to be able to say "/\*ptr" */
1070: peekchr();
1071: curchr ^= Magic(0);
1072: }
1073: else if (vim_strchr(REGEXP_ABBR, regparse[0]))
1074: {
1075: /*
1076: * Handle abbreviations, like "\t" for TAB -- webb
1077: */
1078: switch (regparse[0])
1079: {
1080: case 'r': curchr = CR; break;
1081: case 't': curchr = TAB; break;
1082: case 'e': curchr = ESC; break;
1083: case 'b': curchr = Ctrl('H'); break;
1084: }
1085: }
1086: else
1087: {
1088: /*
1089: * Next character can never be (made) magic?
1090: * Then backslashing it won't do anything.
1091: */
1092: curchr = regparse[0];
1093: }
1094: break;
1095: }
1096: }
1097:
1098: return curchr;
1099: }
1100:
1101: static void
1102: skipchr()
1103: {
1104: regparse++;
1105: at_start = FALSE;
1106: prevchr = curchr;
1107: curchr = nextchr; /* use previously unget char, or -1 */
1108: nextchr = -1;
1109: }
1110:
1111: static int
1112: getchr()
1113: {
1114: int chr;
1115:
1116: chr = peekchr();
1117: skipchr();
1118:
1119: return chr;
1120: }
1121:
1122: /*
1123: * put character back. Works only once!
1124: */
1125: static void
1126: ungetchr()
1127: {
1128: nextchr = curchr;
1129: curchr = prevchr;
1130: /*
1131: * Backup regparse as well; not because we will use what it points at,
1132: * but because skipchr() will bump it again.
1133: */
1134: regparse--;
1135: }
1136:
1137: /*
1138: * vim_regexec and friends
1139: */
1140:
1141: /*
1142: * Global work variables for vim_regexec().
1143: */
1144: static char_u *reginput; /* String-input pointer. */
1145: static char_u *regbol; /* Beginning of input, for ^ check. */
1146: static char_u **regstartp; /* Pointer to startp array. */
1147: /* static char_u **regendp; */ /* Ditto for endp. */
1148:
1149: /*
1150: * Forwards.
1151: */
1152: static int regtry __ARGS((regexp *, char_u *));
1153: static int regmatch __ARGS((char_u *));
1154: static int regrepeat __ARGS((char_u *));
1155:
1156: #ifdef DEBUG
1157: int regnarrate = 1;
1158: void regdump __ARGS((regexp *));
1159: static char_u *regprop __ARGS((char_u *));
1160: #endif
1161:
1162: /*
1163: * vim_regexec - match a regexp against a string
1164: * Return non-zero if there is a match.
1165: */
1166: int
1167: vim_regexec(prog, string, at_bol)
1168: register regexp *prog;
1169: register char_u *string;
1170: int at_bol;
1171: {
1172: register char_u *s;
1173:
1174: /* Be paranoid... */
1175: if (prog == NULL || string == NULL) {
1176: emsg(e_null);
1177: rc_did_emsg = TRUE;
1178: return 0;
1179: }
1180: /* Check validity of program. */
1181: if (UCHARAT(prog->program) != MAGIC) {
1182: emsg(e_re_corr);
1183: rc_did_emsg = TRUE;
1184: return 0;
1185: }
1186: /* If there is a "must appear" string, look for it. */
1187: if (prog->regmust != NULL) {
1188: s = string;
1189: while ((s = cstrchr(s, prog->regmust[0])) != NULL) {
1190: if (cstrncmp(s, prog->regmust, prog->regmlen) == 0)
1191: break; /* Found it. */
1192: s++;
1193: }
1194: if (s == NULL) /* Not present. */
1195: return 0;
1196: }
1197: /* Mark beginning of line for ^ . */
1198: if (at_bol)
1199: regbol = string; /* is possible to match bol */
1200: else
1201: regbol = NULL; /* we aren't there, so don't match it */
1202:
1203: /* Simplest case: anchored match need be tried only once. */
1204: if (prog->reganch)
1205: return regtry(prog, string);
1206:
1207: /* Messy cases: unanchored match. */
1208: s = string;
1209: if (prog->regstart != '\0')
1210: /* We know what char it must start with. */
1211: while ((s = cstrchr(s, prog->regstart)) != NULL) {
1212: if (regtry(prog, s))
1213: return 1;
1214: s++;
1215: }
1216: else
1217: /* We don't -- general case. */
1218: do {
1219: if (regtry(prog, s))
1220: return 1;
1221: } while (*s++ != '\0');
1222:
1223: /* Failure. */
1224: return 0;
1225: }
1226:
1227: /*
1228: - regtry - try match at specific point
1229: */
1230: static int /* 0 failure, 1 success */
1231: regtry(prog, string)
1232: regexp *prog;
1233: char_u *string;
1234: {
1235: register int i;
1236: register char_u **sp;
1237: register char_u **ep;
1238:
1239: reginput = string;
1240: regstartp = prog->startp;
1241: regendp = prog->endp;
1242:
1243: sp = prog->startp;
1244: ep = prog->endp;
1245: for (i = NSUBEXP; i > 0; i--) {
1246: *sp++ = NULL;
1247: *ep++ = NULL;
1248: }
1249: if (regmatch(prog->program + 1)) {
1250: prog->startp[0] = string;
1251: prog->endp[0] = reginput;
1252: return 1;
1253: } else
1254: return 0;
1255: }
1256:
1257: /*
1258: - regmatch - main matching routine
1259: *
1260: * Conceptually the strategy is simple: check to see whether the current
1261: * node matches, call self recursively to see whether the rest matches,
1262: * and then act accordingly. In practice we make some effort to avoid
1263: * recursion, in particular by going through "ordinary" nodes (that don't
1264: * need to know whether the rest of the match failed) by a loop instead of
1265: * by recursion.
1266: */
1267: static int /* 0 failure, 1 success */
1268: regmatch(prog)
1269: char_u *prog;
1270: {
1271: register char_u *scan; /* Current node. */
1272: char_u *next; /* Next node. */
1273:
1274: scan = prog;
1275: #ifdef DEBUG
1276: if (scan != NULL && regnarrate)
1277: fprintf(stderr, "%s(\n", regprop(scan));
1278: #endif
1279: while (scan != NULL) {
1280: #ifdef DEBUG
1281: if (regnarrate) {
1282: fprintf(stderr, "%s...\n", regprop(scan));
1283: }
1284: #endif
1285: next = regnext(scan);
1286: switch (OP(scan)) {
1287: case BOL:
1288: if (reginput != regbol)
1289: return 0;
1290: break;
1291: case EOL:
1292: if (*reginput != '\0')
1293: return 0;
1294: break;
1295: case BOW: /* \<word; reginput points to w */
1296: if (reginput != regbol && iswordchar(reginput[-1]))
1297: return 0;
1298: if (!reginput[0] || !iswordchar(reginput[0]))
1299: return 0;
1300: break;
1301: case EOW: /* word\>; reginput points after d */
1302: if (reginput == regbol || !iswordchar(reginput[-1]))
1303: return 0;
1304: if (reginput[0] && iswordchar(reginput[0]))
1305: return 0;
1306: break;
1307: case ANY:
1308: if (*reginput == '\0')
1309: return 0;
1310: reginput++;
1311: break;
1312: case IDENT:
1313: if (!isidchar(*reginput))
1314: return 0;
1315: reginput++;
1316: break;
1317: case WORD:
1318: if (!iswordchar(*reginput))
1319: return 0;
1320: reginput++;
1321: break;
1322: case FNAME:
1323: if (!isfilechar(*reginput))
1324: return 0;
1325: reginput++;
1326: break;
1327: case PRINT:
1328: if (charsize(*reginput) != 1)
1329: return 0;
1330: reginput++;
1331: break;
1332: case SIDENT:
1333: if (isdigit(*reginput) || !isidchar(*reginput))
1334: return 0;
1335: reginput++;
1336: break;
1337: case SWORD:
1338: if (isdigit(*reginput) || !iswordchar(*reginput))
1339: return 0;
1340: reginput++;
1341: break;
1342: case SFNAME:
1343: if (isdigit(*reginput) || !isfilechar(*reginput))
1344: return 0;
1345: reginput++;
1346: break;
1347: case SPRINT:
1348: if (isdigit(*reginput) || charsize(*reginput) != 1)
1349: return 0;
1350: reginput++;
1351: break;
1352: case EXACTLY:{
1353: register int len;
1354: register char_u *opnd;
1355:
1356: opnd = OPERAND(scan);
1357: /* Inline the first character, for speed. */
1358: if (*opnd != *reginput && (!reg_ic || TO_UPPER(*opnd) != TO_UPPER(*reginput)))
1359: return 0;
1360: len = STRLEN(opnd);
1361: if (len > 1 && cstrncmp(opnd, reginput, len) != 0)
1362: return 0;
1363: reginput += len;
1364: }
1365: break;
1366: case ANYOF:
1367: if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) == NULL)
1368: return 0;
1369: reginput++;
1370: break;
1371: case ANYBUT:
1372: if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) != NULL)
1373: return 0;
1374: reginput++;
1375: break;
1376: case NOTHING:
1377: break;
1378: case BACK:
1379: break;
1380: case MOPEN + 1:
1381: case MOPEN + 2:
1382: case MOPEN + 3:
1383: case MOPEN + 4:
1384: case MOPEN + 5:
1385: case MOPEN + 6:
1386: case MOPEN + 7:
1387: case MOPEN + 8:
1388: case MOPEN + 9:{
1389: register int no;
1390: register char_u *save;
1391:
1392: no = OP(scan) - MOPEN;
1393: save = regstartp[no];
1394: regstartp[no] = reginput; /* Tentatively */
1395: #ifdef DEBUG
1396: printf("MOPEN %d pre @'%s' ('%s' )'%s'\n",
1397: no, save,
1398: regstartp[no] ? regstartp[no] : "NULL",
1399: regendp[no] ? regendp[no] : "NULL");
1400: #endif
1401:
1402: if (regmatch(next)) {
1403: #ifdef DEBUG
1404: printf("MOPEN %d post @'%s' ('%s' )'%s'\n",
1405: no, save,
1406: regstartp[no] ? regstartp[no] : "NULL",
1407: regendp[no] ? regendp[no] : "NULL");
1408: #endif
1409: return 1;
1410: }
1411: regstartp[no] = save; /* We were wrong... */
1412: return 0;
1413: }
1414: /* break; Not Reached */
1415: case MCLOSE + 1:
1416: case MCLOSE + 2:
1417: case MCLOSE + 3:
1418: case MCLOSE + 4:
1419: case MCLOSE + 5:
1420: case MCLOSE + 6:
1421: case MCLOSE + 7:
1422: case MCLOSE + 8:
1423: case MCLOSE + 9:{
1424: register int no;
1425: register char_u *save;
1426:
1427: no = OP(scan) - MCLOSE;
1428: save = regendp[no];
1429: regendp[no] = reginput; /* Tentatively */
1430: #ifdef DEBUG
1431: printf("MCLOSE %d pre @'%s' ('%s' )'%s'\n",
1432: no, save,
1433: regstartp[no] ? regstartp[no] : "NULL",
1434: regendp[no] ? regendp[no] : "NULL");
1435: #endif
1436:
1437: if (regmatch(next)) {
1438: #ifdef DEBUG
1439: printf("MCLOSE %d post @'%s' ('%s' )'%s'\n",
1440: no, save,
1441: regstartp[no] ? regstartp[no] : "NULL",
1442: regendp[no] ? regendp[no] : "NULL");
1443: #endif
1444: return 1;
1445: }
1446: regendp[no] = save; /* We were wrong... */
1447: return 0;
1448: }
1449: /* break; Not Reached */
1450: #ifdef BACKREF
1451: case BACKREF + 1:
1452: case BACKREF + 2:
1453: case BACKREF + 3:
1454: case BACKREF + 4:
1455: case BACKREF + 5:
1456: case BACKREF + 6:
1457: case BACKREF + 7:
1458: case BACKREF + 8:
1459: case BACKREF + 9:{
1460: register int no;
1461: int len;
1462:
1463: no = OP(scan) - BACKREF;
1464: if (regendp[no] != NULL) {
1465: len = (int)(regendp[no] - regstartp[no]);
1466: if (cstrncmp(regstartp[no], reginput, len) != 0)
1467: return 0;
1468: reginput += len;
1469: } else {
1470: /*emsg("backref to 0-repeat");*/
1471: /*return 0;*/
1472: }
1473: }
1474: break;
1475: #endif
1476: case BRANCH:{
1477: register char_u *save;
1478:
1479: if (OP(next) != BRANCH) /* No choice. */
1480: next = OPERAND(scan); /* Avoid recursion. */
1481: else {
1482: do {
1483: save = reginput;
1484: if (regmatch(OPERAND(scan)))
1485: return 1;
1486: reginput = save;
1487: scan = regnext(scan);
1488: } while (scan != NULL && OP(scan) == BRANCH);
1489: return 0;
1490: /* NOTREACHED */
1491: }
1492: }
1493: break;
1494: case STAR:
1495: case PLUS:{
1496: register int nextch;
1497: register int no;
1498: register char_u *save;
1499: register int min;
1500:
1501: /*
1502: * Lookahead to avoid useless match attempts when we know
1503: * what character comes next.
1504: */
1505: nextch = '\0';
1506: if (OP(next) == EXACTLY)
1507: {
1508: nextch = *OPERAND(next);
1509: if (reg_ic)
1510: nextch = TO_UPPER(nextch);
1511: }
1512: min = (OP(scan) == STAR) ? 0 : 1;
1513: save = reginput;
1514: no = regrepeat(OPERAND(scan));
1515: while (no >= min)
1516: {
1517: /* If it could work, try it. */
1518: if (nextch == '\0' || (*reginput == nextch ||
1519: (reg_ic && TO_UPPER(*reginput) == nextch)))
1520: if (regmatch(next))
1521: return 1;
1522: /* Couldn't or didn't -- back up. */
1523: no--;
1524: reginput = save + no;
1525: }
1526: return 0;
1527: }
1528: /* break; Not Reached */
1529: case END:
1530: return 1; /* Success! */
1531: /* break; Not Reached */
1532: default:
1533: emsg(e_re_corr);
1534: return 0;
1535: /* break; Not Reached */
1536: }
1537:
1538: scan = next;
1539: }
1540:
1541: /*
1542: * We get here only if there's trouble -- normally "case END" is the
1543: * terminating point.
1544: */
1545: emsg(e_re_corr);
1546: return 0;
1547: }
1548:
1549: /*
1550: - regrepeat - repeatedly match something simple, report how many
1551: */
1552: static int
1553: regrepeat(p)
1554: char_u *p;
1555: {
1556: register int count = 0;
1557: register char_u *scan;
1558: register char_u *opnd;
1559:
1560: scan = reginput;
1561: opnd = OPERAND(p);
1562: switch (OP(p)) {
1563: case ANY:
1564: count = STRLEN(scan);
1565: scan += count;
1566: break;
1567: case IDENT:
1568: for (count = 0; isidchar(*scan); ++count)
1569: ++scan;
1570: break;
1571: case WORD:
1572: for (count = 0; iswordchar(*scan); ++count)
1573: ++scan;
1574: break;
1575: case FNAME:
1576: for (count = 0; isfilechar(*scan); ++count)
1577: ++scan;
1578: break;
1579: case PRINT:
1580: for (count = 0; charsize(*scan) == 1; ++count)
1581: ++scan;
1582: break;
1583: case SIDENT:
1584: for (count = 0; !isdigit(*scan) && isidchar(*scan); ++count)
1585: ++scan;
1586: break;
1587: case SWORD:
1588: for (count = 0; !isdigit(*scan) && iswordchar(*scan); ++count)
1589: ++scan;
1590: break;
1591: case SFNAME:
1592: for (count = 0; !isdigit(*scan) && isfilechar(*scan); ++count)
1593: ++scan;
1594: break;
1595: case SPRINT:
1596: for (count = 0; !isdigit(*scan) && charsize(*scan) == 1; ++count)
1597: ++scan;
1598: break;
1599: case EXACTLY:
1600: while (*opnd == *scan || (reg_ic && TO_UPPER(*opnd) == TO_UPPER(*scan)))
1601: {
1602: count++;
1603: scan++;
1604: }
1605: break;
1606: case ANYOF:
1607: while (*scan != '\0' && cstrchr(opnd, *scan) != NULL)
1608: {
1609: count++;
1610: scan++;
1611: }
1612: break;
1613: case ANYBUT:
1614: while (*scan != '\0' && cstrchr(opnd, *scan) == NULL) {
1615: count++;
1616: scan++;
1617: }
1618: break;
1619: default: /* Oh dear. Called inappropriately. */
1620: emsg(e_re_corr);
1621: count = 0; /* Best compromise. */
1622: break;
1623: }
1624: reginput = scan;
1625:
1626: return count;
1627: }
1628:
1629: /*
1630: - regnext - dig the "next" pointer out of a node
1631: */
1632: static char_u *
1633: regnext(p)
1634: register char_u *p;
1635: {
1636: register int offset;
1637:
1638: if (p == ®dummy)
1639: return NULL;
1640:
1641: offset = NEXT(p);
1642: if (offset == 0)
1643: return NULL;
1644:
1645: if (OP(p) == BACK)
1646: return p - offset;
1647: else
1648: return p + offset;
1649: }
1650:
1651: #ifdef DEBUG
1652:
1653: /*
1654: - regdump - dump a regexp onto stdout in vaguely comprehensible form
1655: */
1656: void
1657: regdump(r)
1658: regexp *r;
1659: {
1660: register char_u *s;
1661: register int op = EXACTLY; /* Arbitrary non-END op. */
1662: register char_u *next;
1663:
1664:
1665: s = r->program + 1;
1666: while (op != END) { /* While that wasn't END last time... */
1667: op = OP(s);
1668: printf("%2d%s", s - r->program, regprop(s)); /* Where, what. */
1669: next = regnext(s);
1670: if (next == NULL) /* Next ptr. */
1671: printf("(0)");
1672: else
1673: printf("(%d)", (s - r->program) + (next - s));
1674: s += 3;
1675: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1676: /* Literal string, where present. */
1677: while (*s != '\0') {
1678: putchar(*s);
1679: s++;
1680: }
1681: s++;
1682: }
1683: putchar('\n');
1684: }
1685:
1686: /* Header fields of interest. */
1687: if (r->regstart != '\0')
1688: printf("start `%c' ", r->regstart);
1689: if (r->reganch)
1690: printf("anchored ");
1691: if (r->regmust != NULL)
1692: printf("must have \"%s\"", r->regmust);
1693: printf("\n");
1694: }
1695:
1696: /*
1697: - regprop - printable representation of opcode
1698: */
1699: static char_u *
1700: regprop(op)
1701: char_u *op;
1702: {
1703: register char_u *p;
1704: static char_u buf[50];
1705:
1706: (void) strcpy(buf, ":");
1707:
1708: switch (OP(op)) {
1709: case BOL:
1710: p = "BOL";
1711: break;
1712: case EOL:
1713: p = "EOL";
1714: break;
1715: case ANY:
1716: p = "ANY";
1717: break;
1718: case IDENT:
1719: p = "IDENT";
1720: break;
1721: case WORD:
1722: p = "WORD";
1723: break;
1724: case FNAME:
1725: p = "FNAME";
1726: break;
1727: case PRINT:
1728: p = "PRINT";
1729: break;
1730: case SIDENT:
1731: p = "SIDENT";
1732: break;
1733: case SWORD:
1734: p = "SWORD";
1735: break;
1736: case SFNAME:
1737: p = "SFNAME";
1738: break;
1739: case SPRINT:
1740: p = "SPRINT";
1741: break;
1742: case ANYOF:
1743: p = "ANYOF";
1744: break;
1745: case ANYBUT:
1746: p = "ANYBUT";
1747: break;
1748: case BRANCH:
1749: p = "BRANCH";
1750: break;
1751: case EXACTLY:
1752: p = "EXACTLY";
1753: break;
1754: case NOTHING:
1755: p = "NOTHING";
1756: break;
1757: case BACK:
1758: p = "BACK";
1759: break;
1760: case END:
1761: p = "END";
1762: break;
1763: case MOPEN + 1:
1764: case MOPEN + 2:
1765: case MOPEN + 3:
1766: case MOPEN + 4:
1767: case MOPEN + 5:
1768: case MOPEN + 6:
1769: case MOPEN + 7:
1770: case MOPEN + 8:
1771: case MOPEN + 9:
1772: sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
1773: p = NULL;
1774: break;
1775: case MCLOSE + 1:
1776: case MCLOSE + 2:
1777: case MCLOSE + 3:
1778: case MCLOSE + 4:
1779: case MCLOSE + 5:
1780: case MCLOSE + 6:
1781: case MCLOSE + 7:
1782: case MCLOSE + 8:
1783: case MCLOSE + 9:
1784: sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
1785: p = NULL;
1786: break;
1787: case BACKREF + 1:
1788: case BACKREF + 2:
1789: case BACKREF + 3:
1790: case BACKREF + 4:
1791: case BACKREF + 5:
1792: case BACKREF + 6:
1793: case BACKREF + 7:
1794: case BACKREF + 8:
1795: case BACKREF + 9:
1796: sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
1797: p = NULL;
1798: break;
1799: case STAR:
1800: p = "STAR";
1801: break;
1802: case PLUS:
1803: p = "PLUS";
1804: break;
1805: default:
1806: sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
1807: p = NULL;
1808: break;
1809: }
1810: if (p != NULL)
1811: (void) strcat(buf, p);
1812: return buf;
1813: }
1814: #endif
1815:
1816: /*
1817: * The following is provided for those people who do not have strcspn() in
1818: * their C libraries. They should get off their butts and do something
1819: * about it; at least one public-domain implementation of those (highly
1820: * useful) string routines has been published on Usenet.
1821: */
1822: #ifndef HAVE_STRCSPN
1823: /*
1824: * strcspn - find length of initial segment of s1 consisting entirely
1825: * of characters not from s2
1826: */
1827:
1828: static size_t
1829: strcspn(s1, s2)
1830: const char_u *s1;
1831: const char_u *s2;
1832: {
1833: register char_u *scan1;
1834: register char_u *scan2;
1835: register int count;
1836:
1837: count = 0;
1838: for (scan1 = s1; *scan1 != '\0'; scan1++) {
1839: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1840: if (*scan1 == *scan2++)
1841: return count;
1842: count++;
1843: }
1844: return count;
1845: }
1846: #endif
1847:
1848: /*
1849: * Compare two strings, ignore case if reg_ic set.
1850: * Return 0 if strings match, non-zero otherwise.
1851: */
1852: int
1853: cstrncmp(s1, s2, n)
1854: char_u *s1, *s2;
1855: int n;
1856: {
1857: if (!reg_ic)
1858: return STRNCMP(s1, s2, (size_t)n);
1859:
1860: return vim_strnicmp(s1, s2, (size_t)n);
1861: }
1862:
1863: /*
1864: * cstrchr: This function is used a lot for simple searches, keep it fast!
1865: */
1866: static char_u *
1867: cstrchr(s, c)
1868: char_u *s;
1869: register int c;
1870: {
1871: register char_u *p;
1872:
1873: if (!reg_ic)
1874: return vim_strchr(s, c);
1875:
1876: c = TO_UPPER(c);
1877:
1878: for (p = s; *p; p++)
1879: {
1880: if (TO_UPPER(*p) == c)
1881: return p;
1882: }
1883: return NULL;
1884: }