Annotation of src/usr.bin/lex/nfa.c, Revision 1.1.1.1
1.1 deraadt 1: /* nfa - NFA construction routines */
2:
3: /*-
4: * Copyright (c) 1990 The Regents of the University of California.
5: * All rights reserved.
6: *
7: * This code is derived from software contributed to Berkeley by
8: * Vern Paxson.
9: *
10: * The United States Government has rights in this work pursuant
11: * to contract no. DE-AC03-76SF00098 between the United States
12: * Department of Energy and the University of California.
13: *
14: * Redistribution and use in source and binary forms are permitted provided
15: * that: (1) source distributions retain this entire copyright notice and
16: * comment, and (2) distributions including binaries display the following
17: * acknowledgement: ``This product includes software developed by the
18: * University of California, Berkeley and its contributors'' in the
19: * documentation or other materials provided with the distribution and in
20: * all advertising materials mentioning features or use of this software.
21: * Neither the name of the University nor the names of its contributors may
22: * be used to endorse or promote products derived from this software without
23: * specific prior written permission.
24: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
25: * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
26: * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27: */
28:
29: /* $Header: /a/cvsroot/src/usr.bin/lex/nfa.c,v 1.8 1995/05/05 05:35:38 jtc Exp $ */
30:
31: #include "flexdef.h"
32:
33:
34: /* declare functions that have forward references */
35:
36: int dupmachine PROTO((int));
37: void mkxtion PROTO((int, int));
38:
39:
40: /* add_accept - add an accepting state to a machine
41: *
42: * accepting_number becomes mach's accepting number.
43: */
44:
45: void add_accept( mach, accepting_number )
46: int mach, accepting_number;
47: {
48: /* Hang the accepting number off an epsilon state. if it is associated
49: * with a state that has a non-epsilon out-transition, then the state
50: * will accept BEFORE it makes that transition, i.e., one character
51: * too soon.
52: */
53:
54: if ( transchar[finalst[mach]] == SYM_EPSILON )
55: accptnum[finalst[mach]] = accepting_number;
56:
57: else
58: {
59: int astate = mkstate( SYM_EPSILON );
60: accptnum[astate] = accepting_number;
61: (void) link_machines( mach, astate );
62: }
63: }
64:
65:
66: /* copysingl - make a given number of copies of a singleton machine
67: *
68: * synopsis
69: *
70: * newsng = copysingl( singl, num );
71: *
72: * newsng - a new singleton composed of num copies of singl
73: * singl - a singleton machine
74: * num - the number of copies of singl to be present in newsng
75: */
76:
77: int copysingl( singl, num )
78: int singl, num;
79: {
80: int copy, i;
81:
82: copy = mkstate( SYM_EPSILON );
83:
84: for ( i = 1; i <= num; ++i )
85: copy = link_machines( copy, dupmachine( singl ) );
86:
87: return copy;
88: }
89:
90:
91: /* dumpnfa - debugging routine to write out an nfa */
92:
93: void dumpnfa( state1 )
94: int state1;
95:
96: {
97: int sym, tsp1, tsp2, anum, ns;
98:
99: fprintf( stderr,
100: _( "\n\n********** beginning dump of nfa with start state %d\n" ),
101: state1 );
102:
103: /* We probably should loop starting at firstst[state1] and going to
104: * lastst[state1], but they're not maintained properly when we "or"
105: * all of the rules together. So we use our knowledge that the machine
106: * starts at state 1 and ends at lastnfa.
107: */
108:
109: /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
110: for ( ns = 1; ns <= lastnfa; ++ns )
111: {
112: fprintf( stderr, _( "state # %4d\t" ), ns );
113:
114: sym = transchar[ns];
115: tsp1 = trans1[ns];
116: tsp2 = trans2[ns];
117: anum = accptnum[ns];
118:
119: fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 );
120:
121: if ( anum != NIL )
122: fprintf( stderr, " [%d]", anum );
123:
124: fprintf( stderr, "\n" );
125: }
126:
127: fprintf( stderr, _( "********** end of dump\n" ) );
128: }
129:
130:
131: /* dupmachine - make a duplicate of a given machine
132: *
133: * synopsis
134: *
135: * copy = dupmachine( mach );
136: *
137: * copy - holds duplicate of mach
138: * mach - machine to be duplicated
139: *
140: * note that the copy of mach is NOT an exact duplicate; rather, all the
141: * transition states values are adjusted so that the copy is self-contained,
142: * as the original should have been.
143: *
144: * also note that the original MUST be contiguous, with its low and high
145: * states accessible by the arrays firstst and lastst
146: */
147:
148: int dupmachine( mach )
149: int mach;
150: {
151: int i, init, state_offset;
152: int state = 0;
153: int last = lastst[mach];
154:
155: for ( i = firstst[mach]; i <= last; ++i )
156: {
157: state = mkstate( transchar[i] );
158:
159: if ( trans1[i] != NO_TRANSITION )
160: {
161: mkxtion( finalst[state], trans1[i] + state - i );
162:
163: if ( transchar[i] == SYM_EPSILON &&
164: trans2[i] != NO_TRANSITION )
165: mkxtion( finalst[state],
166: trans2[i] + state - i );
167: }
168:
169: accptnum[state] = accptnum[i];
170: }
171:
172: if ( state == 0 )
173: flexfatal( _( "empty machine in dupmachine()" ) );
174:
175: state_offset = state - i + 1;
176:
177: init = mach + state_offset;
178: firstst[init] = firstst[mach] + state_offset;
179: finalst[init] = finalst[mach] + state_offset;
180: lastst[init] = lastst[mach] + state_offset;
181:
182: return init;
183: }
184:
185:
186: /* finish_rule - finish up the processing for a rule
187: *
188: * An accepting number is added to the given machine. If variable_trail_rule
189: * is true then the rule has trailing context and both the head and trail
190: * are variable size. Otherwise if headcnt or trailcnt is non-zero then
191: * the machine recognizes a pattern with trailing context and headcnt is
192: * the number of characters in the matched part of the pattern, or zero
193: * if the matched part has variable length. trailcnt is the number of
194: * trailing context characters in the pattern, or zero if the trailing
195: * context has variable length.
196: */
197:
198: void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
199: int mach, variable_trail_rule, headcnt, trailcnt;
200: {
201: char action_text[MAXLINE];
202:
203: add_accept( mach, num_rules );
204:
205: /* We did this in new_rule(), but it often gets the wrong
206: * number because we do it before we start parsing the current rule.
207: */
208: rule_linenum[num_rules] = linenum;
209:
210: /* If this is a continued action, then the line-number has already
211: * been updated, giving us the wrong number.
212: */
213: if ( continued_action )
214: --rule_linenum[num_rules];
215:
216: sprintf( action_text, "case %d:\n", num_rules );
217: add_action( action_text );
218:
219: if ( variable_trail_rule )
220: {
221: rule_type[num_rules] = RULE_VARIABLE;
222:
223: if ( performance_report > 0 )
224: fprintf( stderr,
225: _( "Variable trailing context rule at line %d\n" ),
226: rule_linenum[num_rules] );
227:
228: variable_trailing_context_rules = true;
229: }
230:
231: else
232: {
233: rule_type[num_rules] = RULE_NORMAL;
234:
235: if ( headcnt > 0 || trailcnt > 0 )
236: {
237: /* Do trailing context magic to not match the trailing
238: * characters.
239: */
240: char *scanner_cp = "yy_c_buf_p = yy_cp";
241: char *scanner_bp = "yy_bp";
242:
243: add_action(
244: "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
245:
246: if ( headcnt > 0 )
247: {
248: sprintf( action_text, "%s = %s + %d;\n",
249: scanner_cp, scanner_bp, headcnt );
250: add_action( action_text );
251: }
252:
253: else
254: {
255: sprintf( action_text, "%s -= %d;\n",
256: scanner_cp, trailcnt );
257: add_action( action_text );
258: }
259:
260: add_action(
261: "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
262: }
263: }
264:
265: /* Okay, in the action code at this point yytext and yyleng have
266: * their proper final values for this rule, so here's the point
267: * to do any user action. But don't do it for continued actions,
268: * as that'll result in multiple YY_RULE_SETUP's.
269: */
270: if ( ! continued_action )
271: add_action( "YY_RULE_SETUP\n" );
272:
273: line_directive_out( (FILE *) 0, 1 );
274: }
275:
276:
277: /* link_machines - connect two machines together
278: *
279: * synopsis
280: *
281: * new = link_machines( first, last );
282: *
283: * new - a machine constructed by connecting first to last
284: * first - the machine whose successor is to be last
285: * last - the machine whose predecessor is to be first
286: *
287: * note: this routine concatenates the machine first with the machine
288: * last to produce a machine new which will pattern-match first first
289: * and then last, and will fail if either of the sub-patterns fails.
290: * FIRST is set to new by the operation. last is unmolested.
291: */
292:
293: int link_machines( first, last )
294: int first, last;
295: {
296: if ( first == NIL )
297: return last;
298:
299: else if ( last == NIL )
300: return first;
301:
302: else
303: {
304: mkxtion( finalst[first], last );
305: finalst[first] = finalst[last];
306: lastst[first] = MAX( lastst[first], lastst[last] );
307: firstst[first] = MIN( firstst[first], firstst[last] );
308:
309: return first;
310: }
311: }
312:
313:
314: /* mark_beginning_as_normal - mark each "beginning" state in a machine
315: * as being a "normal" (i.e., not trailing context-
316: * associated) states
317: *
318: * The "beginning" states are the epsilon closure of the first state
319: */
320:
321: void mark_beginning_as_normal( mach )
322: register int mach;
323: {
324: switch ( state_type[mach] )
325: {
326: case STATE_NORMAL:
327: /* Oh, we've already visited here. */
328: return;
329:
330: case STATE_TRAILING_CONTEXT:
331: state_type[mach] = STATE_NORMAL;
332:
333: if ( transchar[mach] == SYM_EPSILON )
334: {
335: if ( trans1[mach] != NO_TRANSITION )
336: mark_beginning_as_normal(
337: trans1[mach] );
338:
339: if ( trans2[mach] != NO_TRANSITION )
340: mark_beginning_as_normal(
341: trans2[mach] );
342: }
343: break;
344:
345: default:
346: flexerror(
347: _( "bad state type in mark_beginning_as_normal()" ) );
348: break;
349: }
350: }
351:
352:
353: /* mkbranch - make a machine that branches to two machines
354: *
355: * synopsis
356: *
357: * branch = mkbranch( first, second );
358: *
359: * branch - a machine which matches either first's pattern or second's
360: * first, second - machines whose patterns are to be or'ed (the | operator)
361: *
362: * Note that first and second are NEITHER destroyed by the operation. Also,
363: * the resulting machine CANNOT be used with any other "mk" operation except
364: * more mkbranch's. Compare with mkor()
365: */
366:
367: int mkbranch( first, second )
368: int first, second;
369: {
370: int eps;
371:
372: if ( first == NO_TRANSITION )
373: return second;
374:
375: else if ( second == NO_TRANSITION )
376: return first;
377:
378: eps = mkstate( SYM_EPSILON );
379:
380: mkxtion( eps, first );
381: mkxtion( eps, second );
382:
383: return eps;
384: }
385:
386:
387: /* mkclos - convert a machine into a closure
388: *
389: * synopsis
390: * new = mkclos( state );
391: *
392: * new - a new state which matches the closure of "state"
393: */
394:
395: int mkclos( state )
396: int state;
397: {
398: return mkopt( mkposcl( state ) );
399: }
400:
401:
402: /* mkopt - make a machine optional
403: *
404: * synopsis
405: *
406: * new = mkopt( mach );
407: *
408: * new - a machine which optionally matches whatever mach matched
409: * mach - the machine to make optional
410: *
411: * notes:
412: * 1. mach must be the last machine created
413: * 2. mach is destroyed by the call
414: */
415:
416: int mkopt( mach )
417: int mach;
418: {
419: int eps;
420:
421: if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
422: {
423: eps = mkstate( SYM_EPSILON );
424: mach = link_machines( mach, eps );
425: }
426:
427: /* Can't skimp on the following if FREE_EPSILON(mach) is true because
428: * some state interior to "mach" might point back to the beginning
429: * for a closure.
430: */
431: eps = mkstate( SYM_EPSILON );
432: mach = link_machines( eps, mach );
433:
434: mkxtion( mach, finalst[mach] );
435:
436: return mach;
437: }
438:
439:
440: /* mkor - make a machine that matches either one of two machines
441: *
442: * synopsis
443: *
444: * new = mkor( first, second );
445: *
446: * new - a machine which matches either first's pattern or second's
447: * first, second - machines whose patterns are to be or'ed (the | operator)
448: *
449: * note that first and second are both destroyed by the operation
450: * the code is rather convoluted because an attempt is made to minimize
451: * the number of epsilon states needed
452: */
453:
454: int mkor( first, second )
455: int first, second;
456: {
457: int eps, orend;
458:
459: if ( first == NIL )
460: return second;
461:
462: else if ( second == NIL )
463: return first;
464:
465: else
466: {
467: /* See comment in mkopt() about why we can't use the first
468: * state of "first" or "second" if they satisfy "FREE_EPSILON".
469: */
470: eps = mkstate( SYM_EPSILON );
471:
472: first = link_machines( eps, first );
473:
474: mkxtion( first, second );
475:
476: if ( SUPER_FREE_EPSILON(finalst[first]) &&
477: accptnum[finalst[first]] == NIL )
478: {
479: orend = finalst[first];
480: mkxtion( finalst[second], orend );
481: }
482:
483: else if ( SUPER_FREE_EPSILON(finalst[second]) &&
484: accptnum[finalst[second]] == NIL )
485: {
486: orend = finalst[second];
487: mkxtion( finalst[first], orend );
488: }
489:
490: else
491: {
492: eps = mkstate( SYM_EPSILON );
493:
494: first = link_machines( first, eps );
495: orend = finalst[first];
496:
497: mkxtion( finalst[second], orend );
498: }
499: }
500:
501: finalst[first] = orend;
502: return first;
503: }
504:
505:
506: /* mkposcl - convert a machine into a positive closure
507: *
508: * synopsis
509: * new = mkposcl( state );
510: *
511: * new - a machine matching the positive closure of "state"
512: */
513:
514: int mkposcl( state )
515: int state;
516: {
517: int eps;
518:
519: if ( SUPER_FREE_EPSILON(finalst[state]) )
520: {
521: mkxtion( finalst[state], state );
522: return state;
523: }
524:
525: else
526: {
527: eps = mkstate( SYM_EPSILON );
528: mkxtion( eps, state );
529: return link_machines( state, eps );
530: }
531: }
532:
533:
534: /* mkrep - make a replicated machine
535: *
536: * synopsis
537: * new = mkrep( mach, lb, ub );
538: *
539: * new - a machine that matches whatever "mach" matched from "lb"
540: * number of times to "ub" number of times
541: *
542: * note
543: * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
544: */
545:
546: int mkrep( mach, lb, ub )
547: int mach, lb, ub;
548: {
549: int base_mach, tail, copy, i;
550:
551: base_mach = copysingl( mach, lb - 1 );
552:
553: if ( ub == INFINITY )
554: {
555: copy = dupmachine( mach );
556: mach = link_machines( mach,
557: link_machines( base_mach, mkclos( copy ) ) );
558: }
559:
560: else
561: {
562: tail = mkstate( SYM_EPSILON );
563:
564: for ( i = lb; i < ub; ++i )
565: {
566: copy = dupmachine( mach );
567: tail = mkopt( link_machines( copy, tail ) );
568: }
569:
570: mach = link_machines( mach, link_machines( base_mach, tail ) );
571: }
572:
573: return mach;
574: }
575:
576:
577: /* mkstate - create a state with a transition on a given symbol
578: *
579: * synopsis
580: *
581: * state = mkstate( sym );
582: *
583: * state - a new state matching sym
584: * sym - the symbol the new state is to have an out-transition on
585: *
586: * note that this routine makes new states in ascending order through the
587: * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
588: * relies on machines being made in ascending order and that they are
589: * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
590: * that it admittedly is)
591: */
592:
593: int mkstate( sym )
594: int sym;
595: {
596: if ( ++lastnfa >= current_mns )
597: {
598: if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
599: lerrif(
600: _( "input rules are too complicated (>= %d NFA states)" ),
601: current_mns );
602:
603: ++num_reallocs;
604:
605: firstst = reallocate_integer_array( firstst, current_mns );
606: lastst = reallocate_integer_array( lastst, current_mns );
607: finalst = reallocate_integer_array( finalst, current_mns );
608: transchar = reallocate_integer_array( transchar, current_mns );
609: trans1 = reallocate_integer_array( trans1, current_mns );
610: trans2 = reallocate_integer_array( trans2, current_mns );
611: accptnum = reallocate_integer_array( accptnum, current_mns );
612: assoc_rule =
613: reallocate_integer_array( assoc_rule, current_mns );
614: state_type =
615: reallocate_integer_array( state_type, current_mns );
616: }
617:
618: firstst[lastnfa] = lastnfa;
619: finalst[lastnfa] = lastnfa;
620: lastst[lastnfa] = lastnfa;
621: transchar[lastnfa] = sym;
622: trans1[lastnfa] = NO_TRANSITION;
623: trans2[lastnfa] = NO_TRANSITION;
624: accptnum[lastnfa] = NIL;
625: assoc_rule[lastnfa] = num_rules;
626: state_type[lastnfa] = current_state_type;
627:
628: /* Fix up equivalence classes base on this transition. Note that any
629: * character which has its own transition gets its own equivalence
630: * class. Thus only characters which are only in character classes
631: * have a chance at being in the same equivalence class. E.g. "a|b"
632: * puts 'a' and 'b' into two different equivalence classes. "[ab]"
633: * puts them in the same equivalence class (barring other differences
634: * elsewhere in the input).
635: */
636:
637: if ( sym < 0 )
638: {
639: /* We don't have to update the equivalence classes since
640: * that was already done when the ccl was created for the
641: * first time.
642: */
643: }
644:
645: else if ( sym == SYM_EPSILON )
646: ++numeps;
647:
648: else
649: {
650: check_char( sym );
651:
652: if ( useecs )
653: /* Map NUL's to csize. */
654: mkechar( sym ? sym : csize, nextecm, ecgroup );
655: }
656:
657: return lastnfa;
658: }
659:
660:
661: /* mkxtion - make a transition from one state to another
662: *
663: * synopsis
664: *
665: * mkxtion( statefrom, stateto );
666: *
667: * statefrom - the state from which the transition is to be made
668: * stateto - the state to which the transition is to be made
669: */
670:
671: void mkxtion( statefrom, stateto )
672: int statefrom, stateto;
673: {
674: if ( trans1[statefrom] == NO_TRANSITION )
675: trans1[statefrom] = stateto;
676:
677: else if ( (transchar[statefrom] != SYM_EPSILON) ||
678: (trans2[statefrom] != NO_TRANSITION) )
679: flexfatal( _( "found too many transitions in mkxtion()" ) );
680:
681: else
682: { /* second out-transition for an epsilon state */
683: ++eps2;
684: trans2[statefrom] = stateto;
685: }
686: }
687:
688: /* new_rule - initialize for a new rule */
689:
690: void new_rule()
691: {
692: if ( ++num_rules >= current_max_rules )
693: {
694: ++num_reallocs;
695: current_max_rules += MAX_RULES_INCREMENT;
696: rule_type = reallocate_integer_array( rule_type,
697: current_max_rules );
698: rule_linenum = reallocate_integer_array( rule_linenum,
699: current_max_rules );
700: rule_useful = reallocate_integer_array( rule_useful,
701: current_max_rules );
702: }
703:
704: if ( num_rules > MAX_RULE )
705: lerrif( _( "too many rules (> %d)!" ), MAX_RULE );
706:
707: rule_linenum[num_rules] = linenum;
708: rule_useful[num_rules] = false;
709: }