Annotation of src/usr.bin/lex/nfa.c, Revision 1.5
1.5 ! mpech 1: /* $OpenBSD: nfa.c,v 1.4 2001/06/17 07:30:42 deraadt Exp $ */
1.2 deraadt 2:
1.1 deraadt 3: /* nfa - NFA construction routines */
4:
5: /*-
6: * Copyright (c) 1990 The Regents of the University of California.
7: * All rights reserved.
8: *
9: * This code is derived from software contributed to Berkeley by
10: * Vern Paxson.
11: *
12: * The United States Government has rights in this work pursuant
13: * to contract no. DE-AC03-76SF00098 between the United States
14: * Department of Energy and the University of California.
15: *
1.4 deraadt 16: * Redistribution and use in source and binary forms, with or without
17: * modification, are permitted provided that: (1) source distributions
18: * retain this entire copyright notice and comment, and (2) distributions
19: * including binaries display the following acknowledgement: ``This product
20: * includes software developed by the University of California, Berkeley
21: * and its contributors'' in the documentation or other materials provided
22: * with the distribution and in all advertising materials mentioning
23: * features or use of this software. Neither the name of the University nor
24: * the names of its contributors may be used to endorse or promote products
25: * derived from this software without specific prior written permission.
1.1 deraadt 26: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
27: * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
28: * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
29: */
30:
1.5 ! mpech 31: /* $Header: /cvs/src/usr.bin/lex/nfa.c,v 1.4 2001/06/17 07:30:42 deraadt Exp $ */
1.1 deraadt 32:
33: #include "flexdef.h"
34:
35:
36: /* declare functions that have forward references */
37:
38: int dupmachine PROTO((int));
39: void mkxtion PROTO((int, int));
40:
41:
42: /* add_accept - add an accepting state to a machine
43: *
44: * accepting_number becomes mach's accepting number.
45: */
46:
47: void add_accept( mach, accepting_number )
48: int mach, accepting_number;
49: {
50: /* Hang the accepting number off an epsilon state. if it is associated
51: * with a state that has a non-epsilon out-transition, then the state
52: * will accept BEFORE it makes that transition, i.e., one character
53: * too soon.
54: */
55:
56: if ( transchar[finalst[mach]] == SYM_EPSILON )
57: accptnum[finalst[mach]] = accepting_number;
58:
59: else
60: {
61: int astate = mkstate( SYM_EPSILON );
62: accptnum[astate] = accepting_number;
63: (void) link_machines( mach, astate );
64: }
65: }
66:
67:
68: /* copysingl - make a given number of copies of a singleton machine
69: *
70: * synopsis
71: *
72: * newsng = copysingl( singl, num );
73: *
74: * newsng - a new singleton composed of num copies of singl
75: * singl - a singleton machine
76: * num - the number of copies of singl to be present in newsng
77: */
78:
79: int copysingl( singl, num )
80: int singl, num;
81: {
82: int copy, i;
83:
84: copy = mkstate( SYM_EPSILON );
85:
86: for ( i = 1; i <= num; ++i )
87: copy = link_machines( copy, dupmachine( singl ) );
88:
89: return copy;
90: }
91:
92:
93: /* dumpnfa - debugging routine to write out an nfa */
94:
95: void dumpnfa( state1 )
96: int state1;
97:
98: {
99: int sym, tsp1, tsp2, anum, ns;
100:
101: fprintf( stderr,
102: _( "\n\n********** beginning dump of nfa with start state %d\n" ),
103: state1 );
104:
105: /* We probably should loop starting at firstst[state1] and going to
106: * lastst[state1], but they're not maintained properly when we "or"
107: * all of the rules together. So we use our knowledge that the machine
108: * starts at state 1 and ends at lastnfa.
109: */
110:
111: /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
112: for ( ns = 1; ns <= lastnfa; ++ns )
113: {
114: fprintf( stderr, _( "state # %4d\t" ), ns );
115:
116: sym = transchar[ns];
117: tsp1 = trans1[ns];
118: tsp2 = trans2[ns];
119: anum = accptnum[ns];
120:
121: fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 );
122:
123: if ( anum != NIL )
124: fprintf( stderr, " [%d]", anum );
125:
126: fprintf( stderr, "\n" );
127: }
128:
129: fprintf( stderr, _( "********** end of dump\n" ) );
130: }
131:
132:
133: /* dupmachine - make a duplicate of a given machine
134: *
135: * synopsis
136: *
137: * copy = dupmachine( mach );
138: *
139: * copy - holds duplicate of mach
140: * mach - machine to be duplicated
141: *
142: * note that the copy of mach is NOT an exact duplicate; rather, all the
143: * transition states values are adjusted so that the copy is self-contained,
144: * as the original should have been.
145: *
146: * also note that the original MUST be contiguous, with its low and high
147: * states accessible by the arrays firstst and lastst
148: */
149:
150: int dupmachine( mach )
151: int mach;
152: {
153: int i, init, state_offset;
154: int state = 0;
155: int last = lastst[mach];
156:
157: for ( i = firstst[mach]; i <= last; ++i )
158: {
159: state = mkstate( transchar[i] );
160:
161: if ( trans1[i] != NO_TRANSITION )
162: {
163: mkxtion( finalst[state], trans1[i] + state - i );
164:
165: if ( transchar[i] == SYM_EPSILON &&
166: trans2[i] != NO_TRANSITION )
167: mkxtion( finalst[state],
168: trans2[i] + state - i );
169: }
170:
171: accptnum[state] = accptnum[i];
172: }
173:
174: if ( state == 0 )
175: flexfatal( _( "empty machine in dupmachine()" ) );
176:
177: state_offset = state - i + 1;
178:
179: init = mach + state_offset;
180: firstst[init] = firstst[mach] + state_offset;
181: finalst[init] = finalst[mach] + state_offset;
182: lastst[init] = lastst[mach] + state_offset;
183:
184: return init;
185: }
186:
187:
188: /* finish_rule - finish up the processing for a rule
189: *
190: * An accepting number is added to the given machine. If variable_trail_rule
191: * is true then the rule has trailing context and both the head and trail
192: * are variable size. Otherwise if headcnt or trailcnt is non-zero then
193: * the machine recognizes a pattern with trailing context and headcnt is
194: * the number of characters in the matched part of the pattern, or zero
195: * if the matched part has variable length. trailcnt is the number of
196: * trailing context characters in the pattern, or zero if the trailing
197: * context has variable length.
198: */
199:
200: void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
201: int mach, variable_trail_rule, headcnt, trailcnt;
202: {
203: char action_text[MAXLINE];
204:
205: add_accept( mach, num_rules );
206:
207: /* We did this in new_rule(), but it often gets the wrong
208: * number because we do it before we start parsing the current rule.
209: */
210: rule_linenum[num_rules] = linenum;
211:
212: /* If this is a continued action, then the line-number has already
213: * been updated, giving us the wrong number.
214: */
215: if ( continued_action )
216: --rule_linenum[num_rules];
217:
218: sprintf( action_text, "case %d:\n", num_rules );
219: add_action( action_text );
220:
221: if ( variable_trail_rule )
222: {
223: rule_type[num_rules] = RULE_VARIABLE;
224:
225: if ( performance_report > 0 )
226: fprintf( stderr,
227: _( "Variable trailing context rule at line %d\n" ),
228: rule_linenum[num_rules] );
229:
230: variable_trailing_context_rules = true;
231: }
232:
233: else
234: {
235: rule_type[num_rules] = RULE_NORMAL;
236:
237: if ( headcnt > 0 || trailcnt > 0 )
238: {
239: /* Do trailing context magic to not match the trailing
240: * characters.
241: */
242: char *scanner_cp = "yy_c_buf_p = yy_cp";
243: char *scanner_bp = "yy_bp";
244:
245: add_action(
246: "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
247:
248: if ( headcnt > 0 )
249: {
250: sprintf( action_text, "%s = %s + %d;\n",
251: scanner_cp, scanner_bp, headcnt );
252: add_action( action_text );
253: }
254:
255: else
256: {
257: sprintf( action_text, "%s -= %d;\n",
258: scanner_cp, trailcnt );
259: add_action( action_text );
260: }
261:
262: add_action(
263: "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
264: }
265: }
266:
267: /* Okay, in the action code at this point yytext and yyleng have
268: * their proper final values for this rule, so here's the point
269: * to do any user action. But don't do it for continued actions,
270: * as that'll result in multiple YY_RULE_SETUP's.
271: */
272: if ( ! continued_action )
273: add_action( "YY_RULE_SETUP\n" );
274:
275: line_directive_out( (FILE *) 0, 1 );
276: }
277:
278:
279: /* link_machines - connect two machines together
280: *
281: * synopsis
282: *
283: * new = link_machines( first, last );
284: *
285: * new - a machine constructed by connecting first to last
286: * first - the machine whose successor is to be last
287: * last - the machine whose predecessor is to be first
288: *
289: * note: this routine concatenates the machine first with the machine
290: * last to produce a machine new which will pattern-match first first
291: * and then last, and will fail if either of the sub-patterns fails.
292: * FIRST is set to new by the operation. last is unmolested.
293: */
294:
295: int link_machines( first, last )
296: int first, last;
297: {
298: if ( first == NIL )
299: return last;
300:
301: else if ( last == NIL )
302: return first;
303:
304: else
305: {
306: mkxtion( finalst[first], last );
307: finalst[first] = finalst[last];
308: lastst[first] = MAX( lastst[first], lastst[last] );
309: firstst[first] = MIN( firstst[first], firstst[last] );
310:
311: return first;
312: }
313: }
314:
315:
316: /* mark_beginning_as_normal - mark each "beginning" state in a machine
317: * as being a "normal" (i.e., not trailing context-
318: * associated) states
319: *
320: * The "beginning" states are the epsilon closure of the first state
321: */
322:
323: void mark_beginning_as_normal( mach )
1.5 ! mpech 324: int mach;
1.1 deraadt 325: {
326: switch ( state_type[mach] )
327: {
328: case STATE_NORMAL:
329: /* Oh, we've already visited here. */
330: return;
331:
332: case STATE_TRAILING_CONTEXT:
333: state_type[mach] = STATE_NORMAL;
334:
335: if ( transchar[mach] == SYM_EPSILON )
336: {
337: if ( trans1[mach] != NO_TRANSITION )
338: mark_beginning_as_normal(
339: trans1[mach] );
340:
341: if ( trans2[mach] != NO_TRANSITION )
342: mark_beginning_as_normal(
343: trans2[mach] );
344: }
345: break;
346:
347: default:
348: flexerror(
349: _( "bad state type in mark_beginning_as_normal()" ) );
350: break;
351: }
352: }
353:
354:
355: /* mkbranch - make a machine that branches to two machines
356: *
357: * synopsis
358: *
359: * branch = mkbranch( first, second );
360: *
361: * branch - a machine which matches either first's pattern or second's
362: * first, second - machines whose patterns are to be or'ed (the | operator)
363: *
364: * Note that first and second are NEITHER destroyed by the operation. Also,
365: * the resulting machine CANNOT be used with any other "mk" operation except
366: * more mkbranch's. Compare with mkor()
367: */
368:
369: int mkbranch( first, second )
370: int first, second;
371: {
372: int eps;
373:
374: if ( first == NO_TRANSITION )
375: return second;
376:
377: else if ( second == NO_TRANSITION )
378: return first;
379:
380: eps = mkstate( SYM_EPSILON );
381:
382: mkxtion( eps, first );
383: mkxtion( eps, second );
384:
385: return eps;
386: }
387:
388:
389: /* mkclos - convert a machine into a closure
390: *
391: * synopsis
392: * new = mkclos( state );
393: *
394: * new - a new state which matches the closure of "state"
395: */
396:
397: int mkclos( state )
398: int state;
399: {
400: return mkopt( mkposcl( state ) );
401: }
402:
403:
404: /* mkopt - make a machine optional
405: *
406: * synopsis
407: *
408: * new = mkopt( mach );
409: *
410: * new - a machine which optionally matches whatever mach matched
411: * mach - the machine to make optional
412: *
413: * notes:
414: * 1. mach must be the last machine created
415: * 2. mach is destroyed by the call
416: */
417:
418: int mkopt( mach )
419: int mach;
420: {
421: int eps;
422:
423: if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
424: {
425: eps = mkstate( SYM_EPSILON );
426: mach = link_machines( mach, eps );
427: }
428:
429: /* Can't skimp on the following if FREE_EPSILON(mach) is true because
430: * some state interior to "mach" might point back to the beginning
431: * for a closure.
432: */
433: eps = mkstate( SYM_EPSILON );
434: mach = link_machines( eps, mach );
435:
436: mkxtion( mach, finalst[mach] );
437:
438: return mach;
439: }
440:
441:
442: /* mkor - make a machine that matches either one of two machines
443: *
444: * synopsis
445: *
446: * new = mkor( first, second );
447: *
448: * new - a machine which matches either first's pattern or second's
449: * first, second - machines whose patterns are to be or'ed (the | operator)
450: *
451: * note that first and second are both destroyed by the operation
452: * the code is rather convoluted because an attempt is made to minimize
453: * the number of epsilon states needed
454: */
455:
456: int mkor( first, second )
457: int first, second;
458: {
459: int eps, orend;
460:
461: if ( first == NIL )
462: return second;
463:
464: else if ( second == NIL )
465: return first;
466:
467: else
468: {
469: /* See comment in mkopt() about why we can't use the first
470: * state of "first" or "second" if they satisfy "FREE_EPSILON".
471: */
472: eps = mkstate( SYM_EPSILON );
473:
474: first = link_machines( eps, first );
475:
476: mkxtion( first, second );
477:
478: if ( SUPER_FREE_EPSILON(finalst[first]) &&
479: accptnum[finalst[first]] == NIL )
480: {
481: orend = finalst[first];
482: mkxtion( finalst[second], orend );
483: }
484:
485: else if ( SUPER_FREE_EPSILON(finalst[second]) &&
486: accptnum[finalst[second]] == NIL )
487: {
488: orend = finalst[second];
489: mkxtion( finalst[first], orend );
490: }
491:
492: else
493: {
494: eps = mkstate( SYM_EPSILON );
495:
496: first = link_machines( first, eps );
497: orend = finalst[first];
498:
499: mkxtion( finalst[second], orend );
500: }
501: }
502:
503: finalst[first] = orend;
504: return first;
505: }
506:
507:
508: /* mkposcl - convert a machine into a positive closure
509: *
510: * synopsis
511: * new = mkposcl( state );
512: *
513: * new - a machine matching the positive closure of "state"
514: */
515:
516: int mkposcl( state )
517: int state;
518: {
519: int eps;
520:
521: if ( SUPER_FREE_EPSILON(finalst[state]) )
522: {
523: mkxtion( finalst[state], state );
524: return state;
525: }
526:
527: else
528: {
529: eps = mkstate( SYM_EPSILON );
530: mkxtion( eps, state );
531: return link_machines( state, eps );
532: }
533: }
534:
535:
536: /* mkrep - make a replicated machine
537: *
538: * synopsis
539: * new = mkrep( mach, lb, ub );
540: *
541: * new - a machine that matches whatever "mach" matched from "lb"
542: * number of times to "ub" number of times
543: *
544: * note
545: * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
546: */
547:
548: int mkrep( mach, lb, ub )
549: int mach, lb, ub;
550: {
551: int base_mach, tail, copy, i;
552:
553: base_mach = copysingl( mach, lb - 1 );
554:
555: if ( ub == INFINITY )
556: {
557: copy = dupmachine( mach );
558: mach = link_machines( mach,
559: link_machines( base_mach, mkclos( copy ) ) );
560: }
561:
562: else
563: {
564: tail = mkstate( SYM_EPSILON );
565:
566: for ( i = lb; i < ub; ++i )
567: {
568: copy = dupmachine( mach );
569: tail = mkopt( link_machines( copy, tail ) );
570: }
571:
572: mach = link_machines( mach, link_machines( base_mach, tail ) );
573: }
574:
575: return mach;
576: }
577:
578:
579: /* mkstate - create a state with a transition on a given symbol
580: *
581: * synopsis
582: *
583: * state = mkstate( sym );
584: *
585: * state - a new state matching sym
586: * sym - the symbol the new state is to have an out-transition on
587: *
588: * note that this routine makes new states in ascending order through the
589: * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
590: * relies on machines being made in ascending order and that they are
591: * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
592: * that it admittedly is)
593: */
594:
595: int mkstate( sym )
596: int sym;
597: {
598: if ( ++lastnfa >= current_mns )
599: {
600: if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
601: lerrif(
602: _( "input rules are too complicated (>= %d NFA states)" ),
603: current_mns );
604:
605: ++num_reallocs;
606:
607: firstst = reallocate_integer_array( firstst, current_mns );
608: lastst = reallocate_integer_array( lastst, current_mns );
609: finalst = reallocate_integer_array( finalst, current_mns );
610: transchar = reallocate_integer_array( transchar, current_mns );
611: trans1 = reallocate_integer_array( trans1, current_mns );
612: trans2 = reallocate_integer_array( trans2, current_mns );
613: accptnum = reallocate_integer_array( accptnum, current_mns );
614: assoc_rule =
615: reallocate_integer_array( assoc_rule, current_mns );
616: state_type =
617: reallocate_integer_array( state_type, current_mns );
618: }
619:
620: firstst[lastnfa] = lastnfa;
621: finalst[lastnfa] = lastnfa;
622: lastst[lastnfa] = lastnfa;
623: transchar[lastnfa] = sym;
624: trans1[lastnfa] = NO_TRANSITION;
625: trans2[lastnfa] = NO_TRANSITION;
626: accptnum[lastnfa] = NIL;
627: assoc_rule[lastnfa] = num_rules;
628: state_type[lastnfa] = current_state_type;
629:
630: /* Fix up equivalence classes base on this transition. Note that any
631: * character which has its own transition gets its own equivalence
632: * class. Thus only characters which are only in character classes
633: * have a chance at being in the same equivalence class. E.g. "a|b"
634: * puts 'a' and 'b' into two different equivalence classes. "[ab]"
635: * puts them in the same equivalence class (barring other differences
636: * elsewhere in the input).
637: */
638:
639: if ( sym < 0 )
640: {
641: /* We don't have to update the equivalence classes since
642: * that was already done when the ccl was created for the
643: * first time.
644: */
645: }
646:
647: else if ( sym == SYM_EPSILON )
648: ++numeps;
649:
650: else
651: {
652: check_char( sym );
653:
654: if ( useecs )
655: /* Map NUL's to csize. */
656: mkechar( sym ? sym : csize, nextecm, ecgroup );
657: }
658:
659: return lastnfa;
660: }
661:
662:
663: /* mkxtion - make a transition from one state to another
664: *
665: * synopsis
666: *
667: * mkxtion( statefrom, stateto );
668: *
669: * statefrom - the state from which the transition is to be made
670: * stateto - the state to which the transition is to be made
671: */
672:
673: void mkxtion( statefrom, stateto )
674: int statefrom, stateto;
675: {
676: if ( trans1[statefrom] == NO_TRANSITION )
677: trans1[statefrom] = stateto;
678:
679: else if ( (transchar[statefrom] != SYM_EPSILON) ||
680: (trans2[statefrom] != NO_TRANSITION) )
681: flexfatal( _( "found too many transitions in mkxtion()" ) );
682:
683: else
684: { /* second out-transition for an epsilon state */
685: ++eps2;
686: trans2[statefrom] = stateto;
687: }
688: }
689:
690: /* new_rule - initialize for a new rule */
691:
692: void new_rule()
693: {
694: if ( ++num_rules >= current_max_rules )
695: {
696: ++num_reallocs;
697: current_max_rules += MAX_RULES_INCREMENT;
698: rule_type = reallocate_integer_array( rule_type,
699: current_max_rules );
700: rule_linenum = reallocate_integer_array( rule_linenum,
701: current_max_rules );
702: rule_useful = reallocate_integer_array( rule_useful,
703: current_max_rules );
704: }
705:
706: if ( num_rules > MAX_RULE )
707: lerrif( _( "too many rules (> %d)!" ), MAX_RULE );
708:
709: rule_linenum[num_rules] = linenum;
710: rule_useful[num_rules] = false;
711: }