Annotation of src/usr.bin/lex/nfa.c, Revision 1.1
1.1 ! deraadt 1: /* nfa - NFA construction routines */
! 2:
! 3: /*-
! 4: * Copyright (c) 1990 The Regents of the University of California.
! 5: * All rights reserved.
! 6: *
! 7: * This code is derived from software contributed to Berkeley by
! 8: * Vern Paxson.
! 9: *
! 10: * The United States Government has rights in this work pursuant
! 11: * to contract no. DE-AC03-76SF00098 between the United States
! 12: * Department of Energy and the University of California.
! 13: *
! 14: * Redistribution and use in source and binary forms are permitted provided
! 15: * that: (1) source distributions retain this entire copyright notice and
! 16: * comment, and (2) distributions including binaries display the following
! 17: * acknowledgement: ``This product includes software developed by the
! 18: * University of California, Berkeley and its contributors'' in the
! 19: * documentation or other materials provided with the distribution and in
! 20: * all advertising materials mentioning features or use of this software.
! 21: * Neither the name of the University nor the names of its contributors may
! 22: * be used to endorse or promote products derived from this software without
! 23: * specific prior written permission.
! 24: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
! 25: * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
! 26: * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
! 27: */
! 28:
! 29: /* $Header: /a/cvsroot/src/usr.bin/lex/nfa.c,v 1.8 1995/05/05 05:35:38 jtc Exp $ */
! 30:
! 31: #include "flexdef.h"
! 32:
! 33:
! 34: /* declare functions that have forward references */
! 35:
! 36: int dupmachine PROTO((int));
! 37: void mkxtion PROTO((int, int));
! 38:
! 39:
! 40: /* add_accept - add an accepting state to a machine
! 41: *
! 42: * accepting_number becomes mach's accepting number.
! 43: */
! 44:
! 45: void add_accept( mach, accepting_number )
! 46: int mach, accepting_number;
! 47: {
! 48: /* Hang the accepting number off an epsilon state. if it is associated
! 49: * with a state that has a non-epsilon out-transition, then the state
! 50: * will accept BEFORE it makes that transition, i.e., one character
! 51: * too soon.
! 52: */
! 53:
! 54: if ( transchar[finalst[mach]] == SYM_EPSILON )
! 55: accptnum[finalst[mach]] = accepting_number;
! 56:
! 57: else
! 58: {
! 59: int astate = mkstate( SYM_EPSILON );
! 60: accptnum[astate] = accepting_number;
! 61: (void) link_machines( mach, astate );
! 62: }
! 63: }
! 64:
! 65:
! 66: /* copysingl - make a given number of copies of a singleton machine
! 67: *
! 68: * synopsis
! 69: *
! 70: * newsng = copysingl( singl, num );
! 71: *
! 72: * newsng - a new singleton composed of num copies of singl
! 73: * singl - a singleton machine
! 74: * num - the number of copies of singl to be present in newsng
! 75: */
! 76:
! 77: int copysingl( singl, num )
! 78: int singl, num;
! 79: {
! 80: int copy, i;
! 81:
! 82: copy = mkstate( SYM_EPSILON );
! 83:
! 84: for ( i = 1; i <= num; ++i )
! 85: copy = link_machines( copy, dupmachine( singl ) );
! 86:
! 87: return copy;
! 88: }
! 89:
! 90:
! 91: /* dumpnfa - debugging routine to write out an nfa */
! 92:
! 93: void dumpnfa( state1 )
! 94: int state1;
! 95:
! 96: {
! 97: int sym, tsp1, tsp2, anum, ns;
! 98:
! 99: fprintf( stderr,
! 100: _( "\n\n********** beginning dump of nfa with start state %d\n" ),
! 101: state1 );
! 102:
! 103: /* We probably should loop starting at firstst[state1] and going to
! 104: * lastst[state1], but they're not maintained properly when we "or"
! 105: * all of the rules together. So we use our knowledge that the machine
! 106: * starts at state 1 and ends at lastnfa.
! 107: */
! 108:
! 109: /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
! 110: for ( ns = 1; ns <= lastnfa; ++ns )
! 111: {
! 112: fprintf( stderr, _( "state # %4d\t" ), ns );
! 113:
! 114: sym = transchar[ns];
! 115: tsp1 = trans1[ns];
! 116: tsp2 = trans2[ns];
! 117: anum = accptnum[ns];
! 118:
! 119: fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 );
! 120:
! 121: if ( anum != NIL )
! 122: fprintf( stderr, " [%d]", anum );
! 123:
! 124: fprintf( stderr, "\n" );
! 125: }
! 126:
! 127: fprintf( stderr, _( "********** end of dump\n" ) );
! 128: }
! 129:
! 130:
! 131: /* dupmachine - make a duplicate of a given machine
! 132: *
! 133: * synopsis
! 134: *
! 135: * copy = dupmachine( mach );
! 136: *
! 137: * copy - holds duplicate of mach
! 138: * mach - machine to be duplicated
! 139: *
! 140: * note that the copy of mach is NOT an exact duplicate; rather, all the
! 141: * transition states values are adjusted so that the copy is self-contained,
! 142: * as the original should have been.
! 143: *
! 144: * also note that the original MUST be contiguous, with its low and high
! 145: * states accessible by the arrays firstst and lastst
! 146: */
! 147:
! 148: int dupmachine( mach )
! 149: int mach;
! 150: {
! 151: int i, init, state_offset;
! 152: int state = 0;
! 153: int last = lastst[mach];
! 154:
! 155: for ( i = firstst[mach]; i <= last; ++i )
! 156: {
! 157: state = mkstate( transchar[i] );
! 158:
! 159: if ( trans1[i] != NO_TRANSITION )
! 160: {
! 161: mkxtion( finalst[state], trans1[i] + state - i );
! 162:
! 163: if ( transchar[i] == SYM_EPSILON &&
! 164: trans2[i] != NO_TRANSITION )
! 165: mkxtion( finalst[state],
! 166: trans2[i] + state - i );
! 167: }
! 168:
! 169: accptnum[state] = accptnum[i];
! 170: }
! 171:
! 172: if ( state == 0 )
! 173: flexfatal( _( "empty machine in dupmachine()" ) );
! 174:
! 175: state_offset = state - i + 1;
! 176:
! 177: init = mach + state_offset;
! 178: firstst[init] = firstst[mach] + state_offset;
! 179: finalst[init] = finalst[mach] + state_offset;
! 180: lastst[init] = lastst[mach] + state_offset;
! 181:
! 182: return init;
! 183: }
! 184:
! 185:
! 186: /* finish_rule - finish up the processing for a rule
! 187: *
! 188: * An accepting number is added to the given machine. If variable_trail_rule
! 189: * is true then the rule has trailing context and both the head and trail
! 190: * are variable size. Otherwise if headcnt or trailcnt is non-zero then
! 191: * the machine recognizes a pattern with trailing context and headcnt is
! 192: * the number of characters in the matched part of the pattern, or zero
! 193: * if the matched part has variable length. trailcnt is the number of
! 194: * trailing context characters in the pattern, or zero if the trailing
! 195: * context has variable length.
! 196: */
! 197:
! 198: void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
! 199: int mach, variable_trail_rule, headcnt, trailcnt;
! 200: {
! 201: char action_text[MAXLINE];
! 202:
! 203: add_accept( mach, num_rules );
! 204:
! 205: /* We did this in new_rule(), but it often gets the wrong
! 206: * number because we do it before we start parsing the current rule.
! 207: */
! 208: rule_linenum[num_rules] = linenum;
! 209:
! 210: /* If this is a continued action, then the line-number has already
! 211: * been updated, giving us the wrong number.
! 212: */
! 213: if ( continued_action )
! 214: --rule_linenum[num_rules];
! 215:
! 216: sprintf( action_text, "case %d:\n", num_rules );
! 217: add_action( action_text );
! 218:
! 219: if ( variable_trail_rule )
! 220: {
! 221: rule_type[num_rules] = RULE_VARIABLE;
! 222:
! 223: if ( performance_report > 0 )
! 224: fprintf( stderr,
! 225: _( "Variable trailing context rule at line %d\n" ),
! 226: rule_linenum[num_rules] );
! 227:
! 228: variable_trailing_context_rules = true;
! 229: }
! 230:
! 231: else
! 232: {
! 233: rule_type[num_rules] = RULE_NORMAL;
! 234:
! 235: if ( headcnt > 0 || trailcnt > 0 )
! 236: {
! 237: /* Do trailing context magic to not match the trailing
! 238: * characters.
! 239: */
! 240: char *scanner_cp = "yy_c_buf_p = yy_cp";
! 241: char *scanner_bp = "yy_bp";
! 242:
! 243: add_action(
! 244: "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
! 245:
! 246: if ( headcnt > 0 )
! 247: {
! 248: sprintf( action_text, "%s = %s + %d;\n",
! 249: scanner_cp, scanner_bp, headcnt );
! 250: add_action( action_text );
! 251: }
! 252:
! 253: else
! 254: {
! 255: sprintf( action_text, "%s -= %d;\n",
! 256: scanner_cp, trailcnt );
! 257: add_action( action_text );
! 258: }
! 259:
! 260: add_action(
! 261: "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
! 262: }
! 263: }
! 264:
! 265: /* Okay, in the action code at this point yytext and yyleng have
! 266: * their proper final values for this rule, so here's the point
! 267: * to do any user action. But don't do it for continued actions,
! 268: * as that'll result in multiple YY_RULE_SETUP's.
! 269: */
! 270: if ( ! continued_action )
! 271: add_action( "YY_RULE_SETUP\n" );
! 272:
! 273: line_directive_out( (FILE *) 0, 1 );
! 274: }
! 275:
! 276:
! 277: /* link_machines - connect two machines together
! 278: *
! 279: * synopsis
! 280: *
! 281: * new = link_machines( first, last );
! 282: *
! 283: * new - a machine constructed by connecting first to last
! 284: * first - the machine whose successor is to be last
! 285: * last - the machine whose predecessor is to be first
! 286: *
! 287: * note: this routine concatenates the machine first with the machine
! 288: * last to produce a machine new which will pattern-match first first
! 289: * and then last, and will fail if either of the sub-patterns fails.
! 290: * FIRST is set to new by the operation. last is unmolested.
! 291: */
! 292:
! 293: int link_machines( first, last )
! 294: int first, last;
! 295: {
! 296: if ( first == NIL )
! 297: return last;
! 298:
! 299: else if ( last == NIL )
! 300: return first;
! 301:
! 302: else
! 303: {
! 304: mkxtion( finalst[first], last );
! 305: finalst[first] = finalst[last];
! 306: lastst[first] = MAX( lastst[first], lastst[last] );
! 307: firstst[first] = MIN( firstst[first], firstst[last] );
! 308:
! 309: return first;
! 310: }
! 311: }
! 312:
! 313:
! 314: /* mark_beginning_as_normal - mark each "beginning" state in a machine
! 315: * as being a "normal" (i.e., not trailing context-
! 316: * associated) states
! 317: *
! 318: * The "beginning" states are the epsilon closure of the first state
! 319: */
! 320:
! 321: void mark_beginning_as_normal( mach )
! 322: register int mach;
! 323: {
! 324: switch ( state_type[mach] )
! 325: {
! 326: case STATE_NORMAL:
! 327: /* Oh, we've already visited here. */
! 328: return;
! 329:
! 330: case STATE_TRAILING_CONTEXT:
! 331: state_type[mach] = STATE_NORMAL;
! 332:
! 333: if ( transchar[mach] == SYM_EPSILON )
! 334: {
! 335: if ( trans1[mach] != NO_TRANSITION )
! 336: mark_beginning_as_normal(
! 337: trans1[mach] );
! 338:
! 339: if ( trans2[mach] != NO_TRANSITION )
! 340: mark_beginning_as_normal(
! 341: trans2[mach] );
! 342: }
! 343: break;
! 344:
! 345: default:
! 346: flexerror(
! 347: _( "bad state type in mark_beginning_as_normal()" ) );
! 348: break;
! 349: }
! 350: }
! 351:
! 352:
! 353: /* mkbranch - make a machine that branches to two machines
! 354: *
! 355: * synopsis
! 356: *
! 357: * branch = mkbranch( first, second );
! 358: *
! 359: * branch - a machine which matches either first's pattern or second's
! 360: * first, second - machines whose patterns are to be or'ed (the | operator)
! 361: *
! 362: * Note that first and second are NEITHER destroyed by the operation. Also,
! 363: * the resulting machine CANNOT be used with any other "mk" operation except
! 364: * more mkbranch's. Compare with mkor()
! 365: */
! 366:
! 367: int mkbranch( first, second )
! 368: int first, second;
! 369: {
! 370: int eps;
! 371:
! 372: if ( first == NO_TRANSITION )
! 373: return second;
! 374:
! 375: else if ( second == NO_TRANSITION )
! 376: return first;
! 377:
! 378: eps = mkstate( SYM_EPSILON );
! 379:
! 380: mkxtion( eps, first );
! 381: mkxtion( eps, second );
! 382:
! 383: return eps;
! 384: }
! 385:
! 386:
! 387: /* mkclos - convert a machine into a closure
! 388: *
! 389: * synopsis
! 390: * new = mkclos( state );
! 391: *
! 392: * new - a new state which matches the closure of "state"
! 393: */
! 394:
! 395: int mkclos( state )
! 396: int state;
! 397: {
! 398: return mkopt( mkposcl( state ) );
! 399: }
! 400:
! 401:
! 402: /* mkopt - make a machine optional
! 403: *
! 404: * synopsis
! 405: *
! 406: * new = mkopt( mach );
! 407: *
! 408: * new - a machine which optionally matches whatever mach matched
! 409: * mach - the machine to make optional
! 410: *
! 411: * notes:
! 412: * 1. mach must be the last machine created
! 413: * 2. mach is destroyed by the call
! 414: */
! 415:
! 416: int mkopt( mach )
! 417: int mach;
! 418: {
! 419: int eps;
! 420:
! 421: if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
! 422: {
! 423: eps = mkstate( SYM_EPSILON );
! 424: mach = link_machines( mach, eps );
! 425: }
! 426:
! 427: /* Can't skimp on the following if FREE_EPSILON(mach) is true because
! 428: * some state interior to "mach" might point back to the beginning
! 429: * for a closure.
! 430: */
! 431: eps = mkstate( SYM_EPSILON );
! 432: mach = link_machines( eps, mach );
! 433:
! 434: mkxtion( mach, finalst[mach] );
! 435:
! 436: return mach;
! 437: }
! 438:
! 439:
! 440: /* mkor - make a machine that matches either one of two machines
! 441: *
! 442: * synopsis
! 443: *
! 444: * new = mkor( first, second );
! 445: *
! 446: * new - a machine which matches either first's pattern or second's
! 447: * first, second - machines whose patterns are to be or'ed (the | operator)
! 448: *
! 449: * note that first and second are both destroyed by the operation
! 450: * the code is rather convoluted because an attempt is made to minimize
! 451: * the number of epsilon states needed
! 452: */
! 453:
! 454: int mkor( first, second )
! 455: int first, second;
! 456: {
! 457: int eps, orend;
! 458:
! 459: if ( first == NIL )
! 460: return second;
! 461:
! 462: else if ( second == NIL )
! 463: return first;
! 464:
! 465: else
! 466: {
! 467: /* See comment in mkopt() about why we can't use the first
! 468: * state of "first" or "second" if they satisfy "FREE_EPSILON".
! 469: */
! 470: eps = mkstate( SYM_EPSILON );
! 471:
! 472: first = link_machines( eps, first );
! 473:
! 474: mkxtion( first, second );
! 475:
! 476: if ( SUPER_FREE_EPSILON(finalst[first]) &&
! 477: accptnum[finalst[first]] == NIL )
! 478: {
! 479: orend = finalst[first];
! 480: mkxtion( finalst[second], orend );
! 481: }
! 482:
! 483: else if ( SUPER_FREE_EPSILON(finalst[second]) &&
! 484: accptnum[finalst[second]] == NIL )
! 485: {
! 486: orend = finalst[second];
! 487: mkxtion( finalst[first], orend );
! 488: }
! 489:
! 490: else
! 491: {
! 492: eps = mkstate( SYM_EPSILON );
! 493:
! 494: first = link_machines( first, eps );
! 495: orend = finalst[first];
! 496:
! 497: mkxtion( finalst[second], orend );
! 498: }
! 499: }
! 500:
! 501: finalst[first] = orend;
! 502: return first;
! 503: }
! 504:
! 505:
! 506: /* mkposcl - convert a machine into a positive closure
! 507: *
! 508: * synopsis
! 509: * new = mkposcl( state );
! 510: *
! 511: * new - a machine matching the positive closure of "state"
! 512: */
! 513:
! 514: int mkposcl( state )
! 515: int state;
! 516: {
! 517: int eps;
! 518:
! 519: if ( SUPER_FREE_EPSILON(finalst[state]) )
! 520: {
! 521: mkxtion( finalst[state], state );
! 522: return state;
! 523: }
! 524:
! 525: else
! 526: {
! 527: eps = mkstate( SYM_EPSILON );
! 528: mkxtion( eps, state );
! 529: return link_machines( state, eps );
! 530: }
! 531: }
! 532:
! 533:
! 534: /* mkrep - make a replicated machine
! 535: *
! 536: * synopsis
! 537: * new = mkrep( mach, lb, ub );
! 538: *
! 539: * new - a machine that matches whatever "mach" matched from "lb"
! 540: * number of times to "ub" number of times
! 541: *
! 542: * note
! 543: * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
! 544: */
! 545:
! 546: int mkrep( mach, lb, ub )
! 547: int mach, lb, ub;
! 548: {
! 549: int base_mach, tail, copy, i;
! 550:
! 551: base_mach = copysingl( mach, lb - 1 );
! 552:
! 553: if ( ub == INFINITY )
! 554: {
! 555: copy = dupmachine( mach );
! 556: mach = link_machines( mach,
! 557: link_machines( base_mach, mkclos( copy ) ) );
! 558: }
! 559:
! 560: else
! 561: {
! 562: tail = mkstate( SYM_EPSILON );
! 563:
! 564: for ( i = lb; i < ub; ++i )
! 565: {
! 566: copy = dupmachine( mach );
! 567: tail = mkopt( link_machines( copy, tail ) );
! 568: }
! 569:
! 570: mach = link_machines( mach, link_machines( base_mach, tail ) );
! 571: }
! 572:
! 573: return mach;
! 574: }
! 575:
! 576:
! 577: /* mkstate - create a state with a transition on a given symbol
! 578: *
! 579: * synopsis
! 580: *
! 581: * state = mkstate( sym );
! 582: *
! 583: * state - a new state matching sym
! 584: * sym - the symbol the new state is to have an out-transition on
! 585: *
! 586: * note that this routine makes new states in ascending order through the
! 587: * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
! 588: * relies on machines being made in ascending order and that they are
! 589: * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
! 590: * that it admittedly is)
! 591: */
! 592:
! 593: int mkstate( sym )
! 594: int sym;
! 595: {
! 596: if ( ++lastnfa >= current_mns )
! 597: {
! 598: if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
! 599: lerrif(
! 600: _( "input rules are too complicated (>= %d NFA states)" ),
! 601: current_mns );
! 602:
! 603: ++num_reallocs;
! 604:
! 605: firstst = reallocate_integer_array( firstst, current_mns );
! 606: lastst = reallocate_integer_array( lastst, current_mns );
! 607: finalst = reallocate_integer_array( finalst, current_mns );
! 608: transchar = reallocate_integer_array( transchar, current_mns );
! 609: trans1 = reallocate_integer_array( trans1, current_mns );
! 610: trans2 = reallocate_integer_array( trans2, current_mns );
! 611: accptnum = reallocate_integer_array( accptnum, current_mns );
! 612: assoc_rule =
! 613: reallocate_integer_array( assoc_rule, current_mns );
! 614: state_type =
! 615: reallocate_integer_array( state_type, current_mns );
! 616: }
! 617:
! 618: firstst[lastnfa] = lastnfa;
! 619: finalst[lastnfa] = lastnfa;
! 620: lastst[lastnfa] = lastnfa;
! 621: transchar[lastnfa] = sym;
! 622: trans1[lastnfa] = NO_TRANSITION;
! 623: trans2[lastnfa] = NO_TRANSITION;
! 624: accptnum[lastnfa] = NIL;
! 625: assoc_rule[lastnfa] = num_rules;
! 626: state_type[lastnfa] = current_state_type;
! 627:
! 628: /* Fix up equivalence classes base on this transition. Note that any
! 629: * character which has its own transition gets its own equivalence
! 630: * class. Thus only characters which are only in character classes
! 631: * have a chance at being in the same equivalence class. E.g. "a|b"
! 632: * puts 'a' and 'b' into two different equivalence classes. "[ab]"
! 633: * puts them in the same equivalence class (barring other differences
! 634: * elsewhere in the input).
! 635: */
! 636:
! 637: if ( sym < 0 )
! 638: {
! 639: /* We don't have to update the equivalence classes since
! 640: * that was already done when the ccl was created for the
! 641: * first time.
! 642: */
! 643: }
! 644:
! 645: else if ( sym == SYM_EPSILON )
! 646: ++numeps;
! 647:
! 648: else
! 649: {
! 650: check_char( sym );
! 651:
! 652: if ( useecs )
! 653: /* Map NUL's to csize. */
! 654: mkechar( sym ? sym : csize, nextecm, ecgroup );
! 655: }
! 656:
! 657: return lastnfa;
! 658: }
! 659:
! 660:
! 661: /* mkxtion - make a transition from one state to another
! 662: *
! 663: * synopsis
! 664: *
! 665: * mkxtion( statefrom, stateto );
! 666: *
! 667: * statefrom - the state from which the transition is to be made
! 668: * stateto - the state to which the transition is to be made
! 669: */
! 670:
! 671: void mkxtion( statefrom, stateto )
! 672: int statefrom, stateto;
! 673: {
! 674: if ( trans1[statefrom] == NO_TRANSITION )
! 675: trans1[statefrom] = stateto;
! 676:
! 677: else if ( (transchar[statefrom] != SYM_EPSILON) ||
! 678: (trans2[statefrom] != NO_TRANSITION) )
! 679: flexfatal( _( "found too many transitions in mkxtion()" ) );
! 680:
! 681: else
! 682: { /* second out-transition for an epsilon state */
! 683: ++eps2;
! 684: trans2[statefrom] = stateto;
! 685: }
! 686: }
! 687:
! 688: /* new_rule - initialize for a new rule */
! 689:
! 690: void new_rule()
! 691: {
! 692: if ( ++num_rules >= current_max_rules )
! 693: {
! 694: ++num_reallocs;
! 695: current_max_rules += MAX_RULES_INCREMENT;
! 696: rule_type = reallocate_integer_array( rule_type,
! 697: current_max_rules );
! 698: rule_linenum = reallocate_integer_array( rule_linenum,
! 699: current_max_rules );
! 700: rule_useful = reallocate_integer_array( rule_useful,
! 701: current_max_rules );
! 702: }
! 703:
! 704: if ( num_rules > MAX_RULE )
! 705: lerrif( _( "too many rules (> %d)!" ), MAX_RULE );
! 706:
! 707: rule_linenum[num_rules] = linenum;
! 708: rule_useful[num_rules] = false;
! 709: }