Annotation of src/usr.bin/indent/lexi.c, Revision 1.3
1.3 ! mickey 1: /* $OpenBSD: lexi.c,v 1.2 1996/06/26 05:34:31 deraadt Exp $ */
1.2 deraadt 2:
1.1 deraadt 3: /*
4: * Copyright (c) 1985 Sun Microsystems, Inc.
5: * Copyright (c) 1980 The Regents of the University of California.
6: * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7: * All rights reserved.
8: *
9: * Redistribution and use in source and binary forms, with or without
10: * modification, are permitted provided that the following conditions
11: * are met:
12: * 1. Redistributions of source code must retain the above copyright
13: * notice, this list of conditions and the following disclaimer.
14: * 2. Redistributions in binary form must reproduce the above copyright
15: * notice, this list of conditions and the following disclaimer in the
16: * documentation and/or other materials provided with the distribution.
17: * 3. All advertising materials mentioning features or use of this software
18: * must display the following acknowledgement:
19: * This product includes software developed by the University of
20: * California, Berkeley and its contributors.
21: * 4. Neither the name of the University nor the names of its contributors
22: * may be used to endorse or promote products derived from this software
23: * without specific prior written permission.
24: *
25: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35: * SUCH DAMAGE.
36: */
37:
38: #ifndef lint
39: /*static char sccsid[] = "from: @(#)lexi.c 5.16 (Berkeley) 2/26/91";*/
1.3 ! mickey 40: static char rcsid[] = "$OpenBSD: lexi.c,v 1.2 1996/06/26 05:34:31 deraadt Exp $";
1.1 deraadt 41: #endif /* not lint */
42:
43: /*
44: * Here we have the token scanner for indent. It scans off one token and puts
45: * it in the global variable "token". It returns a code, indicating the type
46: * of token scanned.
47: */
48:
49: #include <stdio.h>
50: #include <ctype.h>
51: #include <stdlib.h>
52: #include <string.h>
53: #include "indent_globs.h"
54: #include "indent_codes.h"
55:
/*
 * Character classes stored in the chartype[] table below.  A table value
 * of 0 means the character belongs to neither class.
 */
#define alphanum	1	/* may appear in an identifier or a number */
#define opchar		3	/* may appear in an operator */
58:
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is recognised.
 */
struct templ {
    char       *rwd;		/* keyword text; a null pointer ends the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved words known to the scanner.  The table is terminated by an entry
 * whose rwd is 0; the remaining slots (up to 100 in total) are free for
 * addkey() to fill in.
 *
 * Class codes as interpreted by lexi():
 *	0  no special treatment (returned as a plain identifier)
 *	1  switch
 *	2  case, default
 *	3  struct, union, enum
 *	4  declaration keywords
 *	5  if, while, for
 *	6  else, do
 *	7  sizeof
 */
struct templ specials[100] =
{
    { "switch", 1 },
    { "case", 2 },
    { "break", 0 },
    { "struct", 3 },
    { "union", 3 },
    { "enum", 3 },
    { "default", 2 },
    { "int", 4 },
    { "char", 4 },
    { "float", 4 },
    { "double", 4 },
    { "long", 4 },
    { "short", 4 },
    { "typedef", 4 },		/* was misspelled "typdef" and never matched */
    { "unsigned", 4 },
    { "register", 4 },
    { "static", 4 },
    { "global", 4 },
    { "extern", 4 },
    { "void", 4 },
    { "goto", 0 },
    { "return", 0 },
    { "if", 5 },
    { "while", 5 },
    { "for", 5 },
    { "else", 6 },
    { "do", 6 },
    { "sizeof", 7 },
    { 0, 0 }
};
96:
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * is 1 (alphanumeric), 3 (operator character) or 0 (neither); it is used to
 * decide what kind of token a character can start or continue.
 */
char chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation up to the slash */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon through question mark */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, then upper case A through O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: upper case P through Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: back-quote, then lower case a through o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: lower case p through z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
118:
119:
120:
121:
122: int
123: lexi()
124: {
125: int unary_delim; /* this is set to 1 if the current token
126: *
127: * forces a following operator to be unary */
128: static int last_code; /* the last token type returned */
129: static int l_struct; /* set to 1 if the last token was 'struct' */
130: int code; /* internal code to be returned */
131: char qchar; /* the delimiter character for a string */
132:
133: e_token = s_token; /* point to start of place to save token */
134: unary_delim = false;
135: ps.col_1 = ps.last_nl; /* tell world that this token started in
136: * column 1 iff the last thing scanned was nl */
137: ps.last_nl = false;
138:
139: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
140: ps.col_1 = false; /* leading blanks imply token is not in column
141: * 1 */
142: if (++buf_ptr >= buf_end)
143: fill_buffer();
144: }
145:
146: /* Scan an alphanumeric token */
1.3 ! mickey 147: if (chartype[*buf_ptr] == alphanum ||
! 148: (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 149: /*
150: * we have a character or number
151: */
152: register char *j; /* used for searching thru list of
153: *
154: * reserved words */
155: register struct templ *p;
156:
1.3 ! mickey 157: if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 158: int seendot = 0,
159: seenexp = 0;
160: if (*buf_ptr == '0' &&
161: (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
162: *e_token++ = *buf_ptr++;
163: *e_token++ = *buf_ptr++;
164: while (isxdigit(*buf_ptr)) {
165: CHECK_SIZE_TOKEN;
166: *e_token++ = *buf_ptr++;
167: }
168: }
169: else
170: while (1) {
171: if (*buf_ptr == '.')
172: if (seendot)
173: break;
174: else
175: seendot++;
176: CHECK_SIZE_TOKEN;
177: *e_token++ = *buf_ptr++;
178: if (!isdigit(*buf_ptr) && *buf_ptr != '.')
179: if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
180: break;
181: else {
182: seenexp++;
183: seendot++;
184: CHECK_SIZE_TOKEN;
185: *e_token++ = *buf_ptr++;
186: if (*buf_ptr == '+' || *buf_ptr == '-')
187: *e_token++ = *buf_ptr++;
188: }
189: }
190: if (*buf_ptr == 'L' || *buf_ptr == 'l')
191: *e_token++ = *buf_ptr++;
192: }
193: else
194: while (chartype[*buf_ptr] == alphanum) { /* copy it over */
195: CHECK_SIZE_TOKEN;
196: *e_token++ = *buf_ptr++;
197: if (buf_ptr >= buf_end)
198: fill_buffer();
199: }
200: *e_token++ = '\0';
201: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
202: if (++buf_ptr >= buf_end)
203: fill_buffer();
204: }
205: ps.its_a_keyword = false;
206: ps.sizeof_keyword = false;
207: if (l_struct) { /* if last token was 'struct', then this token
208: * should be treated as a declaration */
209: l_struct = false;
210: last_code = ident;
211: ps.last_u_d = true;
212: return (decl);
213: }
214: ps.last_u_d = false; /* Operator after indentifier is binary */
215: last_code = ident; /* Remember that this is the code we will
216: * return */
217:
218: /*
219: * This loop will check if the token is a keyword.
220: */
221: for (p = specials; (j = p->rwd) != 0; p++) {
222: register char *p = s_token; /* point at scanned token */
223: if (*j++ != *p++ || *j++ != *p++)
224: continue; /* This test depends on the fact that
225: * identifiers are always at least 1 character
226: * long (ie. the first two bytes of the
227: * identifier are always meaningful) */
228: if (p[-1] == 0)
229: break; /* If its a one-character identifier */
230: while (*p++ == *j)
231: if (*j++ == 0)
232: goto found_keyword; /* I wish that C had a multi-level
233: * break... */
234: }
235: if (p->rwd) { /* we have a keyword */
236: found_keyword:
237: ps.its_a_keyword = true;
238: ps.last_u_d = true;
239: switch (p->rwcode) {
240: case 1: /* it is a switch */
241: return (swstmt);
242: case 2: /* a case or default */
243: return (casestmt);
244:
245: case 3: /* a "struct" */
246: if (ps.p_l_follow)
247: break; /* inside parens: cast */
248: l_struct = true;
249:
250: /*
251: * Next time around, we will want to know that we have had a
252: * 'struct'
253: */
254: case 4: /* one of the declaration keywords */
255: if (ps.p_l_follow) {
256: ps.cast_mask |= 1 << ps.p_l_follow;
257: break; /* inside parens: cast */
258: }
259: last_code = decl;
260: return (decl);
261:
262: case 5: /* if, while, for */
263: return (sp_paren);
264:
265: case 6: /* do, else */
266: return (sp_nparen);
267:
268: case 7:
269: ps.sizeof_keyword = true;
270: default: /* all others are treated like any other
271: * identifier */
272: return (ident);
273: } /* end of switch */
274: } /* end of if (found_it) */
275: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
276: register char *tp = buf_ptr;
277: while (tp < buf_end)
278: if (*tp++ == ')' && (*tp == ';' || *tp == ','))
279: goto not_proc;
280: strncpy(ps.procname, token, sizeof ps.procname - 1);
281: ps.in_parameter_declaration = 1;
282: rparen_count = 1;
283: not_proc:;
284: }
285: /*
286: * The following hack attempts to guess whether or not the current
287: * token is in fact a declaration keyword -- one that has been
288: * typedefd
289: */
290: if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
291: && !ps.p_l_follow
292: && !ps.block_init
293: && (ps.last_token == rparen || ps.last_token == semicolon ||
294: ps.last_token == decl ||
295: ps.last_token == lbrace || ps.last_token == rbrace)) {
296: ps.its_a_keyword = true;
297: ps.last_u_d = true;
298: last_code = decl;
299: return decl;
300: }
301: if (last_code == decl) /* if this is a declared variable, then
302: * following sign is unary */
303: ps.last_u_d = true; /* will make "int a -1" work */
304: last_code = ident;
305: return (ident); /* the ident is not in the list */
306: } /* end of procesing for alpanum character */
307:
308: /* Scan a non-alphanumeric token */
309:
310: *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
311: * moved here */
312: *e_token = '\0';
313: if (++buf_ptr >= buf_end)
314: fill_buffer();
315:
316: switch (*token) {
317: case '\n':
318: unary_delim = ps.last_u_d;
319: ps.last_nl = true; /* remember that we just had a newline */
320: code = (had_eof ? 0 : newline);
321:
322: /*
323: * if data has been exausted, the newline is a dummy, and we should
324: * return code to stop
325: */
326: break;
327:
328: case '\'': /* start of quoted character */
329: case '"': /* start of string */
330: qchar = *token;
331: if (troff) {
332: e_token[-1] = '`';
333: if (qchar == '"')
334: *e_token++ = '`';
335: e_token = chfont(&bodyf, &stringf, e_token);
336: }
337: do { /* copy the string */
338: while (1) { /* move one character or [/<char>]<char> */
339: if (*buf_ptr == '\n') {
340: printf("%d: Unterminated literal\n", line_no);
341: goto stop_lit;
342: }
343: CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
344: * since CHECK_SIZE guarantees that there
345: * are at least 5 entries left */
346: *e_token = *buf_ptr++;
347: if (buf_ptr >= buf_end)
348: fill_buffer();
349: if (*e_token == BACKSLASH) { /* if escape, copy extra char */
350: if (*buf_ptr == '\n') /* check for escaped newline */
351: ++line_no;
352: if (troff) {
353: *++e_token = BACKSLASH;
354: if (*buf_ptr == BACKSLASH)
355: *++e_token = BACKSLASH;
356: }
357: *++e_token = *buf_ptr++;
358: ++e_token; /* we must increment this again because we
359: * copied two chars */
360: if (buf_ptr >= buf_end)
361: fill_buffer();
362: }
363: else
364: break; /* we copied one character */
365: } /* end of while (1) */
366: } while (*e_token++ != qchar);
367: if (troff) {
368: e_token = chfont(&stringf, &bodyf, e_token - 1);
369: if (qchar == '"')
370: *e_token++ = '\'';
371: }
372: stop_lit:
373: code = ident;
374: break;
375:
376: case ('('):
377: case ('['):
378: unary_delim = true;
379: code = lparen;
380: break;
381:
382: case (')'):
383: case (']'):
384: code = rparen;
385: break;
386:
387: case '#':
388: unary_delim = ps.last_u_d;
389: code = preesc;
390: break;
391:
392: case '?':
393: unary_delim = true;
394: code = question;
395: break;
396:
397: case (':'):
398: code = colon;
399: unary_delim = true;
400: break;
401:
402: case (';'):
403: unary_delim = true;
404: code = semicolon;
405: break;
406:
407: case ('{'):
408: unary_delim = true;
409:
410: /*
411: * if (ps.in_or_st) ps.block_init = 1;
412: */
413: /* ? code = ps.block_init ? lparen : lbrace; */
414: code = lbrace;
415: break;
416:
417: case ('}'):
418: unary_delim = true;
419: /* ? code = ps.block_init ? rparen : rbrace; */
420: code = rbrace;
421: break;
422:
423: case 014: /* a form feed */
424: unary_delim = ps.last_u_d;
425: ps.last_nl = true; /* remember this so we can set 'ps.col_1'
426: * right */
427: code = form_feed;
428: break;
429:
430: case (','):
431: unary_delim = true;
432: code = comma;
433: break;
434:
435: case '.':
436: unary_delim = false;
437: code = period;
438: break;
439:
440: case '-':
441: case '+': /* check for -, +, --, ++ */
442: code = (ps.last_u_d ? unary_op : binary_op);
443: unary_delim = true;
444:
445: if (*buf_ptr == token[0]) {
446: /* check for doubled character */
447: *e_token++ = *buf_ptr++;
448: /* buffer overflow will be checked at end of loop */
449: if (last_code == ident || last_code == rparen) {
450: code = (ps.last_u_d ? unary_op : postop);
451: /* check for following ++ or -- */
452: unary_delim = false;
453: }
454: }
455: else if (*buf_ptr == '=')
456: /* check for operator += */
457: *e_token++ = *buf_ptr++;
458: else if (*buf_ptr == '>') {
459: /* check for operator -> */
460: *e_token++ = *buf_ptr++;
461: if (!pointer_as_binop) {
462: unary_delim = false;
463: code = unary_op;
464: ps.want_blank = false;
465: }
466: }
467: break; /* buffer overflow will be checked at end of
468: * switch */
469:
470: case '=':
471: if (ps.in_or_st)
472: ps.block_init = 1;
473: #ifdef undef
474: if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
475: e_token[-1] = *buf_ptr++;
476: if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
477: *e_token++ = *buf_ptr++;
478: *e_token++ = '='; /* Flip =+ to += */
479: *e_token = 0;
480: }
481: #else
482: if (*buf_ptr == '=') {/* == */
483: *e_token++ = '='; /* Flip =+ to += */
484: buf_ptr++;
485: *e_token = 0;
486: }
487: #endif
488: code = binary_op;
489: unary_delim = true;
490: break;
491: /* can drop thru!!! */
492:
493: case '>':
494: case '<':
495: case '!': /* ops like <, <<, <=, !=, etc */
496: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
497: *e_token++ = *buf_ptr;
498: if (++buf_ptr >= buf_end)
499: fill_buffer();
500: }
501: if (*buf_ptr == '=')
502: *e_token++ = *buf_ptr++;
503: code = (ps.last_u_d ? unary_op : binary_op);
504: unary_delim = true;
505: break;
506:
507: default:
508: if (token[0] == '/' && *buf_ptr == '*') {
509: /* it is start of comment */
510: *e_token++ = '*';
511:
512: if (++buf_ptr >= buf_end)
513: fill_buffer();
514:
515: code = comment;
516: unary_delim = ps.last_u_d;
517: break;
518: }
519: while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
520: /*
521: * handle ||, &&, etc, and also things as in int *****i
522: */
523: *e_token++ = *buf_ptr;
524: if (++buf_ptr >= buf_end)
525: fill_buffer();
526: }
527: code = (ps.last_u_d ? unary_op : binary_op);
528: unary_delim = true;
529:
530:
531: } /* end of switch */
532: if (code != newline) {
533: l_struct = false;
534: last_code = code;
535: }
536: if (buf_ptr >= buf_end) /* check for input buffer empty */
537: fill_buffer();
538: ps.last_u_d = unary_delim;
539: *e_token = '\0'; /* null terminate the token */
540: return (code);
541: }
542:
543: /*
544: * Add the given keyword to the keyword table, using val as the keyword type
545: */
1.3 ! mickey 546: void
1.1 deraadt 547: addkey(key, val)
548: char *key;
549: {
550: register struct templ *p = specials;
551: while (p->rwd)
552: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
553: return;
554: else
555: p++;
556: if (p >= specials + sizeof specials / sizeof specials[0])
557: return; /* For now, table overflows are silently
558: * ignored */
559: p->rwd = key;
560: p->rwcode = val;
561: p[1].rwd = 0;
562: p[1].rwcode = 0;
563: return;
564: }