Annotation of src/usr.bin/indent/lexi.c, Revision 1.1.1.1
1.1 deraadt 1: /*
2: * Copyright (c) 1985 Sun Microsystems, Inc.
3: * Copyright (c) 1980 The Regents of the University of California.
4: * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: * 3. All advertising materials mentioning features or use of this software
16: * must display the following acknowledgement:
17: * This product includes software developed by the University of
18: * California, Berkeley and its contributors.
19: * 4. Neither the name of the University nor the names of its contributors
20: * may be used to endorse or promote products derived from this software
21: * without specific prior written permission.
22: *
23: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33: * SUCH DAMAGE.
34: */
35:
36: #ifndef lint
37: /*static char sccsid[] = "from: @(#)lexi.c 5.16 (Berkeley) 2/26/91";*/
38: static char rcsid[] = "$Id: lexi.c,v 1.2 1993/08/01 18:14:31 mycroft Exp $";
39: #endif /* not lint */
40:
41: /*
42: * Here we have the token scanner for indent. It scans off one token and puts
43: * it in the global variable "token". It returns a code, indicating the type
44: * of token scanned.
45: */
46:
47: #include <stdio.h>
48: #include <ctype.h>
49: #include <stdlib.h>
50: #include <string.h>
51: #include "indent_globs.h"
52: #include "indent_codes.h"
53:
/*
 * Character classes stored in the chartype[] table below.  A table entry of
 * 0 means the character belongs to neither class.
 */
enum {
    alphanum = 1,		/* may appear in an identifier or a number */
    opchar = 3			/* may appear in an operator */
};
56:
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on once the word has been recognised.
 */
struct templ {
    char *rwd;			/* keyword text; a null pointer ends the table */
    int rwcode;			/* keyword class, see the list below */
};

/*
 * Reserved words known to the scanner.  The class codes are interpreted by
 * lexi() as follows:
 *	0  no special treatment (returned as a plain identifier)
 *	1  switch
 *	2  case / default
 *	3  struct, union, enum
 *	4  declaration keyword
 *	5  keyword followed by a parenthesised expression (if, while, for)
 *	6  keyword not followed by one (else, do)
 *	7  sizeof
 * The array is larger than the built-in list so that addkey() can append
 * entries; the list is always terminated by an entry whose rwd is null.
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so it never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
94:
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * is 1 for characters that can be part of an identifier or number, 3 for
 * characters that can be part of an operator, and 0 for everything else.
 * Each row below covers sixteen consecutive codes.
 */
char chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space, punctuation up to slash; '$' counts as a letter */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits 0-9, colon, semicolon, then < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, then A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P-Z, brackets and backslash, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, then a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p-z, braces around vertical bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
116:
117:
118:
119:
120: int
121: lexi()
122: {
123: int unary_delim; /* this is set to 1 if the current token
124: *
125: * forces a following operator to be unary */
126: static int last_code; /* the last token type returned */
127: static int l_struct; /* set to 1 if the last token was 'struct' */
128: int code; /* internal code to be returned */
129: char qchar; /* the delimiter character for a string */
130:
131: e_token = s_token; /* point to start of place to save token */
132: unary_delim = false;
133: ps.col_1 = ps.last_nl; /* tell world that this token started in
134: * column 1 iff the last thing scanned was nl */
135: ps.last_nl = false;
136:
137: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
138: ps.col_1 = false; /* leading blanks imply token is not in column
139: * 1 */
140: if (++buf_ptr >= buf_end)
141: fill_buffer();
142: }
143:
144: /* Scan an alphanumeric token */
145: if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
146: /*
147: * we have a character or number
148: */
149: register char *j; /* used for searching thru list of
150: *
151: * reserved words */
152: register struct templ *p;
153:
154: if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
155: int seendot = 0,
156: seenexp = 0;
157: if (*buf_ptr == '0' &&
158: (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
159: *e_token++ = *buf_ptr++;
160: *e_token++ = *buf_ptr++;
161: while (isxdigit(*buf_ptr)) {
162: CHECK_SIZE_TOKEN;
163: *e_token++ = *buf_ptr++;
164: }
165: }
166: else
167: while (1) {
168: if (*buf_ptr == '.')
169: if (seendot)
170: break;
171: else
172: seendot++;
173: CHECK_SIZE_TOKEN;
174: *e_token++ = *buf_ptr++;
175: if (!isdigit(*buf_ptr) && *buf_ptr != '.')
176: if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
177: break;
178: else {
179: seenexp++;
180: seendot++;
181: CHECK_SIZE_TOKEN;
182: *e_token++ = *buf_ptr++;
183: if (*buf_ptr == '+' || *buf_ptr == '-')
184: *e_token++ = *buf_ptr++;
185: }
186: }
187: if (*buf_ptr == 'L' || *buf_ptr == 'l')
188: *e_token++ = *buf_ptr++;
189: }
190: else
191: while (chartype[*buf_ptr] == alphanum) { /* copy it over */
192: CHECK_SIZE_TOKEN;
193: *e_token++ = *buf_ptr++;
194: if (buf_ptr >= buf_end)
195: fill_buffer();
196: }
197: *e_token++ = '\0';
198: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
199: if (++buf_ptr >= buf_end)
200: fill_buffer();
201: }
202: ps.its_a_keyword = false;
203: ps.sizeof_keyword = false;
204: if (l_struct) { /* if last token was 'struct', then this token
205: * should be treated as a declaration */
206: l_struct = false;
207: last_code = ident;
208: ps.last_u_d = true;
209: return (decl);
210: }
211: ps.last_u_d = false; /* Operator after indentifier is binary */
212: last_code = ident; /* Remember that this is the code we will
213: * return */
214:
215: /*
216: * This loop will check if the token is a keyword.
217: */
218: for (p = specials; (j = p->rwd) != 0; p++) {
219: register char *p = s_token; /* point at scanned token */
220: if (*j++ != *p++ || *j++ != *p++)
221: continue; /* This test depends on the fact that
222: * identifiers are always at least 1 character
223: * long (ie. the first two bytes of the
224: * identifier are always meaningful) */
225: if (p[-1] == 0)
226: break; /* If its a one-character identifier */
227: while (*p++ == *j)
228: if (*j++ == 0)
229: goto found_keyword; /* I wish that C had a multi-level
230: * break... */
231: }
232: if (p->rwd) { /* we have a keyword */
233: found_keyword:
234: ps.its_a_keyword = true;
235: ps.last_u_d = true;
236: switch (p->rwcode) {
237: case 1: /* it is a switch */
238: return (swstmt);
239: case 2: /* a case or default */
240: return (casestmt);
241:
242: case 3: /* a "struct" */
243: if (ps.p_l_follow)
244: break; /* inside parens: cast */
245: l_struct = true;
246:
247: /*
248: * Next time around, we will want to know that we have had a
249: * 'struct'
250: */
251: case 4: /* one of the declaration keywords */
252: if (ps.p_l_follow) {
253: ps.cast_mask |= 1 << ps.p_l_follow;
254: break; /* inside parens: cast */
255: }
256: last_code = decl;
257: return (decl);
258:
259: case 5: /* if, while, for */
260: return (sp_paren);
261:
262: case 6: /* do, else */
263: return (sp_nparen);
264:
265: case 7:
266: ps.sizeof_keyword = true;
267: default: /* all others are treated like any other
268: * identifier */
269: return (ident);
270: } /* end of switch */
271: } /* end of if (found_it) */
272: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
273: register char *tp = buf_ptr;
274: while (tp < buf_end)
275: if (*tp++ == ')' && (*tp == ';' || *tp == ','))
276: goto not_proc;
277: strncpy(ps.procname, token, sizeof ps.procname - 1);
278: ps.in_parameter_declaration = 1;
279: rparen_count = 1;
280: not_proc:;
281: }
282: /*
283: * The following hack attempts to guess whether or not the current
284: * token is in fact a declaration keyword -- one that has been
285: * typedefd
286: */
287: if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
288: && !ps.p_l_follow
289: && !ps.block_init
290: && (ps.last_token == rparen || ps.last_token == semicolon ||
291: ps.last_token == decl ||
292: ps.last_token == lbrace || ps.last_token == rbrace)) {
293: ps.its_a_keyword = true;
294: ps.last_u_d = true;
295: last_code = decl;
296: return decl;
297: }
298: if (last_code == decl) /* if this is a declared variable, then
299: * following sign is unary */
300: ps.last_u_d = true; /* will make "int a -1" work */
301: last_code = ident;
302: return (ident); /* the ident is not in the list */
303: } /* end of procesing for alpanum character */
304:
305: /* Scan a non-alphanumeric token */
306:
307: *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
308: * moved here */
309: *e_token = '\0';
310: if (++buf_ptr >= buf_end)
311: fill_buffer();
312:
313: switch (*token) {
314: case '\n':
315: unary_delim = ps.last_u_d;
316: ps.last_nl = true; /* remember that we just had a newline */
317: code = (had_eof ? 0 : newline);
318:
319: /*
320: * if data has been exausted, the newline is a dummy, and we should
321: * return code to stop
322: */
323: break;
324:
325: case '\'': /* start of quoted character */
326: case '"': /* start of string */
327: qchar = *token;
328: if (troff) {
329: e_token[-1] = '`';
330: if (qchar == '"')
331: *e_token++ = '`';
332: e_token = chfont(&bodyf, &stringf, e_token);
333: }
334: do { /* copy the string */
335: while (1) { /* move one character or [/<char>]<char> */
336: if (*buf_ptr == '\n') {
337: printf("%d: Unterminated literal\n", line_no);
338: goto stop_lit;
339: }
340: CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
341: * since CHECK_SIZE guarantees that there
342: * are at least 5 entries left */
343: *e_token = *buf_ptr++;
344: if (buf_ptr >= buf_end)
345: fill_buffer();
346: if (*e_token == BACKSLASH) { /* if escape, copy extra char */
347: if (*buf_ptr == '\n') /* check for escaped newline */
348: ++line_no;
349: if (troff) {
350: *++e_token = BACKSLASH;
351: if (*buf_ptr == BACKSLASH)
352: *++e_token = BACKSLASH;
353: }
354: *++e_token = *buf_ptr++;
355: ++e_token; /* we must increment this again because we
356: * copied two chars */
357: if (buf_ptr >= buf_end)
358: fill_buffer();
359: }
360: else
361: break; /* we copied one character */
362: } /* end of while (1) */
363: } while (*e_token++ != qchar);
364: if (troff) {
365: e_token = chfont(&stringf, &bodyf, e_token - 1);
366: if (qchar == '"')
367: *e_token++ = '\'';
368: }
369: stop_lit:
370: code = ident;
371: break;
372:
373: case ('('):
374: case ('['):
375: unary_delim = true;
376: code = lparen;
377: break;
378:
379: case (')'):
380: case (']'):
381: code = rparen;
382: break;
383:
384: case '#':
385: unary_delim = ps.last_u_d;
386: code = preesc;
387: break;
388:
389: case '?':
390: unary_delim = true;
391: code = question;
392: break;
393:
394: case (':'):
395: code = colon;
396: unary_delim = true;
397: break;
398:
399: case (';'):
400: unary_delim = true;
401: code = semicolon;
402: break;
403:
404: case ('{'):
405: unary_delim = true;
406:
407: /*
408: * if (ps.in_or_st) ps.block_init = 1;
409: */
410: /* ? code = ps.block_init ? lparen : lbrace; */
411: code = lbrace;
412: break;
413:
414: case ('}'):
415: unary_delim = true;
416: /* ? code = ps.block_init ? rparen : rbrace; */
417: code = rbrace;
418: break;
419:
420: case 014: /* a form feed */
421: unary_delim = ps.last_u_d;
422: ps.last_nl = true; /* remember this so we can set 'ps.col_1'
423: * right */
424: code = form_feed;
425: break;
426:
427: case (','):
428: unary_delim = true;
429: code = comma;
430: break;
431:
432: case '.':
433: unary_delim = false;
434: code = period;
435: break;
436:
437: case '-':
438: case '+': /* check for -, +, --, ++ */
439: code = (ps.last_u_d ? unary_op : binary_op);
440: unary_delim = true;
441:
442: if (*buf_ptr == token[0]) {
443: /* check for doubled character */
444: *e_token++ = *buf_ptr++;
445: /* buffer overflow will be checked at end of loop */
446: if (last_code == ident || last_code == rparen) {
447: code = (ps.last_u_d ? unary_op : postop);
448: /* check for following ++ or -- */
449: unary_delim = false;
450: }
451: }
452: else if (*buf_ptr == '=')
453: /* check for operator += */
454: *e_token++ = *buf_ptr++;
455: else if (*buf_ptr == '>') {
456: /* check for operator -> */
457: *e_token++ = *buf_ptr++;
458: if (!pointer_as_binop) {
459: unary_delim = false;
460: code = unary_op;
461: ps.want_blank = false;
462: }
463: }
464: break; /* buffer overflow will be checked at end of
465: * switch */
466:
467: case '=':
468: if (ps.in_or_st)
469: ps.block_init = 1;
470: #ifdef undef
471: if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
472: e_token[-1] = *buf_ptr++;
473: if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
474: *e_token++ = *buf_ptr++;
475: *e_token++ = '='; /* Flip =+ to += */
476: *e_token = 0;
477: }
478: #else
479: if (*buf_ptr == '=') {/* == */
480: *e_token++ = '='; /* Flip =+ to += */
481: buf_ptr++;
482: *e_token = 0;
483: }
484: #endif
485: code = binary_op;
486: unary_delim = true;
487: break;
488: /* can drop thru!!! */
489:
490: case '>':
491: case '<':
492: case '!': /* ops like <, <<, <=, !=, etc */
493: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
494: *e_token++ = *buf_ptr;
495: if (++buf_ptr >= buf_end)
496: fill_buffer();
497: }
498: if (*buf_ptr == '=')
499: *e_token++ = *buf_ptr++;
500: code = (ps.last_u_d ? unary_op : binary_op);
501: unary_delim = true;
502: break;
503:
504: default:
505: if (token[0] == '/' && *buf_ptr == '*') {
506: /* it is start of comment */
507: *e_token++ = '*';
508:
509: if (++buf_ptr >= buf_end)
510: fill_buffer();
511:
512: code = comment;
513: unary_delim = ps.last_u_d;
514: break;
515: }
516: while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
517: /*
518: * handle ||, &&, etc, and also things as in int *****i
519: */
520: *e_token++ = *buf_ptr;
521: if (++buf_ptr >= buf_end)
522: fill_buffer();
523: }
524: code = (ps.last_u_d ? unary_op : binary_op);
525: unary_delim = true;
526:
527:
528: } /* end of switch */
529: if (code != newline) {
530: l_struct = false;
531: last_code = code;
532: }
533: if (buf_ptr >= buf_end) /* check for input buffer empty */
534: fill_buffer();
535: ps.last_u_d = unary_delim;
536: *e_token = '\0'; /* null terminate the token */
537: return (code);
538: }
539:
540: /*
541: * Add the given keyword to the keyword table, using val as the keyword type
542: */
543: addkey(key, val)
544: char *key;
545: {
546: register struct templ *p = specials;
547: while (p->rwd)
548: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
549: return;
550: else
551: p++;
552: if (p >= specials + sizeof specials / sizeof specials[0])
553: return; /* For now, table overflows are silently
554: * ignored */
555: p->rwd = key;
556: p->rwcode = val;
557: p[1].rwd = 0;
558: p[1].rwcode = 0;
559: return;
560: }