Annotation of src/usr.bin/indent/lexi.c, Revision 1.10
1.10 ! tedu 1: /* $OpenBSD: lexi.c,v 1.9 2003/06/12 01:07:27 deraadt Exp $ */
1.2 deraadt 2:
1.1 deraadt 3: /*
1.7 pjanzen 4: * Copyright (c) 1980, 1993
5: * The Regents of the University of California.
6: * Copyright (c) 1976 Board of Trustees of the University of Illinois.
1.1 deraadt 7: * Copyright (c) 1985 Sun Microsystems, Inc.
8: * All rights reserved.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
1.9 deraadt 18: * 3. Neither the name of the University nor the names of its contributors
1.1 deraadt 19: * may be used to endorse or promote products derived from this software
20: * without specific prior written permission.
21: *
22: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32: * SUCH DAMAGE.
33: */
34:
35: #ifndef lint
1.7 pjanzen 36: /*static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";*/
1.10 ! tedu 37: static char rcsid[] = "$OpenBSD: lexi.c,v 1.9 2003/06/12 01:07:27 deraadt Exp $";
1.1 deraadt 38: #endif /* not lint */
39:
40: /*
41: * Here we have the token scanner for indent. It scans off one token and puts
42: * it in the global variable "token". It returns a code, indicating the type
43: * of token scanned.
44: */
45:
46: #include <stdio.h>
47: #include <ctype.h>
48: #include <stdlib.h>
49: #include <string.h>
1.7 pjanzen 50: #include <err.h>
1.1 deraadt 51: #include "indent_globs.h"
52: #include "indent_codes.h"
53:
54: #define alphanum 1
55: #define opchar 3
56:
/*
 * One entry of the reserved-word table: the spelling of the keyword and
 * the class code that lexi() switches on once the keyword is recognized.
 */
struct templ {
    char       *rwd;		/* keyword text, NUL terminated */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Built-in reserved words.  The rwcode classes, as dispatched by lexi():
 *
 *	0	no special handling; reported as an ordinary identifier
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *
 * "typedef" used to be misspelled "typdef" here, so the real keyword was
 * never found by the table lookup.
 */
struct templ specialsinit[] = {
    { "switch", 1 },
    { "case", 2 },
    { "break", 0 },
    { "struct", 3 },
    { "union", 3 },
    { "enum", 3 },
    { "default", 2 },
    { "int", 4 },
    { "char", 4 },
    { "float", 4 },
    { "double", 4 },
    { "long", 4 },
    { "short", 4 },
    { "typedef", 4 },
    { "unsigned", 4 },
    { "register", 4 },
    { "static", 4 },
    { "global", 4 },
    { "extern", 4 },
    { "void", 4 },
    { "goto", 0 },
    { "return", 0 },
    { "if", 5 },
    { "while", 5 },
    { "for", 5 },
    { "else", 6 },
    { "do", 6 },
    { "sizeof", 7 },
};

/*
 * The keyword table actually consulted by lexi().  It starts out aliasing
 * the static specialsinit[] array; addkey() replaces it with a heap copy
 * the first time a keyword has to be appended.
 */
struct templ *specials = specialsinit;
/* Number of valid entries in specials[]. */
int         nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* Allocated capacity of the heap table; only assigned by addkey(). */
int         maxspecials;
96:
/*
 * Character classification table, indexed by 7-bit ASCII code.
 * Each entry is 1 for characters that may appear in an identifier or
 * number (letters, digits, '_' and '$'), 3 for characters that can be
 * part of an operator, and 0 for everything else.  One row per 16 codes.
 */
char        chartype[128] = {
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P .. Z, left bracket, backslash, right bracket, ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p .. z, left brace, |, right brace, ~, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
118:
119:
120:
121:
122: int
123: lexi()
124: {
125: int unary_delim; /* this is set to 1 if the current token
126: * forces a following operator to be unary */
127: static int last_code; /* the last token type returned */
128: static int l_struct; /* set to 1 if the last token was 'struct' */
129: int code; /* internal code to be returned */
130: char qchar; /* the delimiter character for a string */
1.4 deraadt 131: int i;
1.1 deraadt 132:
133: e_token = s_token; /* point to start of place to save token */
134: unary_delim = false;
135: ps.col_1 = ps.last_nl; /* tell world that this token started in
136: * column 1 iff the last thing scanned was nl */
137: ps.last_nl = false;
138:
139: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
140: ps.col_1 = false; /* leading blanks imply token is not in column
141: * 1 */
142: if (++buf_ptr >= buf_end)
143: fill_buffer();
144: }
145:
146: /* Scan an alphanumeric token */
1.7 pjanzen 147: if (chartype[(int)*buf_ptr] == alphanum ||
1.3 mickey 148: (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 149: /*
150: * we have a character or number
151: */
1.7 pjanzen 152: char *j; /* used for searching thru list of
153: * reserved words */
1.3 mickey 154: if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 155: int seendot = 0,
1.6 deraadt 156: seenexp = 0,
157: seensfx = 0;
1.1 deraadt 158: if (*buf_ptr == '0' &&
159: (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
160: *e_token++ = *buf_ptr++;
161: *e_token++ = *buf_ptr++;
162: while (isxdigit(*buf_ptr)) {
163: CHECK_SIZE_TOKEN;
164: *e_token++ = *buf_ptr++;
165: }
166: }
167: else
168: while (1) {
1.7 pjanzen 169: if (*buf_ptr == '.') {
1.1 deraadt 170: if (seendot)
171: break;
172: else
173: seendot++;
1.7 pjanzen 174: }
1.1 deraadt 175: CHECK_SIZE_TOKEN;
176: *e_token++ = *buf_ptr++;
1.7 pjanzen 177: if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
1.1 deraadt 178: if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
179: break;
180: else {
181: seenexp++;
182: seendot++;
183: CHECK_SIZE_TOKEN;
184: *e_token++ = *buf_ptr++;
185: if (*buf_ptr == '+' || *buf_ptr == '-')
186: *e_token++ = *buf_ptr++;
187: }
1.7 pjanzen 188: }
1.1 deraadt 189: }
1.6 deraadt 190: while (1) {
191: if (!(seensfx & 1) &&
192: (*buf_ptr == 'U' || *buf_ptr == 'u')) {
193: CHECK_SIZE_TOKEN;
194: *e_token++ = *buf_ptr++;
195: seensfx |= 1;
196: continue;
197: }
198: if (!(seensfx & 2) &&
199: (*buf_ptr == 'L' || *buf_ptr == 'l')) {
200: CHECK_SIZE_TOKEN;
201: if (buf_ptr[1] == buf_ptr[0])
202: *e_token++ = *buf_ptr++;
203: *e_token++ = *buf_ptr++;
204: seensfx |= 2;
205: continue;
206: }
207: break;
208: }
1.1 deraadt 209: }
210: else
1.7 pjanzen 211: while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */
1.1 deraadt 212: CHECK_SIZE_TOKEN;
213: *e_token++ = *buf_ptr++;
214: if (buf_ptr >= buf_end)
215: fill_buffer();
216: }
217: *e_token++ = '\0';
218: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
219: if (++buf_ptr >= buf_end)
220: fill_buffer();
221: }
222: ps.its_a_keyword = false;
223: ps.sizeof_keyword = false;
224: if (l_struct) { /* if last token was 'struct', then this token
225: * should be treated as a declaration */
226: l_struct = false;
227: last_code = ident;
228: ps.last_u_d = true;
229: return (decl);
230: }
231: ps.last_u_d = false; /* Operator after indentifier is binary */
232: last_code = ident; /* Remember that this is the code we will
233: * return */
234:
235: /*
236: * This loop will check if the token is a keyword.
237: */
1.4 deraadt 238: for (i = 0; i < nspecials; i++) {
1.7 pjanzen 239: char *p = s_token; /* point at scanned token */
1.4 deraadt 240: j = specials[i].rwd;
1.1 deraadt 241: if (*j++ != *p++ || *j++ != *p++)
242: continue; /* This test depends on the fact that
243: * identifiers are always at least 1 character
244: * long (ie. the first two bytes of the
245: * identifier are always meaningful) */
246: if (p[-1] == 0)
247: break; /* If its a one-character identifier */
248: while (*p++ == *j)
249: if (*j++ == 0)
250: goto found_keyword; /* I wish that C had a multi-level
251: * break... */
252: }
1.4 deraadt 253: if (i < nspecials) { /* we have a keyword */
1.1 deraadt 254: found_keyword:
255: ps.its_a_keyword = true;
256: ps.last_u_d = true;
1.4 deraadt 257: switch (specials[i].rwcode) {
1.1 deraadt 258: case 1: /* it is a switch */
259: return (swstmt);
260: case 2: /* a case or default */
261: return (casestmt);
262:
263: case 3: /* a "struct" */
264: if (ps.p_l_follow)
265: break; /* inside parens: cast */
266: l_struct = true;
267:
268: /*
269: * Next time around, we will want to know that we have had a
270: * 'struct'
271: */
272: case 4: /* one of the declaration keywords */
273: if (ps.p_l_follow) {
274: ps.cast_mask |= 1 << ps.p_l_follow;
275: break; /* inside parens: cast */
276: }
277: last_code = decl;
278: return (decl);
279:
280: case 5: /* if, while, for */
281: return (sp_paren);
282:
283: case 6: /* do, else */
284: return (sp_nparen);
285:
286: case 7:
287: ps.sizeof_keyword = true;
288: default: /* all others are treated like any other
289: * identifier */
290: return (ident);
291: } /* end of switch */
292: } /* end of if (found_it) */
293: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
1.7 pjanzen 294: char *tp = buf_ptr;
1.1 deraadt 295: while (tp < buf_end)
296: if (*tp++ == ')' && (*tp == ';' || *tp == ','))
297: goto not_proc;
1.7 pjanzen 298: strlcpy(ps.procname, token, sizeof ps.procname);
1.1 deraadt 299: ps.in_parameter_declaration = 1;
300: rparen_count = 1;
301: not_proc:;
302: }
303: /*
304: * The following hack attempts to guess whether or not the current
305: * token is in fact a declaration keyword -- one that has been
306: * typedefd
307: */
308: if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
309: && !ps.p_l_follow
310: && !ps.block_init
311: && (ps.last_token == rparen || ps.last_token == semicolon ||
312: ps.last_token == decl ||
313: ps.last_token == lbrace || ps.last_token == rbrace)) {
314: ps.its_a_keyword = true;
315: ps.last_u_d = true;
316: last_code = decl;
317: return decl;
318: }
319: if (last_code == decl) /* if this is a declared variable, then
320: * following sign is unary */
321: ps.last_u_d = true; /* will make "int a -1" work */
322: last_code = ident;
323: return (ident); /* the ident is not in the list */
324: } /* end of procesing for alpanum character */
325:
326: /* Scan a non-alphanumeric token */
327:
328: *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
329: * moved here */
330: *e_token = '\0';
331: if (++buf_ptr >= buf_end)
332: fill_buffer();
333:
334: switch (*token) {
335: case '\n':
336: unary_delim = ps.last_u_d;
337: ps.last_nl = true; /* remember that we just had a newline */
338: code = (had_eof ? 0 : newline);
339:
340: /*
341: * if data has been exausted, the newline is a dummy, and we should
342: * return code to stop
343: */
344: break;
345:
346: case '\'': /* start of quoted character */
347: case '"': /* start of string */
348: qchar = *token;
349: if (troff) {
350: e_token[-1] = '`';
351: if (qchar == '"')
352: *e_token++ = '`';
353: e_token = chfont(&bodyf, &stringf, e_token);
354: }
355: do { /* copy the string */
356: while (1) { /* move one character or [/<char>]<char> */
357: if (*buf_ptr == '\n') {
358: printf("%d: Unterminated literal\n", line_no);
359: goto stop_lit;
360: }
361: CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
362: * since CHECK_SIZE guarantees that there
363: * are at least 5 entries left */
364: *e_token = *buf_ptr++;
365: if (buf_ptr >= buf_end)
366: fill_buffer();
367: if (*e_token == BACKSLASH) { /* if escape, copy extra char */
368: if (*buf_ptr == '\n') /* check for escaped newline */
369: ++line_no;
370: if (troff) {
371: *++e_token = BACKSLASH;
372: if (*buf_ptr == BACKSLASH)
373: *++e_token = BACKSLASH;
374: }
375: *++e_token = *buf_ptr++;
376: ++e_token; /* we must increment this again because we
377: * copied two chars */
378: if (buf_ptr >= buf_end)
379: fill_buffer();
380: }
381: else
382: break; /* we copied one character */
383: } /* end of while (1) */
384: } while (*e_token++ != qchar);
385: if (troff) {
386: e_token = chfont(&stringf, &bodyf, e_token - 1);
387: if (qchar == '"')
388: *e_token++ = '\'';
389: }
390: stop_lit:
391: code = ident;
392: break;
393:
394: case ('('):
395: case ('['):
396: unary_delim = true;
397: code = lparen;
398: break;
399:
400: case (')'):
401: case (']'):
402: code = rparen;
403: break;
404:
405: case '#':
406: unary_delim = ps.last_u_d;
407: code = preesc;
408: break;
409:
410: case '?':
411: unary_delim = true;
412: code = question;
413: break;
414:
415: case (':'):
416: code = colon;
417: unary_delim = true;
418: break;
419:
420: case (';'):
421: unary_delim = true;
422: code = semicolon;
423: break;
424:
425: case ('{'):
426: unary_delim = true;
427:
428: /*
429: * if (ps.in_or_st) ps.block_init = 1;
430: */
431: /* ? code = ps.block_init ? lparen : lbrace; */
432: code = lbrace;
433: break;
434:
435: case ('}'):
436: unary_delim = true;
437: /* ? code = ps.block_init ? rparen : rbrace; */
438: code = rbrace;
439: break;
440:
441: case 014: /* a form feed */
442: unary_delim = ps.last_u_d;
443: ps.last_nl = true; /* remember this so we can set 'ps.col_1'
444: * right */
445: code = form_feed;
446: break;
447:
448: case (','):
449: unary_delim = true;
450: code = comma;
451: break;
452:
453: case '.':
454: unary_delim = false;
455: code = period;
456: break;
457:
458: case '-':
459: case '+': /* check for -, +, --, ++ */
460: code = (ps.last_u_d ? unary_op : binary_op);
461: unary_delim = true;
462:
463: if (*buf_ptr == token[0]) {
464: /* check for doubled character */
465: *e_token++ = *buf_ptr++;
466: /* buffer overflow will be checked at end of loop */
467: if (last_code == ident || last_code == rparen) {
468: code = (ps.last_u_d ? unary_op : postop);
469: /* check for following ++ or -- */
470: unary_delim = false;
471: }
472: }
473: else if (*buf_ptr == '=')
474: /* check for operator += */
475: *e_token++ = *buf_ptr++;
476: else if (*buf_ptr == '>') {
477: /* check for operator -> */
478: *e_token++ = *buf_ptr++;
479: if (!pointer_as_binop) {
480: unary_delim = false;
481: code = unary_op;
482: ps.want_blank = false;
483: }
484: }
485: break; /* buffer overflow will be checked at end of
486: * switch */
487:
488: case '=':
489: if (ps.in_or_st)
490: ps.block_init = 1;
491: #ifdef undef
492: if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
493: e_token[-1] = *buf_ptr++;
494: if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
495: *e_token++ = *buf_ptr++;
496: *e_token++ = '='; /* Flip =+ to += */
497: *e_token = 0;
498: }
499: #else
500: if (*buf_ptr == '=') {/* == */
501: *e_token++ = '='; /* Flip =+ to += */
502: buf_ptr++;
503: *e_token = 0;
504: }
505: #endif
506: code = binary_op;
507: unary_delim = true;
508: break;
509: /* can drop thru!!! */
510:
511: case '>':
512: case '<':
513: case '!': /* ops like <, <<, <=, !=, etc */
514: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
515: *e_token++ = *buf_ptr;
516: if (++buf_ptr >= buf_end)
517: fill_buffer();
518: }
519: if (*buf_ptr == '=')
520: *e_token++ = *buf_ptr++;
521: code = (ps.last_u_d ? unary_op : binary_op);
522: unary_delim = true;
523: break;
524:
525: default:
526: if (token[0] == '/' && *buf_ptr == '*') {
527: /* it is start of comment */
528: *e_token++ = '*';
529:
530: if (++buf_ptr >= buf_end)
531: fill_buffer();
532:
533: code = comment;
534: unary_delim = ps.last_u_d;
535: break;
536: }
537: while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
538: /*
539: * handle ||, &&, etc, and also things as in int *****i
540: */
541: *e_token++ = *buf_ptr;
542: if (++buf_ptr >= buf_end)
543: fill_buffer();
544: }
545: code = (ps.last_u_d ? unary_op : binary_op);
546: unary_delim = true;
547:
548:
549: } /* end of switch */
550: if (code != newline) {
551: l_struct = false;
552: last_code = code;
553: }
554: if (buf_ptr >= buf_end) /* check for input buffer empty */
555: fill_buffer();
556: ps.last_u_d = unary_delim;
557: *e_token = '\0'; /* null terminate the token */
558: return (code);
559: }
560:
561: /*
562: * Add the given keyword to the keyword table, using val as the keyword type
563: */
1.3 mickey 564: void
1.1 deraadt 565: addkey(key, val)
566: char *key;
1.4 deraadt 567: int val;
1.1 deraadt 568: {
1.7 pjanzen 569: struct templ *p;
1.4 deraadt 570: int i = 0;
571:
572: while (i < nspecials) {
573: p = &specials[i];
1.1 deraadt 574: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
575: return;
576: else
1.4 deraadt 577: i++;
578: }
579:
580: if (specials == specialsinit) {
581: /*
582: * Whoa. Must reallocate special table.
583: */
584: nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
585: maxspecials = nspecials;
586: maxspecials += maxspecials >> 2;
587: specials = (struct templ *)malloc(maxspecials * sizeof specials[0]);
588: if (specials == NULL)
1.8 pjanzen 589: err(1, NULL);
1.4 deraadt 590: memmove(specials, specialsinit, sizeof specialsinit);
591: } else if (nspecials >= maxspecials) {
1.10 ! tedu 592: int newspecials = maxspecials + maxspecials >> 2;
! 593: struct templ *specials2;
! 594:
! 595: specials2 = realloc(specials, newspecials * sizeof specials[0]);
! 596: if (specials2 == NULL)
1.8 pjanzen 597: err(1, NULL);
1.10 ! tedu 598: specials = specials2;
! 599: maxspecials = newspecials;
1.4 deraadt 600: }
1.7 pjanzen 601:
1.4 deraadt 602: p = &specials[i];
1.1 deraadt 603: p->rwd = key;
604: p->rwcode = val;
1.4 deraadt 605: nspecials++;
1.1 deraadt 606: return;
607: }