Annotation of src/usr.bin/indent/lexi.c, Revision 1.16
1.16 ! deraadt 1: /* $OpenBSD: lexi.c,v 1.15 2009/10/27 23:59:39 deraadt Exp $ */
1.2 deraadt 2:
1.1 deraadt 3: /*
1.7 pjanzen 4: * Copyright (c) 1980, 1993
5: * The Regents of the University of California.
6: * Copyright (c) 1976 Board of Trustees of the University of Illinois.
1.1 deraadt 7: * Copyright (c) 1985 Sun Microsystems, Inc.
8: * All rights reserved.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
1.9 deraadt 18: * 3. Neither the name of the University nor the names of its contributors
1.1 deraadt 19: * may be used to endorse or promote products derived from this software
20: * without specific prior written permission.
21: *
22: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32: * SUCH DAMAGE.
33: */
34:
35: /*
36: * Here we have the token scanner for indent. It scans off one token and puts
37: * it in the global variable "token". It returns a code, indicating the type
38: * of token scanned.
39: */
40:
41: #include <stdio.h>
42: #include <ctype.h>
43: #include <stdlib.h>
44: #include <string.h>
1.7 pjanzen 45: #include <err.h>
1.1 deraadt 46: #include "indent_globs.h"
47: #include "indent_codes.h"
48:
/*
 * Character classes stored in chartype[].  Entries that are neither of
 * these are 0.
 */
enum {
    alphanum = 1,		/* may appear in an identifier or number */
    opchar = 3			/* may appear in an operator */
};
51:
/*
 * One entry of the reserved-word table.
 *
 * rwd    - the keyword text
 * rwcode - keyword class, switched on by lexi():
 *            0  no special handling (returned as an ordinary identifier)
 *            1  switch
 *            2  case / default
 *            3  struct / union / enum
 *            4  declaration keyword
 *            5  if / while / for
 *            6  else / do
 *            7  sizeof
 */
struct templ {
    char       *rwd;
    int         rwcode;
};

/*
 * Built-in reserved words.  addkey() copies this table to the heap the
 * first time a keyword is added at run time.
 */
struct templ specialsinit[] = {
    { "switch", 1 },
    { "case", 2 },
    { "break", 0 },
    { "struct", 3 },
    { "union", 3 },
    { "enum", 3 },
    { "default", 2 },
    { "int", 4 },
    { "char", 4 },
    { "float", 4 },
    { "double", 4 },
    { "long", 4 },
    { "short", 4 },
    { "typedef", 4 },		/* was misspelled "typdef" and never matched */
    { "unsigned", 4 },
    { "register", 4 },
    { "static", 4 },
    { "global", 4 },
    { "extern", 4 },
    { "void", 4 },
    { "goto", 0 },
    { "return", 0 },
    { "if", 5 },
    { "while", 5 },
    { "for", 5 },
    { "else", 6 },
    { "do", 6 },
    { "sizeof", 7 },
};

/* Active keyword table; points at specialsinit[] until addkey() grows it */
struct templ *specials = specialsinit;
/* Number of entries currently in use in specials[] */
int         nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* Allocated capacity of specials[] once it lives on the heap */
int         maxspecials;
91:
/*
 * Classification of each 7-bit character code, used to decide what kind
 * of token a character can start or continue:
 *   1  alphanumeric (letters, digits, '_' and '$')
 *   3  operator character
 *   0  anything else
 */
char        chartype[128] = {
    /* 0x00 - 0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0f */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1f */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  SP ! " # $ % & '  */ 0, 3, 0, 0, 1, 3, 3, 0,
    /*  ( ) * + , - . /   */ 0, 0, 3, 3, 0, 3, 0, 3,
    /*  0 1 2 3 4 5 6 7   */ 1, 1, 1, 1, 1, 1, 1, 1,
    /*  8 9 : ; < = > ?   */ 1, 1, 0, 0, 3, 3, 3, 3,
    /*  @ A B C D E F G   */ 0, 1, 1, 1, 1, 1, 1, 1,
    /*  H I J K L M N O   */ 1, 1, 1, 1, 1, 1, 1, 1,
    /*  P Q R S T U V W   */ 1, 1, 1, 1, 1, 1, 1, 1,
    /*  X Y Z [ \ ] ^ _   */ 1, 1, 1, 0, 0, 0, 3, 1,
    /*  ` a b c d e f g   */ 0, 1, 1, 1, 1, 1, 1, 1,
    /*  h i j k l m n o   */ 1, 1, 1, 1, 1, 1, 1, 1,
    /*  p q r s t u v w   */ 1, 1, 1, 1, 1, 1, 1, 1,
    /*  x y z { | } ~ DEL */ 1, 1, 1, 0, 3, 0, 3, 0
};
113:
114:
115:
116:
117: int
1.11 deraadt 118: lexi(void)
1.1 deraadt 119: {
120: int unary_delim; /* this is set to 1 if the current token
121: * forces a following operator to be unary */
122: static int last_code; /* the last token type returned */
123: static int l_struct; /* set to 1 if the last token was 'struct' */
124: int code; /* internal code to be returned */
125: char qchar; /* the delimiter character for a string */
1.4 deraadt 126: int i;
1.1 deraadt 127:
128: e_token = s_token; /* point to start of place to save token */
129: unary_delim = false;
130: ps.col_1 = ps.last_nl; /* tell world that this token started in
131: * column 1 iff the last thing scanned was nl */
132: ps.last_nl = false;
133:
134: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
135: ps.col_1 = false; /* leading blanks imply token is not in column
136: * 1 */
137: if (++buf_ptr >= buf_end)
138: fill_buffer();
139: }
140:
141: /* Scan an alphanumeric token */
1.7 pjanzen 142: if (chartype[(int)*buf_ptr] == alphanum ||
1.16 ! deraadt 143: (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
1.1 deraadt 144: /*
145: * we have a character or number
146: */
1.7 pjanzen 147: char *j; /* used for searching thru list of
148: * reserved words */
1.16 ! deraadt 149: if (isdigit((unsigned char)*buf_ptr) ||
! 150: (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
1.1 deraadt 151: int seendot = 0,
1.6 deraadt 152: seenexp = 0,
153: seensfx = 0;
1.1 deraadt 154: if (*buf_ptr == '0' &&
155: (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
156: *e_token++ = *buf_ptr++;
157: *e_token++ = *buf_ptr++;
158: while (isxdigit(*buf_ptr)) {
159: CHECK_SIZE_TOKEN;
160: *e_token++ = *buf_ptr++;
161: }
162: }
163: else
164: while (1) {
1.7 pjanzen 165: if (*buf_ptr == '.') {
1.1 deraadt 166: if (seendot)
167: break;
168: else
169: seendot++;
1.7 pjanzen 170: }
1.1 deraadt 171: CHECK_SIZE_TOKEN;
172: *e_token++ = *buf_ptr++;
1.16 ! deraadt 173: if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
1.1 deraadt 174: if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
175: break;
176: else {
177: seenexp++;
178: seendot++;
179: CHECK_SIZE_TOKEN;
180: *e_token++ = *buf_ptr++;
181: if (*buf_ptr == '+' || *buf_ptr == '-')
182: *e_token++ = *buf_ptr++;
183: }
1.7 pjanzen 184: }
1.1 deraadt 185: }
1.6 deraadt 186: while (1) {
187: if (!(seensfx & 1) &&
188: (*buf_ptr == 'U' || *buf_ptr == 'u')) {
189: CHECK_SIZE_TOKEN;
190: *e_token++ = *buf_ptr++;
191: seensfx |= 1;
192: continue;
193: }
194: if (!(seensfx & 2) &&
195: (*buf_ptr == 'L' || *buf_ptr == 'l')) {
196: CHECK_SIZE_TOKEN;
197: if (buf_ptr[1] == buf_ptr[0])
198: *e_token++ = *buf_ptr++;
199: *e_token++ = *buf_ptr++;
200: seensfx |= 2;
201: continue;
202: }
203: break;
204: }
1.1 deraadt 205: }
206: else
1.7 pjanzen 207: while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */
1.1 deraadt 208: CHECK_SIZE_TOKEN;
209: *e_token++ = *buf_ptr++;
210: if (buf_ptr >= buf_end)
211: fill_buffer();
212: }
213: *e_token++ = '\0';
214: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
215: if (++buf_ptr >= buf_end)
216: fill_buffer();
217: }
218: ps.its_a_keyword = false;
219: ps.sizeof_keyword = false;
220: if (l_struct) { /* if last token was 'struct', then this token
221: * should be treated as a declaration */
222: l_struct = false;
223: last_code = ident;
224: ps.last_u_d = true;
225: return (decl);
226: }
1.14 martynas 227: ps.last_u_d = false; /* Operator after identifier is binary */
1.1 deraadt 228: last_code = ident; /* Remember that this is the code we will
229: * return */
230:
231: /*
232: * This loop will check if the token is a keyword.
233: */
1.4 deraadt 234: for (i = 0; i < nspecials; i++) {
1.7 pjanzen 235: char *p = s_token; /* point at scanned token */
1.4 deraadt 236: j = specials[i].rwd;
1.1 deraadt 237: if (*j++ != *p++ || *j++ != *p++)
238: continue; /* This test depends on the fact that
239: * identifiers are always at least 1 character
240: * long (ie. the first two bytes of the
241: * identifier are always meaningful) */
242: if (p[-1] == 0)
243: break; /* If its a one-character identifier */
244: while (*p++ == *j)
245: if (*j++ == 0)
246: goto found_keyword; /* I wish that C had a multi-level
247: * break... */
248: }
1.4 deraadt 249: if (i < nspecials) { /* we have a keyword */
1.1 deraadt 250: found_keyword:
251: ps.its_a_keyword = true;
252: ps.last_u_d = true;
1.4 deraadt 253: switch (specials[i].rwcode) {
1.1 deraadt 254: case 1: /* it is a switch */
255: return (swstmt);
256: case 2: /* a case or default */
257: return (casestmt);
258:
259: case 3: /* a "struct" */
260: if (ps.p_l_follow)
261: break; /* inside parens: cast */
262: l_struct = true;
263:
264: /*
265: * Next time around, we will want to know that we have had a
266: * 'struct'
267: */
268: case 4: /* one of the declaration keywords */
269: if (ps.p_l_follow) {
270: ps.cast_mask |= 1 << ps.p_l_follow;
271: break; /* inside parens: cast */
272: }
273: last_code = decl;
274: return (decl);
275:
276: case 5: /* if, while, for */
277: return (sp_paren);
278:
279: case 6: /* do, else */
280: return (sp_nparen);
281:
282: case 7:
283: ps.sizeof_keyword = true;
284: default: /* all others are treated like any other
285: * identifier */
286: return (ident);
287: } /* end of switch */
288: } /* end of if (found_it) */
289: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
1.7 pjanzen 290: char *tp = buf_ptr;
1.1 deraadt 291: while (tp < buf_end)
292: if (*tp++ == ')' && (*tp == ';' || *tp == ','))
293: goto not_proc;
1.7 pjanzen 294: strlcpy(ps.procname, token, sizeof ps.procname);
1.1 deraadt 295: ps.in_parameter_declaration = 1;
296: rparen_count = 1;
297: not_proc:;
298: }
299: /*
300: * The following hack attempts to guess whether or not the current
301: * token is in fact a declaration keyword -- one that has been
302: * typedefd
303: */
1.16 ! deraadt 304: if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
! 305: isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
1.1 deraadt 306: && !ps.p_l_follow
307: && !ps.block_init
308: && (ps.last_token == rparen || ps.last_token == semicolon ||
309: ps.last_token == decl ||
310: ps.last_token == lbrace || ps.last_token == rbrace)) {
311: ps.its_a_keyword = true;
312: ps.last_u_d = true;
313: last_code = decl;
314: return decl;
315: }
316: if (last_code == decl) /* if this is a declared variable, then
317: * following sign is unary */
318: ps.last_u_d = true; /* will make "int a -1" work */
319: last_code = ident;
320: return (ident); /* the ident is not in the list */
321: } /* end of procesing for alpanum character */
322:
323: /* Scan a non-alphanumeric token */
324:
325: *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
326: * moved here */
327: *e_token = '\0';
328: if (++buf_ptr >= buf_end)
329: fill_buffer();
330:
331: switch (*token) {
332: case '\n':
333: unary_delim = ps.last_u_d;
334: ps.last_nl = true; /* remember that we just had a newline */
335: code = (had_eof ? 0 : newline);
336:
337: /*
338: * if data has been exausted, the newline is a dummy, and we should
339: * return code to stop
340: */
341: break;
342:
343: case '\'': /* start of quoted character */
344: case '"': /* start of string */
345: qchar = *token;
346: if (troff) {
347: e_token[-1] = '`';
348: if (qchar == '"')
349: *e_token++ = '`';
350: e_token = chfont(&bodyf, &stringf, e_token);
351: }
352: do { /* copy the string */
353: while (1) { /* move one character or [/<char>]<char> */
354: if (*buf_ptr == '\n') {
355: printf("%d: Unterminated literal\n", line_no);
356: goto stop_lit;
357: }
358: CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
359: * since CHECK_SIZE guarantees that there
360: * are at least 5 entries left */
361: *e_token = *buf_ptr++;
362: if (buf_ptr >= buf_end)
363: fill_buffer();
364: if (*e_token == BACKSLASH) { /* if escape, copy extra char */
365: if (*buf_ptr == '\n') /* check for escaped newline */
366: ++line_no;
367: if (troff) {
368: *++e_token = BACKSLASH;
369: if (*buf_ptr == BACKSLASH)
370: *++e_token = BACKSLASH;
371: }
372: *++e_token = *buf_ptr++;
373: ++e_token; /* we must increment this again because we
374: * copied two chars */
375: if (buf_ptr >= buf_end)
376: fill_buffer();
377: }
378: else
379: break; /* we copied one character */
380: } /* end of while (1) */
381: } while (*e_token++ != qchar);
382: if (troff) {
383: e_token = chfont(&stringf, &bodyf, e_token - 1);
384: if (qchar == '"')
385: *e_token++ = '\'';
386: }
387: stop_lit:
388: code = ident;
389: break;
390:
391: case ('('):
392: case ('['):
393: unary_delim = true;
394: code = lparen;
395: break;
396:
397: case (')'):
398: case (']'):
399: code = rparen;
400: break;
401:
402: case '#':
403: unary_delim = ps.last_u_d;
404: code = preesc;
405: break;
406:
407: case '?':
408: unary_delim = true;
409: code = question;
410: break;
411:
412: case (':'):
413: code = colon;
414: unary_delim = true;
415: break;
416:
417: case (';'):
418: unary_delim = true;
419: code = semicolon;
420: break;
421:
422: case ('{'):
423: unary_delim = true;
424:
425: /*
426: * if (ps.in_or_st) ps.block_init = 1;
427: */
428: /* ? code = ps.block_init ? lparen : lbrace; */
429: code = lbrace;
430: break;
431:
432: case ('}'):
433: unary_delim = true;
434: /* ? code = ps.block_init ? rparen : rbrace; */
435: code = rbrace;
436: break;
437:
438: case 014: /* a form feed */
439: unary_delim = ps.last_u_d;
440: ps.last_nl = true; /* remember this so we can set 'ps.col_1'
441: * right */
442: code = form_feed;
443: break;
444:
445: case (','):
446: unary_delim = true;
447: code = comma;
448: break;
449:
450: case '.':
451: unary_delim = false;
452: code = period;
453: break;
454:
455: case '-':
456: case '+': /* check for -, +, --, ++ */
457: code = (ps.last_u_d ? unary_op : binary_op);
458: unary_delim = true;
459:
460: if (*buf_ptr == token[0]) {
461: /* check for doubled character */
462: *e_token++ = *buf_ptr++;
463: /* buffer overflow will be checked at end of loop */
464: if (last_code == ident || last_code == rparen) {
465: code = (ps.last_u_d ? unary_op : postop);
466: /* check for following ++ or -- */
467: unary_delim = false;
468: }
469: }
470: else if (*buf_ptr == '=')
471: /* check for operator += */
472: *e_token++ = *buf_ptr++;
473: else if (*buf_ptr == '>') {
474: /* check for operator -> */
475: *e_token++ = *buf_ptr++;
476: if (!pointer_as_binop) {
477: unary_delim = false;
478: code = unary_op;
479: ps.want_blank = false;
480: }
481: }
482: break; /* buffer overflow will be checked at end of
483: * switch */
484:
485: case '=':
486: if (ps.in_or_st)
487: ps.block_init = 1;
488: #ifdef undef
489: if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
490: e_token[-1] = *buf_ptr++;
491: if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
492: *e_token++ = *buf_ptr++;
493: *e_token++ = '='; /* Flip =+ to += */
494: *e_token = 0;
495: }
496: #else
497: if (*buf_ptr == '=') {/* == */
498: *e_token++ = '='; /* Flip =+ to += */
499: buf_ptr++;
500: *e_token = 0;
501: }
502: #endif
503: code = binary_op;
504: unary_delim = true;
505: break;
506: /* can drop thru!!! */
507:
508: case '>':
509: case '<':
510: case '!': /* ops like <, <<, <=, !=, etc */
511: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
512: *e_token++ = *buf_ptr;
513: if (++buf_ptr >= buf_end)
514: fill_buffer();
515: }
516: if (*buf_ptr == '=')
517: *e_token++ = *buf_ptr++;
518: code = (ps.last_u_d ? unary_op : binary_op);
519: unary_delim = true;
520: break;
521:
522: default:
523: if (token[0] == '/' && *buf_ptr == '*') {
524: /* it is start of comment */
525: *e_token++ = '*';
526:
527: if (++buf_ptr >= buf_end)
528: fill_buffer();
529:
530: code = comment;
531: unary_delim = ps.last_u_d;
532: break;
533: }
534: while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
535: /*
536: * handle ||, &&, etc, and also things as in int *****i
537: */
538: *e_token++ = *buf_ptr;
539: if (++buf_ptr >= buf_end)
540: fill_buffer();
541: }
542: code = (ps.last_u_d ? unary_op : binary_op);
543: unary_delim = true;
544:
545:
546: } /* end of switch */
547: if (code != newline) {
548: l_struct = false;
549: last_code = code;
550: }
551: if (buf_ptr >= buf_end) /* check for input buffer empty */
552: fill_buffer();
553: ps.last_u_d = unary_delim;
554: *e_token = '\0'; /* null terminate the token */
555: return (code);
556: }
557:
558: /*
559: * Add the given keyword to the keyword table, using val as the keyword type
560: */
1.3 mickey 561: void
1.11 deraadt 562: addkey(char *key, int val)
1.1 deraadt 563: {
1.7 pjanzen 564: struct templ *p;
1.12 millert 565: int i;
1.4 deraadt 566:
1.12 millert 567: for (i = 0; i < nspecials; i++) {
1.4 deraadt 568: p = &specials[i];
1.1 deraadt 569: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
570: return;
1.4 deraadt 571: }
572:
573: if (specials == specialsinit) {
574: /*
575: * Whoa. Must reallocate special table.
576: */
577: nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
1.12 millert 578: maxspecials = nspecials + (nspecials >> 2);
1.13 deraadt 579: specials = (struct templ *)calloc(maxspecials, sizeof specials[0]);
1.4 deraadt 580: if (specials == NULL)
1.8 pjanzen 581: err(1, NULL);
1.12 millert 582: memcpy(specials, specialsinit, sizeof specialsinit);
1.4 deraadt 583: } else if (nspecials >= maxspecials) {
1.12 millert 584: int newspecials = maxspecials + (maxspecials >> 2);
1.10 tedu 585: struct templ *specials2;
586:
587: specials2 = realloc(specials, newspecials * sizeof specials[0]);
588: if (specials2 == NULL)
1.8 pjanzen 589: err(1, NULL);
1.10 tedu 590: specials = specials2;
591: maxspecials = newspecials;
1.4 deraadt 592: }
1.7 pjanzen 593:
1.12 millert 594: p = &specials[nspecials];
1.1 deraadt 595: p->rwd = key;
596: p->rwcode = val;
1.4 deraadt 597: nspecials++;
1.1 deraadt 598: return;
599: }