Annotation of src/usr.bin/indent/lexi.c, Revision 1.19
1.19 ! deraadt 1: /* $OpenBSD: lexi.c,v 1.18 2015/01/22 05:35:27 jsg Exp $ */
1.2 deraadt 2:
1.1 deraadt 3: /*
1.7 pjanzen 4: * Copyright (c) 1980, 1993
5: * The Regents of the University of California.
6: * Copyright (c) 1976 Board of Trustees of the University of Illinois.
1.1 deraadt 7: * Copyright (c) 1985 Sun Microsystems, Inc.
8: * All rights reserved.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
1.9 deraadt 18: * 3. Neither the name of the University nor the names of its contributors
1.1 deraadt 19: * may be used to endorse or promote products derived from this software
20: * without specific prior written permission.
21: *
22: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32: * SUCH DAMAGE.
33: */
34:
35: /*
36: * Here we have the token scanner for indent. It scans off one token and puts
37: * it in the global variable "token". It returns a code, indicating the type
38: * of token scanned.
39: */
40:
41: #include <stdio.h>
42: #include <ctype.h>
43: #include <stdlib.h>
44: #include <string.h>
1.7 pjanzen 45: #include <err.h>
1.1 deraadt 46: #include "indent_globs.h"
47: #include "indent_codes.h"
48:
/*
 * Character classes stored in chartype[].  A table value of 0 means the
 * character belongs to neither class.
 */
#define alphanum 1	/* character may be part of an identifier or number */
#define opchar 3	/* character may be part of an operator */
51:
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * code that lexi() switches on once the word has been recognised.
 */
struct templ {
    char       *rwd;		/* keyword text */
    int         rwcode;		/* keyword class, see table below */
};

/*
 * Built-in reserved words.  The rwcode values are interpreted by the
 * keyword switch in lexi():
 *   1  switch
 *   2  case / default
 *   3  struct / union / enum
 *   4  declaration keywords (types and storage classes)
 *   5  if / while / for
 *   6  else / do
 *   7  sizeof
 *   0  recognised as a keyword but otherwise handled like an identifier
 */
struct templ specialsinit[] = {
    { "switch", 1 },
    { "case", 2 },
    { "break", 0 },
    { "struct", 3 },
    { "union", 3 },
    { "enum", 3 },
    { "default", 2 },
    { "int", 4 },
    { "char", 4 },
    { "float", 4 },
    { "double", 4 },
    { "long", 4 },
    { "short", 4 },
    { "typedef", 4 },		/* was misspelled "typdef" and never matched */
    { "unsigned", 4 },
    { "register", 4 },
    { "static", 4 },
    { "global", 4 },
    { "extern", 4 },
    { "void", 4 },
    { "goto", 0 },
    { "return", 0 },
    { "if", 5 },
    { "while", 5 },
    { "for", 5 },
    { "else", 6 },
    { "do", 6 },
    { "sizeof", 7 },
};

/*
 * Live keyword table.  It starts out aliasing the static initializer above;
 * addkey() replaces it with a heap copy the first time a keyword is added.
 */
struct templ *specials = specialsinit;
/* number of entries currently in use in specials[] */
int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* allocated capacity of specials[]; only meaningful once addkey() has set it */
int maxspecials;
91:
/*
 * Character class for each 7-bit ASCII code:
 *   0 = neither class, 1 = alphanumeric, 3 = operator character.
 * Each row below covers sixteen consecutive codes.
 */
char chartype[128] = {
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
113:
114:
115:
116:
117: int
1.11 deraadt 118: lexi(void)
1.1 deraadt 119: {
120: int unary_delim; /* this is set to 1 if the current token
121: * forces a following operator to be unary */
122: static int last_code; /* the last token type returned */
123: static int l_struct; /* set to 1 if the last token was 'struct' */
124: int code; /* internal code to be returned */
125: char qchar; /* the delimiter character for a string */
1.4 deraadt 126: int i;
1.1 deraadt 127:
128: e_token = s_token; /* point to start of place to save token */
129: unary_delim = false;
130: ps.col_1 = ps.last_nl; /* tell world that this token started in
131: * column 1 iff the last thing scanned was nl */
132: ps.last_nl = false;
133:
134: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
135: ps.col_1 = false; /* leading blanks imply token is not in column
136: * 1 */
137: if (++buf_ptr >= buf_end)
138: fill_buffer();
139: }
140:
141: /* Scan an alphanumeric token */
1.7 pjanzen 142: if (chartype[(int)*buf_ptr] == alphanum ||
1.16 deraadt 143: (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
1.1 deraadt 144: /*
145: * we have a character or number
146: */
1.7 pjanzen 147: char *j; /* used for searching thru list of
148: * reserved words */
1.16 deraadt 149: if (isdigit((unsigned char)*buf_ptr) ||
150: (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
1.1 deraadt 151: int seendot = 0,
1.6 deraadt 152: seenexp = 0,
153: seensfx = 0;
1.1 deraadt 154: if (*buf_ptr == '0' &&
155: (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
156: *e_token++ = *buf_ptr++;
157: *e_token++ = *buf_ptr++;
158: while (isxdigit(*buf_ptr)) {
159: CHECK_SIZE_TOKEN;
160: *e_token++ = *buf_ptr++;
161: }
162: }
163: else
164: while (1) {
1.7 pjanzen 165: if (*buf_ptr == '.') {
1.1 deraadt 166: if (seendot)
167: break;
168: else
169: seendot++;
1.7 pjanzen 170: }
1.1 deraadt 171: CHECK_SIZE_TOKEN;
172: *e_token++ = *buf_ptr++;
1.16 deraadt 173: if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
1.1 deraadt 174: if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
175: break;
176: else {
177: seenexp++;
178: seendot++;
179: CHECK_SIZE_TOKEN;
180: *e_token++ = *buf_ptr++;
181: if (*buf_ptr == '+' || *buf_ptr == '-')
182: *e_token++ = *buf_ptr++;
183: }
1.7 pjanzen 184: }
1.1 deraadt 185: }
1.6 deraadt 186: while (1) {
187: if (!(seensfx & 1) &&
188: (*buf_ptr == 'U' || *buf_ptr == 'u')) {
189: CHECK_SIZE_TOKEN;
190: *e_token++ = *buf_ptr++;
191: seensfx |= 1;
192: continue;
193: }
194: if (!(seensfx & 2) &&
195: (*buf_ptr == 'L' || *buf_ptr == 'l')) {
196: CHECK_SIZE_TOKEN;
197: if (buf_ptr[1] == buf_ptr[0])
198: *e_token++ = *buf_ptr++;
199: *e_token++ = *buf_ptr++;
200: seensfx |= 2;
201: continue;
202: }
203: break;
1.18 jsg 204: }
205: if (!(seensfx & 1) &&
206: (*buf_ptr == 'F' || *buf_ptr == 'f')) {
207: CHECK_SIZE_TOKEN;
208: *e_token++ = *buf_ptr++;
209: seensfx |= 1;
1.6 deraadt 210: }
1.1 deraadt 211: }
212: else
1.7 pjanzen 213: while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */
1.1 deraadt 214: CHECK_SIZE_TOKEN;
215: *e_token++ = *buf_ptr++;
216: if (buf_ptr >= buf_end)
217: fill_buffer();
218: }
219: *e_token++ = '\0';
220: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
221: if (++buf_ptr >= buf_end)
222: fill_buffer();
223: }
224: ps.its_a_keyword = false;
225: ps.sizeof_keyword = false;
226: if (l_struct) { /* if last token was 'struct', then this token
227: * should be treated as a declaration */
228: l_struct = false;
229: last_code = ident;
230: ps.last_u_d = true;
231: return (decl);
232: }
1.14 martynas 233: ps.last_u_d = false; /* Operator after identifier is binary */
1.1 deraadt 234: last_code = ident; /* Remember that this is the code we will
235: * return */
236:
237: /*
238: * This loop will check if the token is a keyword.
239: */
1.4 deraadt 240: for (i = 0; i < nspecials; i++) {
1.7 pjanzen 241: char *p = s_token; /* point at scanned token */
1.4 deraadt 242: j = specials[i].rwd;
1.1 deraadt 243: if (*j++ != *p++ || *j++ != *p++)
244: continue; /* This test depends on the fact that
245: * identifiers are always at least 1 character
246: * long (ie. the first two bytes of the
247: * identifier are always meaningful) */
248: if (p[-1] == 0)
249: break; /* If its a one-character identifier */
250: while (*p++ == *j)
251: if (*j++ == 0)
252: goto found_keyword; /* I wish that C had a multi-level
253: * break... */
254: }
1.4 deraadt 255: if (i < nspecials) { /* we have a keyword */
1.1 deraadt 256: found_keyword:
257: ps.its_a_keyword = true;
258: ps.last_u_d = true;
1.4 deraadt 259: switch (specials[i].rwcode) {
1.1 deraadt 260: case 1: /* it is a switch */
261: return (swstmt);
262: case 2: /* a case or default */
263: return (casestmt);
264:
265: case 3: /* a "struct" */
266: if (ps.p_l_follow)
267: break; /* inside parens: cast */
268: l_struct = true;
269:
270: /*
271: * Next time around, we will want to know that we have had a
272: * 'struct'
273: */
274: case 4: /* one of the declaration keywords */
275: if (ps.p_l_follow) {
276: ps.cast_mask |= 1 << ps.p_l_follow;
277: break; /* inside parens: cast */
278: }
279: last_code = decl;
280: return (decl);
281:
282: case 5: /* if, while, for */
283: return (sp_paren);
284:
285: case 6: /* do, else */
286: return (sp_nparen);
287:
288: case 7:
289: ps.sizeof_keyword = true;
290: default: /* all others are treated like any other
291: * identifier */
292: return (ident);
293: } /* end of switch */
294: } /* end of if (found_it) */
295: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
1.7 pjanzen 296: char *tp = buf_ptr;
1.1 deraadt 297: while (tp < buf_end)
298: if (*tp++ == ')' && (*tp == ';' || *tp == ','))
299: goto not_proc;
1.7 pjanzen 300: strlcpy(ps.procname, token, sizeof ps.procname);
1.1 deraadt 301: ps.in_parameter_declaration = 1;
302: rparen_count = 1;
303: not_proc:;
304: }
305: /*
306: * The following hack attempts to guess whether or not the current
307: * token is in fact a declaration keyword -- one that has been
308: * typedefd
309: */
1.16 deraadt 310: if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
311: isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
1.1 deraadt 312: && !ps.p_l_follow
313: && !ps.block_init
314: && (ps.last_token == rparen || ps.last_token == semicolon ||
315: ps.last_token == decl ||
316: ps.last_token == lbrace || ps.last_token == rbrace)) {
317: ps.its_a_keyword = true;
318: ps.last_u_d = true;
319: last_code = decl;
320: return decl;
321: }
322: if (last_code == decl) /* if this is a declared variable, then
323: * following sign is unary */
324: ps.last_u_d = true; /* will make "int a -1" work */
325: last_code = ident;
326: return (ident); /* the ident is not in the list */
327: } /* end of procesing for alpanum character */
328:
329: /* Scan a non-alphanumeric token */
330:
331: *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
332: * moved here */
333: *e_token = '\0';
334: if (++buf_ptr >= buf_end)
335: fill_buffer();
336:
337: switch (*token) {
338: case '\n':
339: unary_delim = ps.last_u_d;
340: ps.last_nl = true; /* remember that we just had a newline */
341: code = (had_eof ? 0 : newline);
342:
343: /*
344: * if data has been exausted, the newline is a dummy, and we should
345: * return code to stop
346: */
347: break;
348:
349: case '\'': /* start of quoted character */
350: case '"': /* start of string */
351: qchar = *token;
352: if (troff) {
353: e_token[-1] = '`';
354: if (qchar == '"')
355: *e_token++ = '`';
356: e_token = chfont(&bodyf, &stringf, e_token);
357: }
358: do { /* copy the string */
359: while (1) { /* move one character or [/<char>]<char> */
360: if (*buf_ptr == '\n') {
361: printf("%d: Unterminated literal\n", line_no);
362: goto stop_lit;
363: }
364: CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
365: * since CHECK_SIZE guarantees that there
366: * are at least 5 entries left */
367: *e_token = *buf_ptr++;
368: if (buf_ptr >= buf_end)
369: fill_buffer();
370: if (*e_token == BACKSLASH) { /* if escape, copy extra char */
371: if (*buf_ptr == '\n') /* check for escaped newline */
372: ++line_no;
373: if (troff) {
374: *++e_token = BACKSLASH;
375: if (*buf_ptr == BACKSLASH)
376: *++e_token = BACKSLASH;
377: }
378: *++e_token = *buf_ptr++;
379: ++e_token; /* we must increment this again because we
380: * copied two chars */
381: if (buf_ptr >= buf_end)
382: fill_buffer();
383: }
384: else
385: break; /* we copied one character */
386: } /* end of while (1) */
387: } while (*e_token++ != qchar);
388: if (troff) {
389: e_token = chfont(&stringf, &bodyf, e_token - 1);
390: if (qchar == '"')
391: *e_token++ = '\'';
392: }
393: stop_lit:
394: code = ident;
395: break;
396:
397: case ('('):
398: case ('['):
399: unary_delim = true;
400: code = lparen;
401: break;
402:
403: case (')'):
404: case (']'):
405: code = rparen;
406: break;
407:
408: case '#':
409: unary_delim = ps.last_u_d;
410: code = preesc;
411: break;
412:
413: case '?':
414: unary_delim = true;
415: code = question;
416: break;
417:
418: case (':'):
419: code = colon;
420: unary_delim = true;
421: break;
422:
423: case (';'):
424: unary_delim = true;
425: code = semicolon;
426: break;
427:
428: case ('{'):
429: unary_delim = true;
430:
431: /*
432: * if (ps.in_or_st) ps.block_init = 1;
433: */
434: /* ? code = ps.block_init ? lparen : lbrace; */
435: code = lbrace;
436: break;
437:
438: case ('}'):
439: unary_delim = true;
440: /* ? code = ps.block_init ? rparen : rbrace; */
441: code = rbrace;
442: break;
443:
444: case 014: /* a form feed */
445: unary_delim = ps.last_u_d;
446: ps.last_nl = true; /* remember this so we can set 'ps.col_1'
447: * right */
448: code = form_feed;
449: break;
450:
451: case (','):
452: unary_delim = true;
453: code = comma;
454: break;
455:
456: case '.':
457: unary_delim = false;
458: code = period;
459: break;
460:
461: case '-':
462: case '+': /* check for -, +, --, ++ */
463: code = (ps.last_u_d ? unary_op : binary_op);
464: unary_delim = true;
465:
466: if (*buf_ptr == token[0]) {
467: /* check for doubled character */
468: *e_token++ = *buf_ptr++;
469: /* buffer overflow will be checked at end of loop */
470: if (last_code == ident || last_code == rparen) {
471: code = (ps.last_u_d ? unary_op : postop);
472: /* check for following ++ or -- */
473: unary_delim = false;
474: }
475: }
476: else if (*buf_ptr == '=')
477: /* check for operator += */
478: *e_token++ = *buf_ptr++;
479: else if (*buf_ptr == '>') {
480: /* check for operator -> */
481: *e_token++ = *buf_ptr++;
482: if (!pointer_as_binop) {
483: unary_delim = false;
484: code = unary_op;
485: ps.want_blank = false;
486: }
487: }
488: break; /* buffer overflow will be checked at end of
489: * switch */
490:
491: case '=':
492: if (ps.in_or_st)
493: ps.block_init = 1;
494: #ifdef undef
495: if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
496: e_token[-1] = *buf_ptr++;
497: if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
498: *e_token++ = *buf_ptr++;
499: *e_token++ = '='; /* Flip =+ to += */
500: *e_token = 0;
501: }
502: #else
503: if (*buf_ptr == '=') {/* == */
504: *e_token++ = '='; /* Flip =+ to += */
505: buf_ptr++;
506: *e_token = 0;
507: }
508: #endif
509: code = binary_op;
510: unary_delim = true;
511: break;
512: /* can drop thru!!! */
513:
514: case '>':
515: case '<':
516: case '!': /* ops like <, <<, <=, !=, etc */
517: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
518: *e_token++ = *buf_ptr;
519: if (++buf_ptr >= buf_end)
520: fill_buffer();
521: }
522: if (*buf_ptr == '=')
523: *e_token++ = *buf_ptr++;
524: code = (ps.last_u_d ? unary_op : binary_op);
525: unary_delim = true;
526: break;
527:
528: default:
529: if (token[0] == '/' && *buf_ptr == '*') {
530: /* it is start of comment */
531: *e_token++ = '*';
532:
533: if (++buf_ptr >= buf_end)
534: fill_buffer();
535:
536: code = comment;
537: unary_delim = ps.last_u_d;
538: break;
539: }
540: while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
541: /*
542: * handle ||, &&, etc, and also things as in int *****i
543: */
544: *e_token++ = *buf_ptr;
545: if (++buf_ptr >= buf_end)
546: fill_buffer();
547: }
548: code = (ps.last_u_d ? unary_op : binary_op);
549: unary_delim = true;
550:
551:
552: } /* end of switch */
553: if (code != newline) {
554: l_struct = false;
555: last_code = code;
556: }
557: if (buf_ptr >= buf_end) /* check for input buffer empty */
558: fill_buffer();
559: ps.last_u_d = unary_delim;
560: *e_token = '\0'; /* null terminate the token */
561: return (code);
562: }
563:
564: /*
565: * Add the given keyword to the keyword table, using val as the keyword type
566: */
1.3 mickey 567: void
1.11 deraadt 568: addkey(char *key, int val)
1.1 deraadt 569: {
1.7 pjanzen 570: struct templ *p;
1.12 millert 571: int i;
1.4 deraadt 572:
1.12 millert 573: for (i = 0; i < nspecials; i++) {
1.4 deraadt 574: p = &specials[i];
1.1 deraadt 575: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
576: return;
1.4 deraadt 577: }
578:
579: if (specials == specialsinit) {
580: /*
581: * Whoa. Must reallocate special table.
582: */
583: nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
1.12 millert 584: maxspecials = nspecials + (nspecials >> 2);
1.19 ! deraadt 585: specials = calloc(maxspecials, sizeof specials[0]);
1.4 deraadt 586: if (specials == NULL)
1.8 pjanzen 587: err(1, NULL);
1.12 millert 588: memcpy(specials, specialsinit, sizeof specialsinit);
1.4 deraadt 589: } else if (nspecials >= maxspecials) {
1.12 millert 590: int newspecials = maxspecials + (maxspecials >> 2);
1.10 tedu 591: struct templ *specials2;
592:
1.17 doug 593: specials2 = reallocarray(specials, newspecials, sizeof(specials[0]));
1.10 tedu 594: if (specials2 == NULL)
1.8 pjanzen 595: err(1, NULL);
1.10 tedu 596: specials = specials2;
597: maxspecials = newspecials;
1.4 deraadt 598: }
1.7 pjanzen 599:
1.12 millert 600: p = &specials[nspecials];
1.1 deraadt 601: p->rwd = key;
602: p->rwcode = val;
1.4 deraadt 603: nspecials++;
1.1 deraadt 604: return;
605: }