Annotation of src/usr.bin/indent/lexi.c, Revision 1.15
1.15 ! deraadt 1: /* $OpenBSD: lexi.c,v 1.14 2007/11/27 16:22:14 martynas Exp $ */
1.2 deraadt 2:
1.1 deraadt 3: /*
1.7 pjanzen 4: * Copyright (c) 1980, 1993
5: * The Regents of the University of California.
6: * Copyright (c) 1976 Board of Trustees of the University of Illinois.
1.1 deraadt 7: * Copyright (c) 1985 Sun Microsystems, Inc.
8: * All rights reserved.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
1.9 deraadt 18: * 3. Neither the name of the University nor the names of its contributors
1.1 deraadt 19: * may be used to endorse or promote products derived from this software
20: * without specific prior written permission.
21: *
22: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32: * SUCH DAMAGE.
33: */
34:
35: /*
36: * Here we have the token scanner for indent. It scans off one token and puts
37: * it in the global variable "token". It returns a code, indicating the type
38: * of token scanned.
39: */
40:
41: #include <stdio.h>
42: #include <ctype.h>
43: #include <stdlib.h>
44: #include <string.h>
1.7 pjanzen 45: #include <err.h>
1.1 deraadt 46: #include "indent_globs.h"
47: #include "indent_codes.h"
48:
/*
 * Character classes stored in chartype[]: a class of 0 means the character
 * is neither part of an identifier/number nor an operator character.
 */
enum {
	alphanum = 1,		/* letters, digits, '_' and '$' */
	opchar = 3		/* characters that can form operators */
};
51:
/*
 * One entry of the reserved-word table consulted by lexi().
 *
 * rwcode selects how lexi() classifies the word:
 *   1  switch                      2  case / default
 *   3  struct / union / enum       4  declaration keyword
 *   5  if / while / for            6  do / else
 *   7  sizeof                      anything else: ordinary identifier
 */
struct templ {
	char	*rwd;		/* spelling of the reserved word */
	int	 rwcode;	/* classification code, see above */
};

/*
 * Built-in reserved words.  addkey() copies this table to the heap the
 * first time an additional keyword is registered.
 */
struct templ specialsinit[] = {
	{ "switch", 1 },
	{ "case", 2 },
	{ "break", 0 },
	{ "struct", 3 },
	{ "union", 3 },
	{ "enum", 3 },
	{ "default", 2 },
	{ "int", 4 },
	{ "char", 4 },
	{ "float", 4 },
	{ "double", 4 },
	{ "long", 4 },
	{ "short", 4 },
	{ "typedef", 4 },	/* was misspelled "typdef" and never matched */
	{ "unsigned", 4 },
	{ "register", 4 },
	{ "static", 4 },
	{ "global", 4 },
	{ "extern", 4 },
	{ "void", 4 },
	{ "goto", 0 },
	{ "return", 0 },
	{ "if", 5 },
	{ "while", 5 },
	{ "for", 5 },
	{ "else", 6 },
	{ "do", 6 },
	{ "sizeof", 7 },
};
87:
1.4 deraadt 88: struct templ *specials = specialsinit;
89: int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
90: int maxspecials;
91:
/*
 * Classification of each 7-bit character, used to decide whether a
 * character is alphanumeric (1), an operator character (3), or neither (0).
 */
char chartype[128] = {
	0, 0, 0, 0, 0, 0, 0, 0,		/* 0x00 - 0x07 */
	0, 0, 0, 0, 0, 0, 0, 0,		/* 0x08 - 0x0f */
	0, 0, 0, 0, 0, 0, 0, 0,		/* 0x10 - 0x17 */
	0, 0, 0, 0, 0, 0, 0, 0,		/* 0x18 - 0x1f */
	0, 3, 0, 0, 1, 3, 3, 0,		/* space ! " # $ % & ' */
	0, 0, 3, 3, 0, 3, 0, 3,		/* ( ) * + , - . / */
	1, 1, 1, 1, 1, 1, 1, 1,		/* 0 - 7 */
	1, 1, 0, 0, 3, 3, 3, 3,		/* 8 9 : ; < = > ? */
	0, 1, 1, 1, 1, 1, 1, 1,		/* @ A - G */
	1, 1, 1, 1, 1, 1, 1, 1,		/* H - O */
	1, 1, 1, 1, 1, 1, 1, 1,		/* P - W */
	1, 1, 1, 0, 0, 0, 3, 1,		/* X Y Z [ \ ] ^ _ */
	0, 1, 1, 1, 1, 1, 1, 1,		/* ` a - g */
	1, 1, 1, 1, 1, 1, 1, 1,		/* h - o */
	1, 1, 1, 1, 1, 1, 1, 1,		/* p - w */
	1, 1, 1, 0, 3, 0, 3, 0		/* x y z { | } ~ DEL */
};
113:
114:
115:
116:
117: int
1.11 deraadt 118: lexi(void)
1.1 deraadt 119: {
120: int unary_delim; /* this is set to 1 if the current token
121: * forces a following operator to be unary */
122: static int last_code; /* the last token type returned */
123: static int l_struct; /* set to 1 if the last token was 'struct' */
124: int code; /* internal code to be returned */
125: char qchar; /* the delimiter character for a string */
1.4 deraadt 126: int i;
1.1 deraadt 127:
128: e_token = s_token; /* point to start of place to save token */
129: unary_delim = false;
130: ps.col_1 = ps.last_nl; /* tell world that this token started in
131: * column 1 iff the last thing scanned was nl */
132: ps.last_nl = false;
133:
134: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
135: ps.col_1 = false; /* leading blanks imply token is not in column
136: * 1 */
137: if (++buf_ptr >= buf_end)
138: fill_buffer();
139: }
140:
141: /* Scan an alphanumeric token */
1.7 pjanzen 142: if (chartype[(int)*buf_ptr] == alphanum ||
1.3 mickey 143: (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 144: /*
145: * we have a character or number
146: */
1.7 pjanzen 147: char *j; /* used for searching thru list of
148: * reserved words */
1.3 mickey 149: if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 150: int seendot = 0,
1.6 deraadt 151: seenexp = 0,
152: seensfx = 0;
1.1 deraadt 153: if (*buf_ptr == '0' &&
154: (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
155: *e_token++ = *buf_ptr++;
156: *e_token++ = *buf_ptr++;
157: while (isxdigit(*buf_ptr)) {
158: CHECK_SIZE_TOKEN;
159: *e_token++ = *buf_ptr++;
160: }
161: }
162: else
163: while (1) {
1.7 pjanzen 164: if (*buf_ptr == '.') {
1.1 deraadt 165: if (seendot)
166: break;
167: else
168: seendot++;
1.7 pjanzen 169: }
1.1 deraadt 170: CHECK_SIZE_TOKEN;
171: *e_token++ = *buf_ptr++;
1.7 pjanzen 172: if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
1.1 deraadt 173: if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
174: break;
175: else {
176: seenexp++;
177: seendot++;
178: CHECK_SIZE_TOKEN;
179: *e_token++ = *buf_ptr++;
180: if (*buf_ptr == '+' || *buf_ptr == '-')
181: *e_token++ = *buf_ptr++;
182: }
1.7 pjanzen 183: }
1.1 deraadt 184: }
1.6 deraadt 185: while (1) {
186: if (!(seensfx & 1) &&
187: (*buf_ptr == 'U' || *buf_ptr == 'u')) {
188: CHECK_SIZE_TOKEN;
189: *e_token++ = *buf_ptr++;
190: seensfx |= 1;
191: continue;
192: }
193: if (!(seensfx & 2) &&
194: (*buf_ptr == 'L' || *buf_ptr == 'l')) {
195: CHECK_SIZE_TOKEN;
196: if (buf_ptr[1] == buf_ptr[0])
197: *e_token++ = *buf_ptr++;
198: *e_token++ = *buf_ptr++;
199: seensfx |= 2;
200: continue;
201: }
202: break;
203: }
1.1 deraadt 204: }
205: else
1.7 pjanzen 206: while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */
1.1 deraadt 207: CHECK_SIZE_TOKEN;
208: *e_token++ = *buf_ptr++;
209: if (buf_ptr >= buf_end)
210: fill_buffer();
211: }
212: *e_token++ = '\0';
213: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
214: if (++buf_ptr >= buf_end)
215: fill_buffer();
216: }
217: ps.its_a_keyword = false;
218: ps.sizeof_keyword = false;
219: if (l_struct) { /* if last token was 'struct', then this token
220: * should be treated as a declaration */
221: l_struct = false;
222: last_code = ident;
223: ps.last_u_d = true;
224: return (decl);
225: }
1.14 martynas 226: ps.last_u_d = false; /* Operator after identifier is binary */
1.1 deraadt 227: last_code = ident; /* Remember that this is the code we will
228: * return */
229:
230: /*
231: * This loop will check if the token is a keyword.
232: */
1.4 deraadt 233: for (i = 0; i < nspecials; i++) {
1.7 pjanzen 234: char *p = s_token; /* point at scanned token */
1.4 deraadt 235: j = specials[i].rwd;
1.1 deraadt 236: if (*j++ != *p++ || *j++ != *p++)
237: continue; /* This test depends on the fact that
238: * identifiers are always at least 1 character
239: * long (ie. the first two bytes of the
240: * identifier are always meaningful) */
241: if (p[-1] == 0)
242: break; /* If its a one-character identifier */
243: while (*p++ == *j)
244: if (*j++ == 0)
245: goto found_keyword; /* I wish that C had a multi-level
246: * break... */
247: }
1.4 deraadt 248: if (i < nspecials) { /* we have a keyword */
1.1 deraadt 249: found_keyword:
250: ps.its_a_keyword = true;
251: ps.last_u_d = true;
1.4 deraadt 252: switch (specials[i].rwcode) {
1.1 deraadt 253: case 1: /* it is a switch */
254: return (swstmt);
255: case 2: /* a case or default */
256: return (casestmt);
257:
258: case 3: /* a "struct" */
259: if (ps.p_l_follow)
260: break; /* inside parens: cast */
261: l_struct = true;
262:
263: /*
264: * Next time around, we will want to know that we have had a
265: * 'struct'
266: */
267: case 4: /* one of the declaration keywords */
268: if (ps.p_l_follow) {
269: ps.cast_mask |= 1 << ps.p_l_follow;
270: break; /* inside parens: cast */
271: }
272: last_code = decl;
273: return (decl);
274:
275: case 5: /* if, while, for */
276: return (sp_paren);
277:
278: case 6: /* do, else */
279: return (sp_nparen);
280:
281: case 7:
282: ps.sizeof_keyword = true;
283: default: /* all others are treated like any other
284: * identifier */
285: return (ident);
286: } /* end of switch */
287: } /* end of if (found_it) */
288: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
1.7 pjanzen 289: char *tp = buf_ptr;
1.1 deraadt 290: while (tp < buf_end)
291: if (*tp++ == ')' && (*tp == ';' || *tp == ','))
292: goto not_proc;
1.7 pjanzen 293: strlcpy(ps.procname, token, sizeof ps.procname);
1.1 deraadt 294: ps.in_parameter_declaration = 1;
295: rparen_count = 1;
296: not_proc:;
297: }
298: /*
299: * The following hack attempts to guess whether or not the current
300: * token is in fact a declaration keyword -- one that has been
301: * typedefd
302: */
303: if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
304: && !ps.p_l_follow
305: && !ps.block_init
306: && (ps.last_token == rparen || ps.last_token == semicolon ||
307: ps.last_token == decl ||
308: ps.last_token == lbrace || ps.last_token == rbrace)) {
309: ps.its_a_keyword = true;
310: ps.last_u_d = true;
311: last_code = decl;
312: return decl;
313: }
314: if (last_code == decl) /* if this is a declared variable, then
315: * following sign is unary */
316: ps.last_u_d = true; /* will make "int a -1" work */
317: last_code = ident;
318: return (ident); /* the ident is not in the list */
319: } /* end of procesing for alpanum character */
320:
321: /* Scan a non-alphanumeric token */
322:
323: *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
324: * moved here */
325: *e_token = '\0';
326: if (++buf_ptr >= buf_end)
327: fill_buffer();
328:
329: switch (*token) {
330: case '\n':
331: unary_delim = ps.last_u_d;
332: ps.last_nl = true; /* remember that we just had a newline */
333: code = (had_eof ? 0 : newline);
334:
335: /*
336: * if data has been exausted, the newline is a dummy, and we should
337: * return code to stop
338: */
339: break;
340:
341: case '\'': /* start of quoted character */
342: case '"': /* start of string */
343: qchar = *token;
344: if (troff) {
345: e_token[-1] = '`';
346: if (qchar == '"')
347: *e_token++ = '`';
348: e_token = chfont(&bodyf, &stringf, e_token);
349: }
350: do { /* copy the string */
351: while (1) { /* move one character or [/<char>]<char> */
352: if (*buf_ptr == '\n') {
353: printf("%d: Unterminated literal\n", line_no);
354: goto stop_lit;
355: }
356: CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
357: * since CHECK_SIZE guarantees that there
358: * are at least 5 entries left */
359: *e_token = *buf_ptr++;
360: if (buf_ptr >= buf_end)
361: fill_buffer();
362: if (*e_token == BACKSLASH) { /* if escape, copy extra char */
363: if (*buf_ptr == '\n') /* check for escaped newline */
364: ++line_no;
365: if (troff) {
366: *++e_token = BACKSLASH;
367: if (*buf_ptr == BACKSLASH)
368: *++e_token = BACKSLASH;
369: }
370: *++e_token = *buf_ptr++;
371: ++e_token; /* we must increment this again because we
372: * copied two chars */
373: if (buf_ptr >= buf_end)
374: fill_buffer();
375: }
376: else
377: break; /* we copied one character */
378: } /* end of while (1) */
379: } while (*e_token++ != qchar);
380: if (troff) {
381: e_token = chfont(&stringf, &bodyf, e_token - 1);
382: if (qchar == '"')
383: *e_token++ = '\'';
384: }
385: stop_lit:
386: code = ident;
387: break;
388:
389: case ('('):
390: case ('['):
391: unary_delim = true;
392: code = lparen;
393: break;
394:
395: case (')'):
396: case (']'):
397: code = rparen;
398: break;
399:
400: case '#':
401: unary_delim = ps.last_u_d;
402: code = preesc;
403: break;
404:
405: case '?':
406: unary_delim = true;
407: code = question;
408: break;
409:
410: case (':'):
411: code = colon;
412: unary_delim = true;
413: break;
414:
415: case (';'):
416: unary_delim = true;
417: code = semicolon;
418: break;
419:
420: case ('{'):
421: unary_delim = true;
422:
423: /*
424: * if (ps.in_or_st) ps.block_init = 1;
425: */
426: /* ? code = ps.block_init ? lparen : lbrace; */
427: code = lbrace;
428: break;
429:
430: case ('}'):
431: unary_delim = true;
432: /* ? code = ps.block_init ? rparen : rbrace; */
433: code = rbrace;
434: break;
435:
436: case 014: /* a form feed */
437: unary_delim = ps.last_u_d;
438: ps.last_nl = true; /* remember this so we can set 'ps.col_1'
439: * right */
440: code = form_feed;
441: break;
442:
443: case (','):
444: unary_delim = true;
445: code = comma;
446: break;
447:
448: case '.':
449: unary_delim = false;
450: code = period;
451: break;
452:
453: case '-':
454: case '+': /* check for -, +, --, ++ */
455: code = (ps.last_u_d ? unary_op : binary_op);
456: unary_delim = true;
457:
458: if (*buf_ptr == token[0]) {
459: /* check for doubled character */
460: *e_token++ = *buf_ptr++;
461: /* buffer overflow will be checked at end of loop */
462: if (last_code == ident || last_code == rparen) {
463: code = (ps.last_u_d ? unary_op : postop);
464: /* check for following ++ or -- */
465: unary_delim = false;
466: }
467: }
468: else if (*buf_ptr == '=')
469: /* check for operator += */
470: *e_token++ = *buf_ptr++;
471: else if (*buf_ptr == '>') {
472: /* check for operator -> */
473: *e_token++ = *buf_ptr++;
474: if (!pointer_as_binop) {
475: unary_delim = false;
476: code = unary_op;
477: ps.want_blank = false;
478: }
479: }
480: break; /* buffer overflow will be checked at end of
481: * switch */
482:
483: case '=':
484: if (ps.in_or_st)
485: ps.block_init = 1;
486: #ifdef undef
487: if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
488: e_token[-1] = *buf_ptr++;
489: if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
490: *e_token++ = *buf_ptr++;
491: *e_token++ = '='; /* Flip =+ to += */
492: *e_token = 0;
493: }
494: #else
495: if (*buf_ptr == '=') {/* == */
496: *e_token++ = '='; /* Flip =+ to += */
497: buf_ptr++;
498: *e_token = 0;
499: }
500: #endif
501: code = binary_op;
502: unary_delim = true;
503: break;
504: /* can drop thru!!! */
505:
506: case '>':
507: case '<':
508: case '!': /* ops like <, <<, <=, !=, etc */
509: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
510: *e_token++ = *buf_ptr;
511: if (++buf_ptr >= buf_end)
512: fill_buffer();
513: }
514: if (*buf_ptr == '=')
515: *e_token++ = *buf_ptr++;
516: code = (ps.last_u_d ? unary_op : binary_op);
517: unary_delim = true;
518: break;
519:
520: default:
521: if (token[0] == '/' && *buf_ptr == '*') {
522: /* it is start of comment */
523: *e_token++ = '*';
524:
525: if (++buf_ptr >= buf_end)
526: fill_buffer();
527:
528: code = comment;
529: unary_delim = ps.last_u_d;
530: break;
531: }
532: while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
533: /*
534: * handle ||, &&, etc, and also things as in int *****i
535: */
536: *e_token++ = *buf_ptr;
537: if (++buf_ptr >= buf_end)
538: fill_buffer();
539: }
540: code = (ps.last_u_d ? unary_op : binary_op);
541: unary_delim = true;
542:
543:
544: } /* end of switch */
545: if (code != newline) {
546: l_struct = false;
547: last_code = code;
548: }
549: if (buf_ptr >= buf_end) /* check for input buffer empty */
550: fill_buffer();
551: ps.last_u_d = unary_delim;
552: *e_token = '\0'; /* null terminate the token */
553: return (code);
554: }
555:
556: /*
557: * Add the given keyword to the keyword table, using val as the keyword type
558: */
1.3 mickey 559: void
1.11 deraadt 560: addkey(char *key, int val)
1.1 deraadt 561: {
1.7 pjanzen 562: struct templ *p;
1.12 millert 563: int i;
1.4 deraadt 564:
1.12 millert 565: for (i = 0; i < nspecials; i++) {
1.4 deraadt 566: p = &specials[i];
1.1 deraadt 567: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
568: return;
1.4 deraadt 569: }
570:
571: if (specials == specialsinit) {
572: /*
573: * Whoa. Must reallocate special table.
574: */
575: nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
1.12 millert 576: maxspecials = nspecials + (nspecials >> 2);
1.13 deraadt 577: specials = (struct templ *)calloc(maxspecials, sizeof specials[0]);
1.4 deraadt 578: if (specials == NULL)
1.8 pjanzen 579: err(1, NULL);
1.12 millert 580: memcpy(specials, specialsinit, sizeof specialsinit);
1.4 deraadt 581: } else if (nspecials >= maxspecials) {
1.12 millert 582: int newspecials = maxspecials + (maxspecials >> 2);
1.10 tedu 583: struct templ *specials2;
584:
585: specials2 = realloc(specials, newspecials * sizeof specials[0]);
586: if (specials2 == NULL)
1.8 pjanzen 587: err(1, NULL);
1.10 tedu 588: specials = specials2;
589: maxspecials = newspecials;
1.4 deraadt 590: }
1.7 pjanzen 591:
1.12 millert 592: p = &specials[nspecials];
1.1 deraadt 593: p->rwd = key;
594: p->rwcode = val;
1.4 deraadt 595: nspecials++;
1.1 deraadt 596: return;
597: }