Annotation of src/usr.bin/indent/lexi.c, Revision 1.7
1.7 ! pjanzen 1: /* $OpenBSD: lexi.c,v 1.6 1998/05/22 05:15:12 deraadt Exp $ */
1.2 deraadt 2:
1.1 deraadt 3: /*
1.7 ! pjanzen 4: * Copyright (c) 1980, 1993
! 5: * The Regents of the University of California.
! 6: * Copyright (c) 1976 Board of Trustees of the University of Illinois.
1.1 deraadt 7: * Copyright (c) 1985 Sun Microsystems, Inc.
8: * All rights reserved.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: * 3. All advertising materials mentioning features or use of this software
19: * must display the following acknowledgement:
20: * This product includes software developed by the University of
21: * California, Berkeley and its contributors.
22: * 4. Neither the name of the University nor the names of its contributors
23: * may be used to endorse or promote products derived from this software
24: * without specific prior written permission.
25: *
26: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36: * SUCH DAMAGE.
37: */
38:
39: #ifndef lint
1.7 ! pjanzen 40: /*static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";*/
! 41: static char rcsid[] = "$OpenBSD: lexi.c,v 1.6 1998/05/22 05:15:12 deraadt Exp $";
1.1 deraadt 42: #endif /* not lint */
43:
44: /*
45: * Here we have the token scanner for indent. It scans off one token and puts
46: * it in the global variable "token". It returns a code, indicating the type
47: * of token scanned.
48: */
49:
50: #include <stdio.h>
51: #include <ctype.h>
52: #include <stdlib.h>
53: #include <string.h>
1.7 ! pjanzen 54: #include <err.h>
1.1 deraadt 55: #include "indent_globs.h"
56: #include "indent_codes.h"
57:
/* Character classes stored in chartype[]. */
#define alphanum 1      /* character may be part of an identifier or number */
#define opchar 3        /* character is an operator character */

/*
 * One entry of the reserved-word table.
 *
 * rwd    - spelling of the reserved word
 * rwcode - keyword class, interpreted by the switch in lexi():
 *            1  switch                  (lexi returns swstmt)
 *            2  case, default           (casestmt)
 *            3  struct, union, enum     (the following identifier is
 *                                        treated as a declaration)
 *            4  declaration keywords    (decl)
 *            5  if, while, for          (sp_paren)
 *            6  else, do                (sp_nparen)
 *            7  sizeof                  (ident, with ps.sizeof_keyword set)
 *            any other value            (ident)
 */
struct templ {
    char       *rwd;
    int         rwcode;
};

/* Built-in reserved words; addkey() copies this table before extending it. */
struct templ specialsinit[] = {
    { "switch", 1 },
    { "case", 2 },
    { "break", 0 },
    { "struct", 3 },
    { "union", 3 },
    { "enum", 3 },
    { "default", 2 },
    { "int", 4 },
    { "char", 4 },
    { "float", 4 },
    { "double", 4 },
    { "long", 4 },
    { "short", 4 },
    { "typedef", 4 },   /* was misspelt "typdef", so typedef never matched */
    { "unsigned", 4 },
    { "register", 4 },
    { "static", 4 },
    { "global", 4 },
    { "extern", 4 },
    { "void", 4 },
    { "goto", 0 },
    { "return", 0 },
    { "if", 5 },
    { "while", 5 },
    { "for", 5 },
    { "else", 6 },
    { "do", 6 },
    { "sizeof", 7 },
};

/* Table currently in use; points at specialsinit until addkey() replaces it. */
struct templ *specials = specialsinit;
/* Number of valid entries in specials[]. */
int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* Allocated capacity of specials[]; only set once addkey() has run. */
int maxspecials;
100:
/*
 * Classification of each 7-bit ASCII character, indexed by character code:
 *   0 - neither alphanumeric nor operator
 *   1 - alphanumeric: letters, digits, underscore and dollar sign
 *   3 - operator character
 */
char chartype[128] = {
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation up to the slash */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon, semicolon, less, equals, greater, question */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, A through O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P through Z, brackets and backslash, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, a through o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p through z, braces, vertical bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
122:
123:
124:
125:
126: int
127: lexi()
128: {
129: int unary_delim; /* this is set to 1 if the current token
130: * forces a following operator to be unary */
131: static int last_code; /* the last token type returned */
132: static int l_struct; /* set to 1 if the last token was 'struct' */
133: int code; /* internal code to be returned */
134: char qchar; /* the delimiter character for a string */
1.4 deraadt 135: int i;
1.1 deraadt 136:
137: e_token = s_token; /* point to start of place to save token */
138: unary_delim = false;
139: ps.col_1 = ps.last_nl; /* tell world that this token started in
140: * column 1 iff the last thing scanned was nl */
141: ps.last_nl = false;
142:
143: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
144: ps.col_1 = false; /* leading blanks imply token is not in column
145: * 1 */
146: if (++buf_ptr >= buf_end)
147: fill_buffer();
148: }
149:
150: /* Scan an alphanumeric token */
1.7 ! pjanzen 151: if (chartype[(int)*buf_ptr] == alphanum ||
1.3 mickey 152: (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 153: /*
154: * we have a character or number
155: */
1.7 ! pjanzen 156: char *j; /* used for searching thru list of
! 157: * reserved words */
1.3 mickey 158: if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
1.1 deraadt 159: int seendot = 0,
1.6 deraadt 160: seenexp = 0,
161: seensfx = 0;
1.1 deraadt 162: if (*buf_ptr == '0' &&
163: (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
164: *e_token++ = *buf_ptr++;
165: *e_token++ = *buf_ptr++;
166: while (isxdigit(*buf_ptr)) {
167: CHECK_SIZE_TOKEN;
168: *e_token++ = *buf_ptr++;
169: }
170: }
171: else
172: while (1) {
1.7 ! pjanzen 173: if (*buf_ptr == '.') {
1.1 deraadt 174: if (seendot)
175: break;
176: else
177: seendot++;
1.7 ! pjanzen 178: }
1.1 deraadt 179: CHECK_SIZE_TOKEN;
180: *e_token++ = *buf_ptr++;
1.7 ! pjanzen 181: if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
1.1 deraadt 182: if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
183: break;
184: else {
185: seenexp++;
186: seendot++;
187: CHECK_SIZE_TOKEN;
188: *e_token++ = *buf_ptr++;
189: if (*buf_ptr == '+' || *buf_ptr == '-')
190: *e_token++ = *buf_ptr++;
191: }
1.7 ! pjanzen 192: }
1.1 deraadt 193: }
1.6 deraadt 194: while (1) {
195: if (!(seensfx & 1) &&
196: (*buf_ptr == 'U' || *buf_ptr == 'u')) {
197: CHECK_SIZE_TOKEN;
198: *e_token++ = *buf_ptr++;
199: seensfx |= 1;
200: continue;
201: }
202: if (!(seensfx & 2) &&
203: (*buf_ptr == 'L' || *buf_ptr == 'l')) {
204: CHECK_SIZE_TOKEN;
205: if (buf_ptr[1] == buf_ptr[0])
206: *e_token++ = *buf_ptr++;
207: *e_token++ = *buf_ptr++;
208: seensfx |= 2;
209: continue;
210: }
211: break;
212: }
1.1 deraadt 213: }
214: else
1.7 ! pjanzen 215: while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */
1.1 deraadt 216: CHECK_SIZE_TOKEN;
217: *e_token++ = *buf_ptr++;
218: if (buf_ptr >= buf_end)
219: fill_buffer();
220: }
221: *e_token++ = '\0';
222: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
223: if (++buf_ptr >= buf_end)
224: fill_buffer();
225: }
226: ps.its_a_keyword = false;
227: ps.sizeof_keyword = false;
228: if (l_struct) { /* if last token was 'struct', then this token
229: * should be treated as a declaration */
230: l_struct = false;
231: last_code = ident;
232: ps.last_u_d = true;
233: return (decl);
234: }
235: ps.last_u_d = false; /* Operator after indentifier is binary */
236: last_code = ident; /* Remember that this is the code we will
237: * return */
238:
239: /*
240: * This loop will check if the token is a keyword.
241: */
1.4 deraadt 242: for (i = 0; i < nspecials; i++) {
1.7 ! pjanzen 243: char *p = s_token; /* point at scanned token */
1.4 deraadt 244: j = specials[i].rwd;
1.1 deraadt 245: if (*j++ != *p++ || *j++ != *p++)
246: continue; /* This test depends on the fact that
247: * identifiers are always at least 1 character
248: * long (ie. the first two bytes of the
249: * identifier are always meaningful) */
250: if (p[-1] == 0)
251: break; /* If its a one-character identifier */
252: while (*p++ == *j)
253: if (*j++ == 0)
254: goto found_keyword; /* I wish that C had a multi-level
255: * break... */
256: }
1.4 deraadt 257: if (i < nspecials) { /* we have a keyword */
1.1 deraadt 258: found_keyword:
259: ps.its_a_keyword = true;
260: ps.last_u_d = true;
1.4 deraadt 261: switch (specials[i].rwcode) {
1.1 deraadt 262: case 1: /* it is a switch */
263: return (swstmt);
264: case 2: /* a case or default */
265: return (casestmt);
266:
267: case 3: /* a "struct" */
268: if (ps.p_l_follow)
269: break; /* inside parens: cast */
270: l_struct = true;
271:
272: /*
273: * Next time around, we will want to know that we have had a
274: * 'struct'
275: */
276: case 4: /* one of the declaration keywords */
277: if (ps.p_l_follow) {
278: ps.cast_mask |= 1 << ps.p_l_follow;
279: break; /* inside parens: cast */
280: }
281: last_code = decl;
282: return (decl);
283:
284: case 5: /* if, while, for */
285: return (sp_paren);
286:
287: case 6: /* do, else */
288: return (sp_nparen);
289:
290: case 7:
291: ps.sizeof_keyword = true;
292: default: /* all others are treated like any other
293: * identifier */
294: return (ident);
295: } /* end of switch */
296: } /* end of if (found_it) */
297: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
1.7 ! pjanzen 298: char *tp = buf_ptr;
1.1 deraadt 299: while (tp < buf_end)
300: if (*tp++ == ')' && (*tp == ';' || *tp == ','))
301: goto not_proc;
1.7 ! pjanzen 302: strlcpy(ps.procname, token, sizeof ps.procname);
1.1 deraadt 303: ps.in_parameter_declaration = 1;
304: rparen_count = 1;
305: not_proc:;
306: }
307: /*
308: * The following hack attempts to guess whether or not the current
309: * token is in fact a declaration keyword -- one that has been
310: * typedefd
311: */
312: if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
313: && !ps.p_l_follow
314: && !ps.block_init
315: && (ps.last_token == rparen || ps.last_token == semicolon ||
316: ps.last_token == decl ||
317: ps.last_token == lbrace || ps.last_token == rbrace)) {
318: ps.its_a_keyword = true;
319: ps.last_u_d = true;
320: last_code = decl;
321: return decl;
322: }
323: if (last_code == decl) /* if this is a declared variable, then
324: * following sign is unary */
325: ps.last_u_d = true; /* will make "int a -1" work */
326: last_code = ident;
327: return (ident); /* the ident is not in the list */
328: } /* end of procesing for alpanum character */
329:
330: /* Scan a non-alphanumeric token */
331:
332: *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
333: * moved here */
334: *e_token = '\0';
335: if (++buf_ptr >= buf_end)
336: fill_buffer();
337:
338: switch (*token) {
339: case '\n':
340: unary_delim = ps.last_u_d;
341: ps.last_nl = true; /* remember that we just had a newline */
342: code = (had_eof ? 0 : newline);
343:
344: /*
345: * if data has been exausted, the newline is a dummy, and we should
346: * return code to stop
347: */
348: break;
349:
350: case '\'': /* start of quoted character */
351: case '"': /* start of string */
352: qchar = *token;
353: if (troff) {
354: e_token[-1] = '`';
355: if (qchar == '"')
356: *e_token++ = '`';
357: e_token = chfont(&bodyf, &stringf, e_token);
358: }
359: do { /* copy the string */
360: while (1) { /* move one character or [/<char>]<char> */
361: if (*buf_ptr == '\n') {
362: printf("%d: Unterminated literal\n", line_no);
363: goto stop_lit;
364: }
365: CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
366: * since CHECK_SIZE guarantees that there
367: * are at least 5 entries left */
368: *e_token = *buf_ptr++;
369: if (buf_ptr >= buf_end)
370: fill_buffer();
371: if (*e_token == BACKSLASH) { /* if escape, copy extra char */
372: if (*buf_ptr == '\n') /* check for escaped newline */
373: ++line_no;
374: if (troff) {
375: *++e_token = BACKSLASH;
376: if (*buf_ptr == BACKSLASH)
377: *++e_token = BACKSLASH;
378: }
379: *++e_token = *buf_ptr++;
380: ++e_token; /* we must increment this again because we
381: * copied two chars */
382: if (buf_ptr >= buf_end)
383: fill_buffer();
384: }
385: else
386: break; /* we copied one character */
387: } /* end of while (1) */
388: } while (*e_token++ != qchar);
389: if (troff) {
390: e_token = chfont(&stringf, &bodyf, e_token - 1);
391: if (qchar == '"')
392: *e_token++ = '\'';
393: }
394: stop_lit:
395: code = ident;
396: break;
397:
398: case ('('):
399: case ('['):
400: unary_delim = true;
401: code = lparen;
402: break;
403:
404: case (')'):
405: case (']'):
406: code = rparen;
407: break;
408:
409: case '#':
410: unary_delim = ps.last_u_d;
411: code = preesc;
412: break;
413:
414: case '?':
415: unary_delim = true;
416: code = question;
417: break;
418:
419: case (':'):
420: code = colon;
421: unary_delim = true;
422: break;
423:
424: case (';'):
425: unary_delim = true;
426: code = semicolon;
427: break;
428:
429: case ('{'):
430: unary_delim = true;
431:
432: /*
433: * if (ps.in_or_st) ps.block_init = 1;
434: */
435: /* ? code = ps.block_init ? lparen : lbrace; */
436: code = lbrace;
437: break;
438:
439: case ('}'):
440: unary_delim = true;
441: /* ? code = ps.block_init ? rparen : rbrace; */
442: code = rbrace;
443: break;
444:
445: case 014: /* a form feed */
446: unary_delim = ps.last_u_d;
447: ps.last_nl = true; /* remember this so we can set 'ps.col_1'
448: * right */
449: code = form_feed;
450: break;
451:
452: case (','):
453: unary_delim = true;
454: code = comma;
455: break;
456:
457: case '.':
458: unary_delim = false;
459: code = period;
460: break;
461:
462: case '-':
463: case '+': /* check for -, +, --, ++ */
464: code = (ps.last_u_d ? unary_op : binary_op);
465: unary_delim = true;
466:
467: if (*buf_ptr == token[0]) {
468: /* check for doubled character */
469: *e_token++ = *buf_ptr++;
470: /* buffer overflow will be checked at end of loop */
471: if (last_code == ident || last_code == rparen) {
472: code = (ps.last_u_d ? unary_op : postop);
473: /* check for following ++ or -- */
474: unary_delim = false;
475: }
476: }
477: else if (*buf_ptr == '=')
478: /* check for operator += */
479: *e_token++ = *buf_ptr++;
480: else if (*buf_ptr == '>') {
481: /* check for operator -> */
482: *e_token++ = *buf_ptr++;
483: if (!pointer_as_binop) {
484: unary_delim = false;
485: code = unary_op;
486: ps.want_blank = false;
487: }
488: }
489: break; /* buffer overflow will be checked at end of
490: * switch */
491:
492: case '=':
493: if (ps.in_or_st)
494: ps.block_init = 1;
495: #ifdef undef
496: if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
497: e_token[-1] = *buf_ptr++;
498: if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
499: *e_token++ = *buf_ptr++;
500: *e_token++ = '='; /* Flip =+ to += */
501: *e_token = 0;
502: }
503: #else
504: if (*buf_ptr == '=') {/* == */
505: *e_token++ = '='; /* Flip =+ to += */
506: buf_ptr++;
507: *e_token = 0;
508: }
509: #endif
510: code = binary_op;
511: unary_delim = true;
512: break;
513: /* can drop thru!!! */
514:
515: case '>':
516: case '<':
517: case '!': /* ops like <, <<, <=, !=, etc */
518: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
519: *e_token++ = *buf_ptr;
520: if (++buf_ptr >= buf_end)
521: fill_buffer();
522: }
523: if (*buf_ptr == '=')
524: *e_token++ = *buf_ptr++;
525: code = (ps.last_u_d ? unary_op : binary_op);
526: unary_delim = true;
527: break;
528:
529: default:
530: if (token[0] == '/' && *buf_ptr == '*') {
531: /* it is start of comment */
532: *e_token++ = '*';
533:
534: if (++buf_ptr >= buf_end)
535: fill_buffer();
536:
537: code = comment;
538: unary_delim = ps.last_u_d;
539: break;
540: }
541: while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
542: /*
543: * handle ||, &&, etc, and also things as in int *****i
544: */
545: *e_token++ = *buf_ptr;
546: if (++buf_ptr >= buf_end)
547: fill_buffer();
548: }
549: code = (ps.last_u_d ? unary_op : binary_op);
550: unary_delim = true;
551:
552:
553: } /* end of switch */
554: if (code != newline) {
555: l_struct = false;
556: last_code = code;
557: }
558: if (buf_ptr >= buf_end) /* check for input buffer empty */
559: fill_buffer();
560: ps.last_u_d = unary_delim;
561: *e_token = '\0'; /* null terminate the token */
562: return (code);
563: }
564:
565: /*
566: * Add the given keyword to the keyword table, using val as the keyword type
567: */
1.3 mickey 568: void
1.1 deraadt 569: addkey(key, val)
570: char *key;
1.4 deraadt 571: int val;
1.1 deraadt 572: {
1.7 ! pjanzen 573: struct templ *p;
1.4 deraadt 574: int i = 0;
575:
576: while (i < nspecials) {
577: p = &specials[i];
1.1 deraadt 578: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
579: return;
580: else
1.4 deraadt 581: i++;
582: }
583:
584: if (specials == specialsinit) {
585: /*
586: * Whoa. Must reallocate special table.
587: */
588: nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
589: maxspecials = nspecials;
590: maxspecials += maxspecials >> 2;
591: specials = (struct templ *)malloc(maxspecials * sizeof specials[0]);
592: if (specials == NULL)
1.7 ! pjanzen 593: errx(1, "out of memory");
1.4 deraadt 594: memmove(specials, specialsinit, sizeof specialsinit);
595: } else if (nspecials >= maxspecials) {
596: maxspecials += maxspecials >> 2;
597: specials = realloc(specials, maxspecials * sizeof specials[0]);
598: if (specials == NULL)
1.7 ! pjanzen 599: errx(1, "out of memory");
1.4 deraadt 600: }
1.7 ! pjanzen 601:
1.4 deraadt 602: p = &specials[i];
1.1 deraadt 603: p->rwd = key;
604: p->rwcode = val;
1.4 deraadt 605: nspecials++;
1.1 deraadt 606: return;
607: }