Annotation of src/usr.bin/awk/lex.c, Revision 1.10
1.10 ! pyr 1: /* $OpenBSD: lex.c,v 1.9 2006/04/16 02:10:18 hugh Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
31: #include "ytab.h"
32:
33: extern YYSTYPE yylval;
34: extern int infunc;
35:
/* Lexer bookkeeping. */
int lineno = 1;		/* current source line; input() and unput() adjust it on '\n' */
int bracecnt = 0, brackcnt = 0, parencnt = 0;	/* running balance of {}, [] and () seen by yylex() */
40:
/* One reserved word together with the two codes the lexer reports for it. */
typedef struct Keyword {
	const char	*word;	/* spelling, compared with strcmp() by binsearch() */
	int		sub;	/* stored in yylval.i by word() */
	int		type;	/* token type word() returns to the parser */
} Keyword;
46:
47: Keyword keywords[] ={ /* keep sorted: binary searched */
48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
1.10 ! pyr 51: { "and", FAND, BLTIN },
1.1 kstailey 52: { "atan2", FATAN, BLTIN },
53: { "break", BREAK, BREAK },
54: { "close", CLOSE, CLOSE },
1.10 ! pyr 55: { "compl", FCOMPL, BLTIN },
1.1 kstailey 56: { "continue", CONTINUE, CONTINUE },
57: { "cos", FCOS, BLTIN },
58: { "delete", DELETE, DELETE },
59: { "do", DO, DO },
60: { "else", ELSE, ELSE },
61: { "exit", EXIT, EXIT },
62: { "exp", FEXP, BLTIN },
63: { "fflush", FFLUSH, BLTIN },
64: { "for", FOR, FOR },
65: { "func", FUNC, FUNC },
66: { "function", FUNC, FUNC },
67: { "getline", GETLINE, GETLINE },
68: { "gsub", GSUB, GSUB },
69: { "if", IF, IF },
70: { "in", IN, IN },
71: { "index", INDEX, INDEX },
72: { "int", FINT, BLTIN },
73: { "length", FLENGTH, BLTIN },
74: { "log", FLOG, BLTIN },
1.10 ! pyr 75: { "lshift", FLSHIFT, BLTIN },
1.1 kstailey 76: { "match", MATCHFCN, MATCHFCN },
77: { "next", NEXT, NEXT },
78: { "nextfile", NEXTFILE, NEXTFILE },
1.10 ! pyr 79: { "or", FFOR, BLTIN },
1.1 kstailey 80: { "print", PRINT, PRINT },
81: { "printf", PRINTF, PRINTF },
82: { "rand", FRAND, BLTIN },
83: { "return", RETURN, RETURN },
1.10 ! pyr 84: { "rshift", FRSHIFT, BLTIN },
1.1 kstailey 85: { "sin", FSIN, BLTIN },
86: { "split", SPLIT, SPLIT },
87: { "sprintf", SPRINTF, SPRINTF },
88: { "sqrt", FSQRT, BLTIN },
89: { "srand", FSRAND, BLTIN },
90: { "sub", SUB, SUB },
91: { "substr", SUBSTR, SUBSTR },
92: { "system", FSYSTEM, BLTIN },
93: { "tolower", FTOLOWER, BLTIN },
94: { "toupper", FTOUPPER, BLTIN },
95: { "while", WHILE, WHILE },
1.10 ! pyr 96: { "xor", FXOR, BLTIN },
1.1 kstailey 97: };
98:
/*
 * RET(x): return token x from the lexer.
 *
 * DEBUG is defined unconditionally right here, so the tracing form is the
 * one that gets compiled: when dbg is nonzero the token's name is printed
 * before returning.
 *
 * Both forms expand to exactly one statement once the caller's ';' is
 * added, so "if (cond) RET(a); else RET(b);" parses as intended.
 */
#define DEBUG
#ifdef DEBUG
#define RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define RET(x)	return (x)
#endif
1.7 deraadt 105:
106: int peek(void);
107: int gettok(char **, int *);
108: int binsearch(char *, Keyword *, int);
1.1 kstailey 109:
/*
 * peek - report the next input character without consuming it.
 *
 * Reads one character with input() and immediately hands it back with
 * unput(), so the following input() call sees the same character.
 */
int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
116:
/*
 * gettok - read the next raw token from the input into *pbuf.
 *
 * pbuf, psz: address of the caller's growable buffer and of its size.
 *	Both are written back on the name and number paths, because
 *	adjbuf() may have moved or enlarged the buffer.
 *
 * Returns
 *	0	at end of input;
 *	'a'	when the buffer holds a name (letter or '_' followed by
 *		letters, digits and '_');
 *	'0'	when the buffer holds the longest prefix strtod() accepts
 *		as a number;
 *	else	the single character read, which is also left in the
 *		buffer as a one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Ask for room for this character plus the terminator
			 * on every pass, the way string() and regexpr() do.
			 * Growing only once bp has reached the end left no
			 * byte for the final NUL when the input ended exactly
			 * as the buffer filled up.
			 */
			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
				FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same headroom rule as the name loop above */
			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
				FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * buf[0] is returned as the token, so only what
			 * follows it goes back on the input.  Pushing back
			 * the whole buffer would deliver buf[0] a second
			 * time on the next call.
			 */
			unputstr(buf + 1);
			buf[1] = 0;	/* so return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* so truncate where failure started */
			retc = '0';	/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
179:
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot requests examined at the top of yylex(); each is cleared when honoured. */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
185:
1.3 millert 186: int yylex(void)
1.1 kstailey 187: {
1.3 millert 188: int c;
1.1 kstailey 189: static char *buf = 0;
190: static int bufsize = 500;
191:
192: if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
1.4 millert 193: FATAL( "out of space in yylex" );
1.1 kstailey 194: if (sc) {
195: sc = 0;
196: RET('}');
197: }
198: if (reg) {
199: reg = 0;
200: return regexpr();
201: }
1.8 millert 202: /* printf("top\n"); */
1.1 kstailey 203: for (;;) {
204: c = gettok(&buf, &bufsize);
1.8 millert 205: /* printf("gettok [%s]\n", buf); */
1.1 kstailey 206: if (c == 0)
207: return 0;
208: if (isalpha(c) || c == '_')
209: return word(buf);
1.6 millert 210: if (isdigit(c)) {
1.1 kstailey 211: yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
212: /* should this also have STR set? */
213: RET(NUMBER);
214: }
215:
216: yylval.i = c;
217: switch (c) {
218: case '\n': /* {EOL} */
219: RET(NL);
220: case '\r': /* assume \n is coming */
221: case ' ': /* {WS}+ */
222: case '\t':
223: break;
224: case '#': /* #.* strip comments */
225: while ((c = input()) != '\n' && c != 0)
226: ;
227: unput(c);
228: break;
229: case ';':
230: RET(';');
231: case '\\':
232: if (peek() == '\n') {
1.3 millert 233: input();
1.1 kstailey 234: } else if (peek() == '\r') {
235: input(); input(); /* \n */
236: lineno++;
237: } else {
238: RET(c);
239: }
240: break;
241: case '&':
242: if (peek() == '&') {
243: input(); RET(AND);
244: } else
245: RET('&');
246: case '|':
247: if (peek() == '|') {
248: input(); RET(BOR);
249: } else
250: RET('|');
251: case '!':
252: if (peek() == '=') {
253: input(); yylval.i = NE; RET(NE);
254: } else if (peek() == '~') {
255: input(); yylval.i = NOTMATCH; RET(MATCHOP);
256: } else
257: RET(NOT);
258: case '~':
259: yylval.i = MATCH;
260: RET(MATCHOP);
261: case '<':
262: if (peek() == '=') {
263: input(); yylval.i = LE; RET(LE);
264: } else {
265: yylval.i = LT; RET(LT);
266: }
267: case '=':
268: if (peek() == '=') {
269: input(); yylval.i = EQ; RET(EQ);
270: } else {
271: yylval.i = ASSIGN; RET(ASGNOP);
272: }
273: case '>':
274: if (peek() == '=') {
275: input(); yylval.i = GE; RET(GE);
276: } else if (peek() == '>') {
277: input(); yylval.i = APPEND; RET(APPEND);
278: } else {
279: yylval.i = GT; RET(GT);
280: }
281: case '+':
282: if (peek() == '+') {
283: input(); yylval.i = INCR; RET(INCR);
284: } else if (peek() == '=') {
285: input(); yylval.i = ADDEQ; RET(ASGNOP);
286: } else
287: RET('+');
288: case '-':
289: if (peek() == '-') {
290: input(); yylval.i = DECR; RET(DECR);
291: } else if (peek() == '=') {
292: input(); yylval.i = SUBEQ; RET(ASGNOP);
293: } else
294: RET('-');
295: case '*':
296: if (peek() == '=') { /* *= */
297: input(); yylval.i = MULTEQ; RET(ASGNOP);
298: } else if (peek() == '*') { /* ** or **= */
299: input(); /* eat 2nd * */
300: if (peek() == '=') {
301: input(); yylval.i = POWEQ; RET(ASGNOP);
302: } else {
303: RET(POWER);
304: }
305: } else
306: RET('*');
307: case '/':
1.3 millert 308: RET('/');
1.1 kstailey 309: case '%':
310: if (peek() == '=') {
311: input(); yylval.i = MODEQ; RET(ASGNOP);
312: } else
313: RET('%');
314: case '^':
315: if (peek() == '=') {
316: input(); yylval.i = POWEQ; RET(ASGNOP);
317: } else
318: RET(POWER);
1.5 millert 319:
1.1 kstailey 320: case '$':
321: /* BUG: awkward, if not wrong */
322: c = gettok(&buf, &bufsize);
1.5 millert 323: if (isalpha(c)) {
1.1 kstailey 324: if (strcmp(buf, "NF") == 0) { /* very special */
325: unputstr("(NF)");
1.5 millert 326: RET(INDIRECT);
327: }
328: c = peek();
329: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
330: unputstr(buf);
1.1 kstailey 331: RET(INDIRECT);
332: }
333: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
334: RET(IVAR);
1.6 millert 335: } else if (c == 0) { /* */
336: SYNTAX( "unexpected end of input after $" );
337: RET(';');
1.1 kstailey 338: } else {
339: unputstr(buf);
340: RET(INDIRECT);
341: }
342:
343: case '}':
344: if (--bracecnt < 0)
1.4 millert 345: SYNTAX( "extra }" );
1.1 kstailey 346: sc = 1;
347: RET(';');
348: case ']':
349: if (--brackcnt < 0)
1.4 millert 350: SYNTAX( "extra ]" );
1.1 kstailey 351: RET(']');
352: case ')':
353: if (--parencnt < 0)
1.4 millert 354: SYNTAX( "extra )" );
1.1 kstailey 355: RET(')');
356: case '{':
357: bracecnt++;
358: RET('{');
359: case '[':
360: brackcnt++;
361: RET('[');
362: case '(':
363: parencnt++;
364: RET('(');
365:
366: case '"':
367: return string(); /* BUG: should be like tran.c ? */
368:
369: default:
370: RET(c);
371: }
372: }
373: }
374:
1.3 millert 375: int string(void)
1.1 kstailey 376: {
377: int c, n;
378: char *s, *bp;
379: static char *buf = 0;
380: static int bufsz = 500;
381:
382: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 383: FATAL("out of space for strings");
1.1 kstailey 384: for (bp = buf; (c = input()) != '"'; ) {
385: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
1.4 millert 386: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 387: switch (c) {
388: case '\n':
389: case '\r':
390: case 0:
1.4 millert 391: SYNTAX( "non-terminated string %.10s...", buf );
1.1 kstailey 392: lineno++;
1.6 millert 393: if (c == 0) /* hopeless */
394: FATAL( "giving up" );
1.1 kstailey 395: break;
396: case '\\':
397: c = input();
398: switch (c) {
399: case '"': *bp++ = '"'; break;
400: case 'n': *bp++ = '\n'; break;
401: case 't': *bp++ = '\t'; break;
402: case 'f': *bp++ = '\f'; break;
403: case 'r': *bp++ = '\r'; break;
404: case 'b': *bp++ = '\b'; break;
405: case 'v': *bp++ = '\v'; break;
1.3 millert 406: case 'a': *bp++ = '\007'; break;
1.1 kstailey 407: case '\\': *bp++ = '\\'; break;
408:
409: case '0': case '1': case '2': /* octal: \d \dd \ddd */
410: case '3': case '4': case '5': case '6': case '7':
411: n = c - '0';
412: if ((c = peek()) >= '0' && c < '8') {
413: n = 8 * n + input() - '0';
414: if ((c = peek()) >= '0' && c < '8')
415: n = 8 * n + input() - '0';
416: }
417: *bp++ = n;
418: break;
419:
420: case 'x': /* hex \x0-9a-fA-F + */
421: { char xbuf[100], *px;
422: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
423: if (isdigit(c)
424: || (c >= 'a' && c <= 'f')
425: || (c >= 'A' && c <= 'F'))
426: *px++ = c;
427: else
428: break;
429: }
430: *px = 0;
431: unput(c);
432: sscanf(xbuf, "%x", &n);
433: *bp++ = n;
434: break;
435: }
436:
437: default:
438: *bp++ = c;
439: break;
440: }
441: break;
442: default:
443: *bp++ = c;
444: break;
445: }
446: }
447: *bp = 0;
448: s = tostring(buf);
449: *bp++ = ' '; *bp++ = 0;
450: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
451: RET(STRING);
452: }
453:
454:
455: int binsearch(char *w, Keyword *kp, int n)
456: {
457: int cond, low, mid, high;
458:
459: low = 0;
460: high = n - 1;
461: while (low <= high) {
462: mid = (low + high) / 2;
463: if ((cond = strcmp(w, kp[mid].word)) < 0)
464: high = mid - 1;
465: else if (cond > 0)
466: low = mid + 1;
467: else
468: return mid;
469: }
470: return -1;
471: }
472:
473: int word(char *w)
474: {
475: Keyword *kp;
476: int c, n;
477:
478: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
479: kp = keywords + n;
480: if (n != -1) { /* found in table */
481: yylval.i = kp->sub;
482: switch (kp->type) { /* special handling */
483: case FSYSTEM:
484: if (safe)
1.4 millert 485: SYNTAX( "system is unsafe" );
1.1 kstailey 486: RET(kp->type);
487: case FUNC:
488: if (infunc)
1.4 millert 489: SYNTAX( "illegal nested function" );
1.1 kstailey 490: RET(kp->type);
491: case RETURN:
492: if (!infunc)
1.4 millert 493: SYNTAX( "return not in function" );
1.1 kstailey 494: RET(kp->type);
495: case VARNF:
496: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
497: RET(VARNF);
498: default:
499: RET(kp->type);
500: }
501: }
502: c = peek(); /* look for '(' */
503: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
504: yylval.i = n;
505: RET(ARG);
506: } else {
507: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
508: if (c == '(') {
509: RET(CALL);
510: } else {
511: RET(VAR);
512: }
513: }
514: }
515:
1.6 millert 516: void startreg(void) /* next call to yylex will return a regular expression */
1.1 kstailey 517: {
518: reg = 1;
519: }
520:
1.3 millert 521: int regexpr(void)
1.1 kstailey 522: {
1.9 hugh 523: int c, openclass = 0;
1.1 kstailey 524: static char *buf = 0;
525: static int bufsz = 500;
526: char *bp;
527:
528: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 529: FATAL("out of space for rex expr");
1.1 kstailey 530: bp = buf;
1.9 hugh 531: for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
1.1 kstailey 532: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
1.4 millert 533: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 534: if (c == '\n') {
1.4 millert 535: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 536: unput('\n');
537: break;
538: } else if (c == '\\') {
539: *bp++ = '\\';
540: *bp++ = input();
541: } else {
1.9 hugh 542: if (c == '[')
543: openclass = 1;
544: else if (c == ']')
545: openclass = 0;
1.1 kstailey 546: *bp++ = c;
547: }
548: }
549: *bp = 0;
1.8 millert 550: if (c == 0)
551: SYNTAX("non-terminated regular expression %.10s...", buf);
1.1 kstailey 552: yylval.s = tostring(buf);
553: unput('/');
554: RET(REGEXPR);
555: }
556:
/* low-level lexical stuff, sort of inherited from lex */

char	ebuf[300];		/* circular record of characters handed out by input() */
char	*ep = ebuf;		/* next slot of ebuf to fill; input() wraps it, unput() steps it back */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = 0;
564:
565: int input(void) /* get next lexical input character */
566: {
567: int c;
568: extern char *lexprog;
569:
570: if (yysptr > yysbuf)
1.8 millert 571: c = (uschar)*--yysptr;
1.1 kstailey 572: else if (lexprog != NULL) { /* awk '...' */
1.8 millert 573: if ((c = (uschar)*lexprog) != 0)
1.1 kstailey 574: lexprog++;
575: } else /* awk -f ... */
576: c = pgetc();
577: if (c == '\n')
578: lineno++;
579: else if (c == EOF)
580: c = 0;
581: if (ep >= ebuf + sizeof ebuf)
582: ep = ebuf;
583: return *ep++ = c;
584: }
585:
586: void unput(int c) /* put lexical character back on input */
587: {
588: if (c == '\n')
589: lineno--;
590: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 591: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 592: *yysptr++ = c;
593: if (--ep < ebuf)
594: ep = ebuf + sizeof(ebuf) - 1;
595: }
596:
/*
 * unputstr - put the string s back on the input.
 *
 * Characters are pushed last-to-first, so subsequent input() calls read
 * s in its original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	int k = strlen(s);

	while (--k >= 0)
		unput(s[k]);
}
603: }