Annotation of src/usr.bin/awk/lex.c, Revision 1.12
1.12 ! millert 1: /* $OpenBSD: lex.c,v 1.11 2008/10/06 20:38:33 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
31: #include "ytab.h"
32:
33: extern YYSTYPE yylval;
34: extern int infunc;
35:
36: int lineno = 1;
37: int bracecnt = 0;
38: int brackcnt = 0;
39: int parencnt = 0;
40:
/* One entry of the reserved-word table searched by binsearch(). */
typedef struct Keyword {
	const char	*word;	/* spelling of the keyword; the key binsearch() compares */
	int		sub;	/* value word() stores in yylval.i */
	int		type;	/* token code word() returns to the parser */
} Keyword;
46:
47: Keyword keywords[] ={ /* keep sorted: binary searched */
48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
1.10 pyr 51: { "and", FAND, BLTIN },
1.1 kstailey 52: { "atan2", FATAN, BLTIN },
53: { "break", BREAK, BREAK },
54: { "close", CLOSE, CLOSE },
1.10 pyr 55: { "compl", FCOMPL, BLTIN },
1.1 kstailey 56: { "continue", CONTINUE, CONTINUE },
57: { "cos", FCOS, BLTIN },
58: { "delete", DELETE, DELETE },
59: { "do", DO, DO },
60: { "else", ELSE, ELSE },
61: { "exit", EXIT, EXIT },
62: { "exp", FEXP, BLTIN },
63: { "fflush", FFLUSH, BLTIN },
64: { "for", FOR, FOR },
65: { "func", FUNC, FUNC },
66: { "function", FUNC, FUNC },
67: { "getline", GETLINE, GETLINE },
68: { "gsub", GSUB, GSUB },
69: { "if", IF, IF },
70: { "in", IN, IN },
71: { "index", INDEX, INDEX },
72: { "int", FINT, BLTIN },
73: { "length", FLENGTH, BLTIN },
74: { "log", FLOG, BLTIN },
1.10 pyr 75: { "lshift", FLSHIFT, BLTIN },
1.1 kstailey 76: { "match", MATCHFCN, MATCHFCN },
77: { "next", NEXT, NEXT },
78: { "nextfile", NEXTFILE, NEXTFILE },
1.10 pyr 79: { "or", FFOR, BLTIN },
1.1 kstailey 80: { "print", PRINT, PRINT },
81: { "printf", PRINTF, PRINTF },
82: { "rand", FRAND, BLTIN },
83: { "return", RETURN, RETURN },
1.10 pyr 84: { "rshift", FRSHIFT, BLTIN },
1.1 kstailey 85: { "sin", FSIN, BLTIN },
86: { "split", SPLIT, SPLIT },
87: { "sprintf", SPRINTF, SPRINTF },
88: { "sqrt", FSQRT, BLTIN },
89: { "srand", FSRAND, BLTIN },
90: { "sub", SUB, SUB },
91: { "substr", SUBSTR, SUBSTR },
92: { "system", FSYSTEM, BLTIN },
93: { "tolower", FTOLOWER, BLTIN },
94: { "toupper", FTOUPPER, BLTIN },
95: { "while", WHILE, WHILE },
1.10 pyr 96: { "xor", FXOR, BLTIN },
1.1 kstailey 97: };
98:
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name via tokname() when dbg is set.  Note that x is evaluated twice when
 * dbg is set.  The do/while(0) wrapper makes `RET(x);` one statement, so it
 * is safe as the body of an unbraced if/else.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
1.7 deraadt 100:
101: int peek(void);
102: int gettok(char **, int *);
103: int binsearch(char *, Keyword *, int);
1.1 kstailey 104:
/*
 * peek: return the next input character without consuming it.
 * The character is read with input() and immediately pushed back with
 * unput(), so the following input() call yields it again.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
111:
/*
 * gettok: read one raw token from input() into the caller's buffer.
 *
 * pbuf/psz point at the buffer and its size.  When a name or number is
 * read both are written back on return, because adjbuf() may have moved
 * and enlarged the buffer.  The buffer must hold at least two bytes on
 * entry (buf[0] and buf[1] are written unconditionally).
 *
 * Returns:
 *   0    at end of input;
 *   'a'  for a name ([A-Za-z_][A-Za-z0-9_]*), left NUL-terminated in buf;
 *   '0'  for a number, left NUL-terminated in buf;
 *   c    for any other single character c (also stored in buf).
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating NUL.
			 * Checking only bp-buf >= sz let the final "*bp = 0"
			 * land one byte past the buffer when input ended
			 * exactly at the boundary.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: one character plus the NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the rest back BEFORE truncating: rem+1 is buf+1,
			 * so truncating first would push back an empty string
			 * and silently drop the remaining characters.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;		/* return one character as token */
			retc = buf[0];		/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
174:
/* Forward declarations for the token readers used by yylex(). */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined (and cleared) at the top of yylex(). */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
180:
1.3 millert 181: int yylex(void)
1.1 kstailey 182: {
1.3 millert 183: int c;
1.1 kstailey 184: static char *buf = 0;
1.11 millert 185: static int bufsize = 5; /* BUG: setting this small causes core dump! */
1.1 kstailey 186:
187: if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
1.4 millert 188: FATAL( "out of space in yylex" );
1.1 kstailey 189: if (sc) {
190: sc = 0;
191: RET('}');
192: }
193: if (reg) {
194: reg = 0;
195: return regexpr();
196: }
197: for (;;) {
198: c = gettok(&buf, &bufsize);
199: if (c == 0)
200: return 0;
201: if (isalpha(c) || c == '_')
202: return word(buf);
1.6 millert 203: if (isdigit(c)) {
1.1 kstailey 204: yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
205: /* should this also have STR set? */
206: RET(NUMBER);
207: }
208:
209: yylval.i = c;
210: switch (c) {
211: case '\n': /* {EOL} */
212: RET(NL);
213: case '\r': /* assume \n is coming */
214: case ' ': /* {WS}+ */
215: case '\t':
216: break;
217: case '#': /* #.* strip comments */
218: while ((c = input()) != '\n' && c != 0)
219: ;
220: unput(c);
221: break;
222: case ';':
223: RET(';');
224: case '\\':
225: if (peek() == '\n') {
1.3 millert 226: input();
1.1 kstailey 227: } else if (peek() == '\r') {
228: input(); input(); /* \n */
229: lineno++;
230: } else {
231: RET(c);
232: }
233: break;
234: case '&':
235: if (peek() == '&') {
236: input(); RET(AND);
237: } else
238: RET('&');
239: case '|':
240: if (peek() == '|') {
241: input(); RET(BOR);
242: } else
243: RET('|');
244: case '!':
245: if (peek() == '=') {
246: input(); yylval.i = NE; RET(NE);
247: } else if (peek() == '~') {
248: input(); yylval.i = NOTMATCH; RET(MATCHOP);
249: } else
250: RET(NOT);
251: case '~':
252: yylval.i = MATCH;
253: RET(MATCHOP);
254: case '<':
255: if (peek() == '=') {
256: input(); yylval.i = LE; RET(LE);
257: } else {
258: yylval.i = LT; RET(LT);
259: }
260: case '=':
261: if (peek() == '=') {
262: input(); yylval.i = EQ; RET(EQ);
263: } else {
264: yylval.i = ASSIGN; RET(ASGNOP);
265: }
266: case '>':
267: if (peek() == '=') {
268: input(); yylval.i = GE; RET(GE);
269: } else if (peek() == '>') {
270: input(); yylval.i = APPEND; RET(APPEND);
271: } else {
272: yylval.i = GT; RET(GT);
273: }
274: case '+':
275: if (peek() == '+') {
276: input(); yylval.i = INCR; RET(INCR);
277: } else if (peek() == '=') {
278: input(); yylval.i = ADDEQ; RET(ASGNOP);
279: } else
280: RET('+');
281: case '-':
282: if (peek() == '-') {
283: input(); yylval.i = DECR; RET(DECR);
284: } else if (peek() == '=') {
285: input(); yylval.i = SUBEQ; RET(ASGNOP);
286: } else
287: RET('-');
288: case '*':
289: if (peek() == '=') { /* *= */
290: input(); yylval.i = MULTEQ; RET(ASGNOP);
291: } else if (peek() == '*') { /* ** or **= */
292: input(); /* eat 2nd * */
293: if (peek() == '=') {
294: input(); yylval.i = POWEQ; RET(ASGNOP);
295: } else {
296: RET(POWER);
297: }
298: } else
299: RET('*');
300: case '/':
1.3 millert 301: RET('/');
1.1 kstailey 302: case '%':
303: if (peek() == '=') {
304: input(); yylval.i = MODEQ; RET(ASGNOP);
305: } else
306: RET('%');
307: case '^':
308: if (peek() == '=') {
309: input(); yylval.i = POWEQ; RET(ASGNOP);
310: } else
311: RET(POWER);
1.5 millert 312:
1.1 kstailey 313: case '$':
314: /* BUG: awkward, if not wrong */
315: c = gettok(&buf, &bufsize);
1.5 millert 316: if (isalpha(c)) {
1.1 kstailey 317: if (strcmp(buf, "NF") == 0) { /* very special */
318: unputstr("(NF)");
1.5 millert 319: RET(INDIRECT);
320: }
321: c = peek();
322: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
323: unputstr(buf);
1.1 kstailey 324: RET(INDIRECT);
325: }
326: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
327: RET(IVAR);
1.6 millert 328: } else if (c == 0) { /* */
329: SYNTAX( "unexpected end of input after $" );
330: RET(';');
1.1 kstailey 331: } else {
332: unputstr(buf);
333: RET(INDIRECT);
334: }
335:
336: case '}':
337: if (--bracecnt < 0)
1.4 millert 338: SYNTAX( "extra }" );
1.1 kstailey 339: sc = 1;
340: RET(';');
341: case ']':
342: if (--brackcnt < 0)
1.4 millert 343: SYNTAX( "extra ]" );
1.1 kstailey 344: RET(']');
345: case ')':
346: if (--parencnt < 0)
1.4 millert 347: SYNTAX( "extra )" );
1.1 kstailey 348: RET(')');
349: case '{':
350: bracecnt++;
351: RET('{');
352: case '[':
353: brackcnt++;
354: RET('[');
355: case '(':
356: parencnt++;
357: RET('(');
358:
359: case '"':
360: return string(); /* BUG: should be like tran.c ? */
361:
362: default:
363: RET(c);
364: }
365: }
366: }
367:
1.3 millert 368: int string(void)
1.1 kstailey 369: {
370: int c, n;
371: char *s, *bp;
372: static char *buf = 0;
373: static int bufsz = 500;
374:
375: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 376: FATAL("out of space for strings");
1.1 kstailey 377: for (bp = buf; (c = input()) != '"'; ) {
1.11 millert 378: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
1.4 millert 379: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 380: switch (c) {
381: case '\n':
382: case '\r':
383: case 0:
1.4 millert 384: SYNTAX( "non-terminated string %.10s...", buf );
1.1 kstailey 385: lineno++;
1.6 millert 386: if (c == 0) /* hopeless */
387: FATAL( "giving up" );
1.1 kstailey 388: break;
389: case '\\':
390: c = input();
391: switch (c) {
392: case '"': *bp++ = '"'; break;
393: case 'n': *bp++ = '\n'; break;
394: case 't': *bp++ = '\t'; break;
395: case 'f': *bp++ = '\f'; break;
396: case 'r': *bp++ = '\r'; break;
397: case 'b': *bp++ = '\b'; break;
398: case 'v': *bp++ = '\v'; break;
1.3 millert 399: case 'a': *bp++ = '\007'; break;
1.1 kstailey 400: case '\\': *bp++ = '\\'; break;
401:
402: case '0': case '1': case '2': /* octal: \d \dd \ddd */
403: case '3': case '4': case '5': case '6': case '7':
404: n = c - '0';
405: if ((c = peek()) >= '0' && c < '8') {
406: n = 8 * n + input() - '0';
407: if ((c = peek()) >= '0' && c < '8')
408: n = 8 * n + input() - '0';
409: }
410: *bp++ = n;
411: break;
412:
413: case 'x': /* hex \x0-9a-fA-F + */
414: { char xbuf[100], *px;
415: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
416: if (isdigit(c)
417: || (c >= 'a' && c <= 'f')
418: || (c >= 'A' && c <= 'F'))
419: *px++ = c;
420: else
421: break;
422: }
423: *px = 0;
424: unput(c);
1.12 ! millert 425: sscanf(xbuf, "%x", (unsigned int *) &n);
1.1 kstailey 426: *bp++ = n;
427: break;
428: }
429:
430: default:
431: *bp++ = c;
432: break;
433: }
434: break;
435: default:
436: *bp++ = c;
437: break;
438: }
439: }
440: *bp = 0;
441: s = tostring(buf);
442: *bp++ = ' '; *bp++ = 0;
443: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
444: RET(STRING);
445: }
446:
447:
448: int binsearch(char *w, Keyword *kp, int n)
449: {
450: int cond, low, mid, high;
451:
452: low = 0;
453: high = n - 1;
454: while (low <= high) {
455: mid = (low + high) / 2;
456: if ((cond = strcmp(w, kp[mid].word)) < 0)
457: high = mid - 1;
458: else if (cond > 0)
459: low = mid + 1;
460: else
461: return mid;
462: }
463: return -1;
464: }
465:
466: int word(char *w)
467: {
468: Keyword *kp;
469: int c, n;
470:
471: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
1.11 millert 472: /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
1.1 kstailey 473: kp = keywords + n;
474: if (n != -1) { /* found in table */
475: yylval.i = kp->sub;
476: switch (kp->type) { /* special handling */
1.11 millert 477: case BLTIN:
478: if (kp->sub == FSYSTEM && safe)
1.4 millert 479: SYNTAX( "system is unsafe" );
1.1 kstailey 480: RET(kp->type);
481: case FUNC:
482: if (infunc)
1.4 millert 483: SYNTAX( "illegal nested function" );
1.1 kstailey 484: RET(kp->type);
485: case RETURN:
486: if (!infunc)
1.4 millert 487: SYNTAX( "return not in function" );
1.1 kstailey 488: RET(kp->type);
489: case VARNF:
490: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
491: RET(VARNF);
492: default:
493: RET(kp->type);
494: }
495: }
496: c = peek(); /* look for '(' */
497: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
498: yylval.i = n;
499: RET(ARG);
500: } else {
501: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
502: if (c == '(') {
503: RET(CALL);
504: } else {
505: RET(VAR);
506: }
507: }
508: }
509:
1.6 millert 510: void startreg(void) /* next call to yylex will return a regular expression */
1.1 kstailey 511: {
512: reg = 1;
513: }
514:
1.3 millert 515: int regexpr(void)
1.1 kstailey 516: {
1.9 hugh 517: int c, openclass = 0;
1.1 kstailey 518: static char *buf = 0;
519: static int bufsz = 500;
520: char *bp;
521:
522: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 523: FATAL("out of space for rex expr");
1.1 kstailey 524: bp = buf;
1.9 hugh 525: for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
1.11 millert 526: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
1.4 millert 527: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 528: if (c == '\n') {
1.4 millert 529: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 530: unput('\n');
531: break;
532: } else if (c == '\\') {
533: *bp++ = '\\';
534: *bp++ = input();
535: } else {
1.9 hugh 536: if (c == '[')
537: openclass = 1;
538: else if (c == ']')
539: openclass = 0;
1.1 kstailey 540: *bp++ = c;
541: }
542: }
543: *bp = 0;
1.8 millert 544: if (c == 0)
545: SYNTAX("non-terminated regular expression %.10s...", buf);
1.1 kstailey 546: yylval.s = tostring(buf);
547: unput('/');
548: RET(REGEXPR);
549: }
550:
/* low-level lexical stuff, sort of inherited from lex */

char	ebuf[300];		/* record of recently read characters, filled circularly by input() */
char	*ep = ebuf;		/* next slot of ebuf to be written */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;		/* not referenced by the code in this part of the file */
558:
559: int input(void) /* get next lexical input character */
560: {
561: int c;
562: extern char *lexprog;
563:
564: if (yysptr > yysbuf)
1.8 millert 565: c = (uschar)*--yysptr;
1.1 kstailey 566: else if (lexprog != NULL) { /* awk '...' */
1.8 millert 567: if ((c = (uschar)*lexprog) != 0)
1.1 kstailey 568: lexprog++;
569: } else /* awk -f ... */
570: c = pgetc();
571: if (c == '\n')
572: lineno++;
573: else if (c == EOF)
574: c = 0;
575: if (ep >= ebuf + sizeof ebuf)
576: ep = ebuf;
577: return *ep++ = c;
578: }
579:
580: void unput(int c) /* put lexical character back on input */
581: {
582: if (c == '\n')
583: lineno--;
584: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 585: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 586: *yysptr++ = c;
587: if (--ep < ebuf)
588: ep = ebuf + sizeof(ebuf) - 1;
589: }
590:
/*
 * unputstr: push the whole string s back onto the input.
 * The characters are handed to unput() last-to-first, so that input()
 * will deliver them again in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
597: }