Annotation of src/usr.bin/awk/lex.c, Revision 1.26
1.26 ! millert 1: /* $OpenBSD: lex.c,v 1.25 2020/07/30 17:45:44 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
1.25 millert 31: #include "awkgram.tab.h"
1.1 kstailey 32:
33: extern YYSTYPE yylval;
1.19 millert 34: extern bool infunc;
1.1 kstailey 35:
36: int lineno = 1;
37: int bracecnt = 0;
38: int brackcnt = 0;
39: int parencnt = 0;
40:
/* One row of the reserved-word table that word() looks names up in. */
typedef struct Keyword {
	const char	*word;	/* spelling compared with strcmp() */
	int		sub;	/* stored in yylval.i when the word matches */
	int		type;	/* token type handed back to the parser */
} Keyword;
46:
1.18 millert 47: const Keyword keywords[] = { /* keep sorted: binary searched */
1.1 kstailey 48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
1.10 pyr 51: { "and", FAND, BLTIN },
1.1 kstailey 52: { "atan2", FATAN, BLTIN },
53: { "break", BREAK, BREAK },
54: { "close", CLOSE, CLOSE },
1.10 pyr 55: { "compl", FCOMPL, BLTIN },
1.1 kstailey 56: { "continue", CONTINUE, CONTINUE },
57: { "cos", FCOS, BLTIN },
58: { "delete", DELETE, DELETE },
59: { "do", DO, DO },
60: { "else", ELSE, ELSE },
61: { "exit", EXIT, EXIT },
62: { "exp", FEXP, BLTIN },
63: { "fflush", FFLUSH, BLTIN },
64: { "for", FOR, FOR },
65: { "func", FUNC, FUNC },
66: { "function", FUNC, FUNC },
1.21 millert 67: { "gensub", GENSUB, GENSUB },
1.1 kstailey 68: { "getline", GETLINE, GETLINE },
69: { "gsub", GSUB, GSUB },
70: { "if", IF, IF },
71: { "in", IN, IN },
72: { "index", INDEX, INDEX },
73: { "int", FINT, BLTIN },
74: { "length", FLENGTH, BLTIN },
75: { "log", FLOG, BLTIN },
1.10 pyr 76: { "lshift", FLSHIFT, BLTIN },
1.1 kstailey 77: { "match", MATCHFCN, MATCHFCN },
1.26 ! millert 78: { "mktime", FMKTIME, BLTIN },
1.1 kstailey 79: { "next", NEXT, NEXT },
80: { "nextfile", NEXTFILE, NEXTFILE },
1.10 pyr 81: { "or", FFOR, BLTIN },
1.1 kstailey 82: { "print", PRINT, PRINT },
83: { "printf", PRINTF, PRINTF },
84: { "rand", FRAND, BLTIN },
85: { "return", RETURN, RETURN },
1.10 pyr 86: { "rshift", FRSHIFT, BLTIN },
1.1 kstailey 87: { "sin", FSIN, BLTIN },
88: { "split", SPLIT, SPLIT },
89: { "sprintf", SPRINTF, SPRINTF },
90: { "sqrt", FSQRT, BLTIN },
91: { "srand", FSRAND, BLTIN },
1.21 millert 92: { "strftime", FSTRFTIME, BLTIN },
1.1 kstailey 93: { "sub", SUB, SUB },
94: { "substr", SUBSTR, SUBSTR },
95: { "system", FSYSTEM, BLTIN },
1.21 millert 96: { "systime", FSYSTIME, BLTIN },
1.1 kstailey 97: { "tolower", FTOLOWER, BLTIN },
98: { "toupper", FTOUPPER, BLTIN },
99: { "while", WHILE, WHILE },
1.10 pyr 100: { "xor", FXOR, BLTIN },
1.1 kstailey 101: };
102:
/*
 * RET(x): return token x from the enclosing function, printing its name
 * first when dbg is set.  x is expanded twice, so it must not have side
 * effects.  The do { } while (0) wrapper makes "RET(x);" a single
 * statement, so it is safe as the body of an un-braced if/else.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
1.7 deraadt 104:
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately handed back to
 * unput(), so any side effect unput() has (it decrements lineno for a
 * newline) also happens here.
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
111:
1.18 millert 112: static int gettok(char **pbuf, int *psz) /* get next input token */
1.1 kstailey 113: {
1.6 millert 114: int c, retc;
1.1 kstailey 115: char *buf = *pbuf;
116: int sz = *psz;
117: char *bp = buf;
118:
119: c = input();
120: if (c == 0)
121: return 0;
122: buf[0] = c;
123: buf[1] = 0;
124: if (!isalnum(c) && c != '.' && c != '_')
125: return c;
126:
127: *bp++ = c;
128: if (isalpha(c) || c == '_') { /* it's a varname */
129: for ( ; (c = input()) != 0; ) {
130: if (bp-buf >= sz)
1.11 millert 131: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
1.4 millert 132: FATAL( "out of space for name %.10s...", buf );
1.1 kstailey 133: if (isalnum(c) || c == '_')
134: *bp++ = c;
135: else {
136: *bp = 0;
137: unput(c);
138: break;
139: }
140: }
1.4 millert 141: *bp = 0;
1.6 millert 142: retc = 'a'; /* alphanumeric */
1.11 millert 143: } else { /* maybe it's a number, but could be . */
1.1 kstailey 144: char *rem;
145: /* read input until can't be a number */
146: for ( ; (c = input()) != 0; ) {
147: if (bp-buf >= sz)
1.11 millert 148: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
1.4 millert 149: FATAL( "out of space for number %.10s...", buf );
1.14 millert 150: if (isdigit(c) || c == 'e' || c == 'E'
1.1 kstailey 151: || c == '.' || c == '+' || c == '-')
152: *bp++ = c;
153: else {
154: unput(c);
155: break;
156: }
157: }
1.2 millert 158: *bp = 0;
1.1 kstailey 159: strtod(buf, &rem); /* parse the number */
1.6 millert 160: if (rem == buf) { /* it wasn't a valid number at all */
1.11 millert 161: buf[1] = 0; /* return one character as token */
1.23 millert 162: retc = (uschar)buf[0]; /* character is its own type */
1.11 millert 163: unputstr(rem+1); /* put rest back for later */
1.6 millert 164: } else { /* some prefix was a number */
1.11 millert 165: unputstr(rem); /* put rest back for later */
166: rem[0] = 0; /* truncate buf after number part */
167: retc = '0'; /* type is number */
1.6 millert 168: }
1.1 kstailey 169: }
170: *pbuf = buf;
171: *psz = sz;
1.6 millert 172: return retc;
1.1 kstailey 173: }
174:
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags checked at the top of yylex() and cleared when used. */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
1.1 kstailey 180:
1.3 millert 181: int yylex(void)
1.1 kstailey 182: {
1.3 millert 183: int c;
1.14 millert 184: static char *buf = NULL;
1.11 millert 185: static int bufsize = 5; /* BUG: setting this small causes core dump! */
1.1 kstailey 186:
1.15 millert 187: if (buf == NULL && (buf = malloc(bufsize)) == NULL)
1.4 millert 188: FATAL( "out of space in yylex" );
1.1 kstailey 189: if (sc) {
1.16 millert 190: sc = false;
1.1 kstailey 191: RET('}');
192: }
193: if (reg) {
1.16 millert 194: reg = false;
1.1 kstailey 195: return regexpr();
196: }
197: for (;;) {
198: c = gettok(&buf, &bufsize);
199: if (c == 0)
200: return 0;
201: if (isalpha(c) || c == '_')
202: return word(buf);
1.6 millert 203: if (isdigit(c)) {
1.17 millert 204: char *cp = tostring(buf);
205: yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
206: free(cp);
1.1 kstailey 207: /* should this also have STR set? */
208: RET(NUMBER);
209: }
1.14 millert 210:
1.1 kstailey 211: yylval.i = c;
212: switch (c) {
213: case '\n': /* {EOL} */
1.13 millert 214: lineno++;
1.1 kstailey 215: RET(NL);
216: case '\r': /* assume \n is coming */
217: case ' ': /* {WS}+ */
218: case '\t':
219: break;
220: case '#': /* #.* strip comments */
221: while ((c = input()) != '\n' && c != 0)
222: ;
223: unput(c);
1.19 millert 224: /*
225: * Next line is a hack, itcompensates for
226: * unput's treatment of \n.
227: */
228: lineno++;
1.1 kstailey 229: break;
230: case ';':
231: RET(';');
232: case '\\':
233: if (peek() == '\n') {
1.3 millert 234: input();
1.13 millert 235: lineno++;
1.1 kstailey 236: } else if (peek() == '\r') {
237: input(); input(); /* \n */
238: lineno++;
239: } else {
240: RET(c);
241: }
242: break;
243: case '&':
244: if (peek() == '&') {
245: input(); RET(AND);
1.14 millert 246: } else
1.1 kstailey 247: RET('&');
248: case '|':
249: if (peek() == '|') {
250: input(); RET(BOR);
251: } else
252: RET('|');
253: case '!':
254: if (peek() == '=') {
255: input(); yylval.i = NE; RET(NE);
256: } else if (peek() == '~') {
257: input(); yylval.i = NOTMATCH; RET(MATCHOP);
258: } else
259: RET(NOT);
260: case '~':
261: yylval.i = MATCH;
262: RET(MATCHOP);
263: case '<':
264: if (peek() == '=') {
265: input(); yylval.i = LE; RET(LE);
266: } else {
267: yylval.i = LT; RET(LT);
268: }
269: case '=':
270: if (peek() == '=') {
271: input(); yylval.i = EQ; RET(EQ);
272: } else {
273: yylval.i = ASSIGN; RET(ASGNOP);
274: }
275: case '>':
276: if (peek() == '=') {
277: input(); yylval.i = GE; RET(GE);
278: } else if (peek() == '>') {
279: input(); yylval.i = APPEND; RET(APPEND);
280: } else {
281: yylval.i = GT; RET(GT);
282: }
283: case '+':
284: if (peek() == '+') {
285: input(); yylval.i = INCR; RET(INCR);
286: } else if (peek() == '=') {
287: input(); yylval.i = ADDEQ; RET(ASGNOP);
288: } else
289: RET('+');
290: case '-':
291: if (peek() == '-') {
292: input(); yylval.i = DECR; RET(DECR);
293: } else if (peek() == '=') {
294: input(); yylval.i = SUBEQ; RET(ASGNOP);
295: } else
296: RET('-');
297: case '*':
298: if (peek() == '=') { /* *= */
299: input(); yylval.i = MULTEQ; RET(ASGNOP);
300: } else if (peek() == '*') { /* ** or **= */
301: input(); /* eat 2nd * */
302: if (peek() == '=') {
303: input(); yylval.i = POWEQ; RET(ASGNOP);
304: } else {
305: RET(POWER);
306: }
307: } else
308: RET('*');
309: case '/':
1.3 millert 310: RET('/');
1.1 kstailey 311: case '%':
312: if (peek() == '=') {
313: input(); yylval.i = MODEQ; RET(ASGNOP);
314: } else
315: RET('%');
316: case '^':
317: if (peek() == '=') {
318: input(); yylval.i = POWEQ; RET(ASGNOP);
319: } else
320: RET(POWER);
1.5 millert 321:
1.1 kstailey 322: case '$':
323: /* BUG: awkward, if not wrong */
324: c = gettok(&buf, &bufsize);
1.5 millert 325: if (isalpha(c)) {
1.1 kstailey 326: if (strcmp(buf, "NF") == 0) { /* very special */
327: unputstr("(NF)");
1.5 millert 328: RET(INDIRECT);
329: }
330: c = peek();
331: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
332: unputstr(buf);
1.1 kstailey 333: RET(INDIRECT);
334: }
335: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
336: RET(IVAR);
1.6 millert 337: } else if (c == 0) { /* */
338: SYNTAX( "unexpected end of input after $" );
339: RET(';');
1.1 kstailey 340: } else {
341: unputstr(buf);
342: RET(INDIRECT);
343: }
1.14 millert 344:
1.1 kstailey 345: case '}':
346: if (--bracecnt < 0)
1.4 millert 347: SYNTAX( "extra }" );
1.16 millert 348: sc = true;
1.1 kstailey 349: RET(';');
350: case ']':
351: if (--brackcnt < 0)
1.4 millert 352: SYNTAX( "extra ]" );
1.1 kstailey 353: RET(']');
354: case ')':
355: if (--parencnt < 0)
1.4 millert 356: SYNTAX( "extra )" );
1.1 kstailey 357: RET(')');
358: case '{':
359: bracecnt++;
360: RET('{');
361: case '[':
362: brackcnt++;
363: RET('[');
364: case '(':
365: parencnt++;
366: RET('(');
1.14 millert 367:
1.1 kstailey 368: case '"':
369: return string(); /* BUG: should be like tran.c ? */
1.14 millert 370:
1.1 kstailey 371: default:
372: RET(c);
373: }
374: }
375: }
376:
1.3 millert 377: int string(void)
1.1 kstailey 378: {
379: int c, n;
380: char *s, *bp;
1.14 millert 381: static char *buf = NULL;
1.1 kstailey 382: static int bufsz = 500;
383:
1.15 millert 384: if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.4 millert 385: FATAL("out of space for strings");
1.1 kstailey 386: for (bp = buf; (c = input()) != '"'; ) {
1.11 millert 387: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
1.4 millert 388: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 389: switch (c) {
390: case '\n':
391: case '\r':
392: case 0:
1.13 millert 393: *bp = '\0';
1.4 millert 394: SYNTAX( "non-terminated string %.10s...", buf );
1.6 millert 395: if (c == 0) /* hopeless */
396: FATAL( "giving up" );
1.13 millert 397: lineno++;
1.1 kstailey 398: break;
399: case '\\':
400: c = input();
401: switch (c) {
1.19 millert 402: case '\n': break;
1.1 kstailey 403: case '"': *bp++ = '"'; break;
1.14 millert 404: case 'n': *bp++ = '\n'; break;
1.1 kstailey 405: case 't': *bp++ = '\t'; break;
406: case 'f': *bp++ = '\f'; break;
407: case 'r': *bp++ = '\r'; break;
408: case 'b': *bp++ = '\b'; break;
409: case 'v': *bp++ = '\v'; break;
1.18 millert 410: case 'a': *bp++ = '\a'; break;
1.1 kstailey 411: case '\\': *bp++ = '\\'; break;
412:
413: case '0': case '1': case '2': /* octal: \d \dd \ddd */
414: case '3': case '4': case '5': case '6': case '7':
415: n = c - '0';
416: if ((c = peek()) >= '0' && c < '8') {
417: n = 8 * n + input() - '0';
418: if ((c = peek()) >= '0' && c < '8')
419: n = 8 * n + input() - '0';
420: }
421: *bp++ = n;
422: break;
423:
424: case 'x': /* hex \x0-9a-fA-F + */
425: { char xbuf[100], *px;
426: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
427: if (isdigit(c)
428: || (c >= 'a' && c <= 'f')
429: || (c >= 'A' && c <= 'F'))
430: *px++ = c;
431: else
432: break;
433: }
434: *px = 0;
435: unput(c);
1.12 millert 436: sscanf(xbuf, "%x", (unsigned int *) &n);
1.1 kstailey 437: *bp++ = n;
438: break;
439: }
440:
1.14 millert 441: default:
1.1 kstailey 442: *bp++ = c;
443: break;
444: }
445: break;
446: default:
447: *bp++ = c;
448: break;
449: }
450: }
1.14 millert 451: *bp = 0;
1.1 kstailey 452: s = tostring(buf);
1.17 millert 453: *bp++ = ' '; *bp++ = '\0';
1.1 kstailey 454: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
1.17 millert 455: free(s);
1.1 kstailey 456: RET(STRING);
457: }
458:
459:
1.18 millert 460: static int binsearch(char *w, const Keyword *kp, int n)
1.1 kstailey 461: {
462: int cond, low, mid, high;
463:
464: low = 0;
465: high = n - 1;
466: while (low <= high) {
467: mid = (low + high) / 2;
468: if ((cond = strcmp(w, kp[mid].word)) < 0)
469: high = mid - 1;
470: else if (cond > 0)
471: low = mid + 1;
472: else
473: return mid;
474: }
475: return -1;
476: }
477:
1.14 millert 478: int word(char *w)
1.1 kstailey 479: {
1.18 millert 480: const Keyword *kp;
1.1 kstailey 481: int c, n;
482:
483: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
484: if (n != -1) { /* found in table */
1.14 millert 485: kp = keywords + n;
1.1 kstailey 486: yylval.i = kp->sub;
487: switch (kp->type) { /* special handling */
1.11 millert 488: case BLTIN:
489: if (kp->sub == FSYSTEM && safe)
1.4 millert 490: SYNTAX( "system is unsafe" );
1.1 kstailey 491: RET(kp->type);
492: case FUNC:
493: if (infunc)
1.4 millert 494: SYNTAX( "illegal nested function" );
1.1 kstailey 495: RET(kp->type);
496: case RETURN:
497: if (!infunc)
1.4 millert 498: SYNTAX( "return not in function" );
1.1 kstailey 499: RET(kp->type);
500: case VARNF:
501: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
502: RET(VARNF);
503: default:
504: RET(kp->type);
505: }
506: }
507: c = peek(); /* look for '(' */
508: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
509: yylval.i = n;
510: RET(ARG);
511: } else {
512: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
513: if (c == '(') {
514: RET(CALL);
515: } else {
516: RET(VAR);
517: }
518: }
519: }
520:
1.6 millert 521: void startreg(void) /* next call to yylex will return a regular expression */
1.1 kstailey 522: {
1.16 millert 523: reg = true;
1.1 kstailey 524: }
525:
1.3 millert 526: int regexpr(void)
1.1 kstailey 527: {
1.9 hugh 528: int c, openclass = 0;
1.14 millert 529: static char *buf = NULL;
1.1 kstailey 530: static int bufsz = 500;
1.20 millert 531: char *bp, *cstart;
1.1 kstailey 532:
1.15 millert 533: if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.4 millert 534: FATAL("out of space for rex expr");
1.1 kstailey 535: bp = buf;
1.20 millert 536: for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
1.11 millert 537: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
1.4 millert 538: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 539: if (c == '\n') {
1.13 millert 540: *bp = '\0';
1.14 millert 541: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 542: unput('\n');
543: break;
544: } else if (c == '\\') {
1.14 millert 545: *bp++ = '\\';
1.1 kstailey 546: *bp++ = input();
547: } else {
1.20 millert 548: /*
549: * POSIX requires a slash in a regexp to be escaped,
550: * other awks don't require it to be escaped inside
551: * a character class.
552: */
553: if (!do_posix) {
554: if (c == '[') {
1.24 millert 555: int nextc = peek();
556: if (openclass == 0 || nextc == ':' ||
557: nextc == '.' || nextc == '=') {
1.22 millert 558: if (++openclass == 1)
559: cstart = bp;
560: }
1.20 millert 561: } else if (c == ']' && openclass > 0) {
562: /*
563: * A ']' as the first char in a
564: * class is treated literally.
565: */
566: if (cstart != bp - 1 &&
567: (cstart != bp - 2 || bp[-1] != '^'))
568: openclass--;
569: }
570: }
1.1 kstailey 571: *bp++ = c;
572: }
573: }
574: *bp = 0;
1.8 millert 575: if (c == 0)
576: SYNTAX("non-terminated regular expression %.10s...", buf);
1.1 kstailey 577: yylval.s = tostring(buf);
578: unput('/');
579: RET(REGEXPR);
580: }
581:
/* low-level lexical stuff, sort of inherited from lex */

char	ebuf[300];		/* ring of the most recently read characters */
char	*ep = ebuf;		/* slot in ebuf that input() writes next */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* first free slot in yysbuf; equal to yysbuf when empty */
FILE	*yyin = NULL;		/* not referenced elsewhere in this file */
1.1 kstailey 589:
590: int input(void) /* get next lexical input character */
591: {
592: int c;
593: extern char *lexprog;
594:
595: if (yysptr > yysbuf)
1.8 millert 596: c = (uschar)*--yysptr;
1.1 kstailey 597: else if (lexprog != NULL) { /* awk '...' */
1.8 millert 598: if ((c = (uschar)*lexprog) != 0)
1.1 kstailey 599: lexprog++;
600: } else /* awk -f ... */
601: c = pgetc();
1.13 millert 602: if (c == EOF)
1.1 kstailey 603: c = 0;
604: if (ep >= ebuf + sizeof ebuf)
605: ep = ebuf;
1.13 millert 606: *ep = c;
607: if (c != 0) {
608: ep++;
609: }
610: return (c);
1.1 kstailey 611: }
612:
613: void unput(int c) /* put lexical character back on input */
614: {
1.18 millert 615: if (c == '\n')
616: lineno--;
1.1 kstailey 617: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 618: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 619: *yysptr++ = c;
620: if (--ep < ebuf)
621: ep = ebuf + sizeof(ebuf) - 1;
622: }
623:
/*
 * unputstr: push the string s back on the input so that it is read
 * again in its original order (characters are pushed last-to-first).
 * An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	size_t n;

	/*
	 * Count down in size_t: "int i = strlen(s)-1" relied on unsigned
	 * wraparound being narrowed to -1 for the empty string.
	 */
	for (n = strlen(s); n > 0; n--)
		unput(s[n - 1]);
}