Annotation of src/usr.bin/awk/lex.c, Revision 1.23
1.23 ! millert 1: /* $OpenBSD: lex.c,v 1.22 2020/06/23 16:54:40 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
31: #include "ytab.h"
32:
33: extern YYSTYPE yylval;
1.19 millert 34: extern bool infunc;
1.1 kstailey 35:
36: int lineno = 1;
37: int bracecnt = 0;
38: int brackcnt = 0;
39: int parencnt = 0;
40:
/*
 * One row of the reserved-word table searched by word().
 */
typedef struct Keyword Keyword;

struct Keyword {
	const char	*word;	/* spelling; the key compared by binsearch() */
	int		 sub;	/* stored in yylval.i when the word is matched */
	int		 type;	/* token code returned for the word */
};
46:
1.18 millert 47: const Keyword keywords[] = { /* keep sorted: binary searched */
1.1 kstailey 48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
1.10 pyr 51: { "and", FAND, BLTIN },
1.1 kstailey 52: { "atan2", FATAN, BLTIN },
53: { "break", BREAK, BREAK },
54: { "close", CLOSE, CLOSE },
1.10 pyr 55: { "compl", FCOMPL, BLTIN },
1.1 kstailey 56: { "continue", CONTINUE, CONTINUE },
57: { "cos", FCOS, BLTIN },
58: { "delete", DELETE, DELETE },
59: { "do", DO, DO },
60: { "else", ELSE, ELSE },
61: { "exit", EXIT, EXIT },
62: { "exp", FEXP, BLTIN },
63: { "fflush", FFLUSH, BLTIN },
64: { "for", FOR, FOR },
65: { "func", FUNC, FUNC },
66: { "function", FUNC, FUNC },
1.21 millert 67: { "gensub", GENSUB, GENSUB },
1.1 kstailey 68: { "getline", GETLINE, GETLINE },
69: { "gsub", GSUB, GSUB },
70: { "if", IF, IF },
71: { "in", IN, IN },
72: { "index", INDEX, INDEX },
73: { "int", FINT, BLTIN },
74: { "length", FLENGTH, BLTIN },
75: { "log", FLOG, BLTIN },
1.10 pyr 76: { "lshift", FLSHIFT, BLTIN },
1.1 kstailey 77: { "match", MATCHFCN, MATCHFCN },
78: { "next", NEXT, NEXT },
79: { "nextfile", NEXTFILE, NEXTFILE },
1.10 pyr 80: { "or", FFOR, BLTIN },
1.1 kstailey 81: { "print", PRINT, PRINT },
82: { "printf", PRINTF, PRINTF },
83: { "rand", FRAND, BLTIN },
84: { "return", RETURN, RETURN },
1.10 pyr 85: { "rshift", FRSHIFT, BLTIN },
1.1 kstailey 86: { "sin", FSIN, BLTIN },
87: { "split", SPLIT, SPLIT },
88: { "sprintf", SPRINTF, SPRINTF },
89: { "sqrt", FSQRT, BLTIN },
90: { "srand", FSRAND, BLTIN },
1.21 millert 91: { "strftime", FSTRFTIME, BLTIN },
1.1 kstailey 92: { "sub", SUB, SUB },
93: { "substr", SUBSTR, SUBSTR },
94: { "system", FSYSTEM, BLTIN },
1.21 millert 95: { "systime", FSYSTIME, BLTIN },
1.1 kstailey 96: { "tolower", FTOLOWER, BLTIN },
97: { "toupper", FTOUPPER, BLTIN },
98: { "while", WHILE, WHILE },
1.10 pyr 99: { "xor", FXOR, BLTIN },
1.1 kstailey 100: };
101:
/*
 * RET(x): return token x from the enclosing function.  When dbg is
 * non-zero the token's name (from tokname()) is printed first.
 * x is expanded twice, so it must not have side effects.
 */
#define	RET(x)	do { \
	if (dbg) \
		printf("lex %s\n", tokname(x)); \
	return (x); \
} while (0)
1.7 deraadt 103:
/*
 * Return the next input character without consuming it: the character
 * is fetched with input() and immediately handed back with unput().
 * The result is whatever input() produced (0 at end of input).
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
110:
1.18 millert 111: static int gettok(char **pbuf, int *psz) /* get next input token */
1.1 kstailey 112: {
1.6 millert 113: int c, retc;
1.1 kstailey 114: char *buf = *pbuf;
115: int sz = *psz;
116: char *bp = buf;
117:
118: c = input();
119: if (c == 0)
120: return 0;
121: buf[0] = c;
122: buf[1] = 0;
123: if (!isalnum(c) && c != '.' && c != '_')
124: return c;
125:
126: *bp++ = c;
127: if (isalpha(c) || c == '_') { /* it's a varname */
128: for ( ; (c = input()) != 0; ) {
129: if (bp-buf >= sz)
1.11 millert 130: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
1.4 millert 131: FATAL( "out of space for name %.10s...", buf );
1.1 kstailey 132: if (isalnum(c) || c == '_')
133: *bp++ = c;
134: else {
135: *bp = 0;
136: unput(c);
137: break;
138: }
139: }
1.4 millert 140: *bp = 0;
1.6 millert 141: retc = 'a'; /* alphanumeric */
1.11 millert 142: } else { /* maybe it's a number, but could be . */
1.1 kstailey 143: char *rem;
144: /* read input until can't be a number */
145: for ( ; (c = input()) != 0; ) {
146: if (bp-buf >= sz)
1.11 millert 147: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
1.4 millert 148: FATAL( "out of space for number %.10s...", buf );
1.14 millert 149: if (isdigit(c) || c == 'e' || c == 'E'
1.1 kstailey 150: || c == '.' || c == '+' || c == '-')
151: *bp++ = c;
152: else {
153: unput(c);
154: break;
155: }
156: }
1.2 millert 157: *bp = 0;
1.1 kstailey 158: strtod(buf, &rem); /* parse the number */
1.6 millert 159: if (rem == buf) { /* it wasn't a valid number at all */
1.11 millert 160: buf[1] = 0; /* return one character as token */
1.23 ! millert 161: retc = (uschar)buf[0]; /* character is its own type */
1.11 millert 162: unputstr(rem+1); /* put rest back for later */
1.6 millert 163: } else { /* some prefix was a number */
1.11 millert 164: unputstr(rem); /* put rest back for later */
165: rem[0] = 0; /* truncate buf after number part */
166: retc = '0'; /* type is number */
1.6 millert 167: }
1.1 kstailey 168: }
169: *pbuf = buf;
170: *psz = sz;
1.6 millert 171: return retc;
1.1 kstailey 172: }
173:
/* Token helpers defined further down and called from yylex(). */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags checked (and cleared) at the top of yylex(). */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
1.1 kstailey 179:
1.3 millert 180: int yylex(void)
1.1 kstailey 181: {
1.3 millert 182: int c;
1.14 millert 183: static char *buf = NULL;
1.11 millert 184: static int bufsize = 5; /* BUG: setting this small causes core dump! */
1.1 kstailey 185:
1.15 millert 186: if (buf == NULL && (buf = malloc(bufsize)) == NULL)
1.4 millert 187: FATAL( "out of space in yylex" );
1.1 kstailey 188: if (sc) {
1.16 millert 189: sc = false;
1.1 kstailey 190: RET('}');
191: }
192: if (reg) {
1.16 millert 193: reg = false;
1.1 kstailey 194: return regexpr();
195: }
196: for (;;) {
197: c = gettok(&buf, &bufsize);
198: if (c == 0)
199: return 0;
200: if (isalpha(c) || c == '_')
201: return word(buf);
1.6 millert 202: if (isdigit(c)) {
1.17 millert 203: char *cp = tostring(buf);
204: yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
205: free(cp);
1.1 kstailey 206: /* should this also have STR set? */
207: RET(NUMBER);
208: }
1.14 millert 209:
1.1 kstailey 210: yylval.i = c;
211: switch (c) {
212: case '\n': /* {EOL} */
1.13 millert 213: lineno++;
1.1 kstailey 214: RET(NL);
215: case '\r': /* assume \n is coming */
216: case ' ': /* {WS}+ */
217: case '\t':
218: break;
219: case '#': /* #.* strip comments */
220: while ((c = input()) != '\n' && c != 0)
221: ;
222: unput(c);
1.19 millert 223: /*
224: * Next line is a hack, itcompensates for
225: * unput's treatment of \n.
226: */
227: lineno++;
1.1 kstailey 228: break;
229: case ';':
230: RET(';');
231: case '\\':
232: if (peek() == '\n') {
1.3 millert 233: input();
1.13 millert 234: lineno++;
1.1 kstailey 235: } else if (peek() == '\r') {
236: input(); input(); /* \n */
237: lineno++;
238: } else {
239: RET(c);
240: }
241: break;
242: case '&':
243: if (peek() == '&') {
244: input(); RET(AND);
1.14 millert 245: } else
1.1 kstailey 246: RET('&');
247: case '|':
248: if (peek() == '|') {
249: input(); RET(BOR);
250: } else
251: RET('|');
252: case '!':
253: if (peek() == '=') {
254: input(); yylval.i = NE; RET(NE);
255: } else if (peek() == '~') {
256: input(); yylval.i = NOTMATCH; RET(MATCHOP);
257: } else
258: RET(NOT);
259: case '~':
260: yylval.i = MATCH;
261: RET(MATCHOP);
262: case '<':
263: if (peek() == '=') {
264: input(); yylval.i = LE; RET(LE);
265: } else {
266: yylval.i = LT; RET(LT);
267: }
268: case '=':
269: if (peek() == '=') {
270: input(); yylval.i = EQ; RET(EQ);
271: } else {
272: yylval.i = ASSIGN; RET(ASGNOP);
273: }
274: case '>':
275: if (peek() == '=') {
276: input(); yylval.i = GE; RET(GE);
277: } else if (peek() == '>') {
278: input(); yylval.i = APPEND; RET(APPEND);
279: } else {
280: yylval.i = GT; RET(GT);
281: }
282: case '+':
283: if (peek() == '+') {
284: input(); yylval.i = INCR; RET(INCR);
285: } else if (peek() == '=') {
286: input(); yylval.i = ADDEQ; RET(ASGNOP);
287: } else
288: RET('+');
289: case '-':
290: if (peek() == '-') {
291: input(); yylval.i = DECR; RET(DECR);
292: } else if (peek() == '=') {
293: input(); yylval.i = SUBEQ; RET(ASGNOP);
294: } else
295: RET('-');
296: case '*':
297: if (peek() == '=') { /* *= */
298: input(); yylval.i = MULTEQ; RET(ASGNOP);
299: } else if (peek() == '*') { /* ** or **= */
300: input(); /* eat 2nd * */
301: if (peek() == '=') {
302: input(); yylval.i = POWEQ; RET(ASGNOP);
303: } else {
304: RET(POWER);
305: }
306: } else
307: RET('*');
308: case '/':
1.3 millert 309: RET('/');
1.1 kstailey 310: case '%':
311: if (peek() == '=') {
312: input(); yylval.i = MODEQ; RET(ASGNOP);
313: } else
314: RET('%');
315: case '^':
316: if (peek() == '=') {
317: input(); yylval.i = POWEQ; RET(ASGNOP);
318: } else
319: RET(POWER);
1.5 millert 320:
1.1 kstailey 321: case '$':
322: /* BUG: awkward, if not wrong */
323: c = gettok(&buf, &bufsize);
1.5 millert 324: if (isalpha(c)) {
1.1 kstailey 325: if (strcmp(buf, "NF") == 0) { /* very special */
326: unputstr("(NF)");
1.5 millert 327: RET(INDIRECT);
328: }
329: c = peek();
330: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
331: unputstr(buf);
1.1 kstailey 332: RET(INDIRECT);
333: }
334: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
335: RET(IVAR);
1.6 millert 336: } else if (c == 0) { /* */
337: SYNTAX( "unexpected end of input after $" );
338: RET(';');
1.1 kstailey 339: } else {
340: unputstr(buf);
341: RET(INDIRECT);
342: }
1.14 millert 343:
1.1 kstailey 344: case '}':
345: if (--bracecnt < 0)
1.4 millert 346: SYNTAX( "extra }" );
1.16 millert 347: sc = true;
1.1 kstailey 348: RET(';');
349: case ']':
350: if (--brackcnt < 0)
1.4 millert 351: SYNTAX( "extra ]" );
1.1 kstailey 352: RET(']');
353: case ')':
354: if (--parencnt < 0)
1.4 millert 355: SYNTAX( "extra )" );
1.1 kstailey 356: RET(')');
357: case '{':
358: bracecnt++;
359: RET('{');
360: case '[':
361: brackcnt++;
362: RET('[');
363: case '(':
364: parencnt++;
365: RET('(');
1.14 millert 366:
1.1 kstailey 367: case '"':
368: return string(); /* BUG: should be like tran.c ? */
1.14 millert 369:
1.1 kstailey 370: default:
371: RET(c);
372: }
373: }
374: }
375:
1.3 millert 376: int string(void)
1.1 kstailey 377: {
378: int c, n;
379: char *s, *bp;
1.14 millert 380: static char *buf = NULL;
1.1 kstailey 381: static int bufsz = 500;
382:
1.15 millert 383: if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.4 millert 384: FATAL("out of space for strings");
1.1 kstailey 385: for (bp = buf; (c = input()) != '"'; ) {
1.11 millert 386: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
1.4 millert 387: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 388: switch (c) {
389: case '\n':
390: case '\r':
391: case 0:
1.13 millert 392: *bp = '\0';
1.4 millert 393: SYNTAX( "non-terminated string %.10s...", buf );
1.6 millert 394: if (c == 0) /* hopeless */
395: FATAL( "giving up" );
1.13 millert 396: lineno++;
1.1 kstailey 397: break;
398: case '\\':
399: c = input();
400: switch (c) {
1.19 millert 401: case '\n': break;
1.1 kstailey 402: case '"': *bp++ = '"'; break;
1.14 millert 403: case 'n': *bp++ = '\n'; break;
1.1 kstailey 404: case 't': *bp++ = '\t'; break;
405: case 'f': *bp++ = '\f'; break;
406: case 'r': *bp++ = '\r'; break;
407: case 'b': *bp++ = '\b'; break;
408: case 'v': *bp++ = '\v'; break;
1.18 millert 409: case 'a': *bp++ = '\a'; break;
1.1 kstailey 410: case '\\': *bp++ = '\\'; break;
411:
412: case '0': case '1': case '2': /* octal: \d \dd \ddd */
413: case '3': case '4': case '5': case '6': case '7':
414: n = c - '0';
415: if ((c = peek()) >= '0' && c < '8') {
416: n = 8 * n + input() - '0';
417: if ((c = peek()) >= '0' && c < '8')
418: n = 8 * n + input() - '0';
419: }
420: *bp++ = n;
421: break;
422:
423: case 'x': /* hex \x0-9a-fA-F + */
424: { char xbuf[100], *px;
425: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
426: if (isdigit(c)
427: || (c >= 'a' && c <= 'f')
428: || (c >= 'A' && c <= 'F'))
429: *px++ = c;
430: else
431: break;
432: }
433: *px = 0;
434: unput(c);
1.12 millert 435: sscanf(xbuf, "%x", (unsigned int *) &n);
1.1 kstailey 436: *bp++ = n;
437: break;
438: }
439:
1.14 millert 440: default:
1.1 kstailey 441: *bp++ = c;
442: break;
443: }
444: break;
445: default:
446: *bp++ = c;
447: break;
448: }
449: }
1.14 millert 450: *bp = 0;
1.1 kstailey 451: s = tostring(buf);
1.17 millert 452: *bp++ = ' '; *bp++ = '\0';
1.1 kstailey 453: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
1.17 millert 454: free(s);
1.1 kstailey 455: RET(STRING);
456: }
457:
458:
1.18 millert 459: static int binsearch(char *w, const Keyword *kp, int n)
1.1 kstailey 460: {
461: int cond, low, mid, high;
462:
463: low = 0;
464: high = n - 1;
465: while (low <= high) {
466: mid = (low + high) / 2;
467: if ((cond = strcmp(w, kp[mid].word)) < 0)
468: high = mid - 1;
469: else if (cond > 0)
470: low = mid + 1;
471: else
472: return mid;
473: }
474: return -1;
475: }
476:
1.14 millert 477: int word(char *w)
1.1 kstailey 478: {
1.18 millert 479: const Keyword *kp;
1.1 kstailey 480: int c, n;
481:
482: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
483: if (n != -1) { /* found in table */
1.14 millert 484: kp = keywords + n;
1.1 kstailey 485: yylval.i = kp->sub;
486: switch (kp->type) { /* special handling */
1.11 millert 487: case BLTIN:
488: if (kp->sub == FSYSTEM && safe)
1.4 millert 489: SYNTAX( "system is unsafe" );
1.1 kstailey 490: RET(kp->type);
491: case FUNC:
492: if (infunc)
1.4 millert 493: SYNTAX( "illegal nested function" );
1.1 kstailey 494: RET(kp->type);
495: case RETURN:
496: if (!infunc)
1.4 millert 497: SYNTAX( "return not in function" );
1.1 kstailey 498: RET(kp->type);
499: case VARNF:
500: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
501: RET(VARNF);
502: default:
503: RET(kp->type);
504: }
505: }
506: c = peek(); /* look for '(' */
507: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
508: yylval.i = n;
509: RET(ARG);
510: } else {
511: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
512: if (c == '(') {
513: RET(CALL);
514: } else {
515: RET(VAR);
516: }
517: }
518: }
519:
1.6 millert 520: void startreg(void) /* next call to yylex will return a regular expression */
1.1 kstailey 521: {
1.16 millert 522: reg = true;
1.1 kstailey 523: }
524:
1.3 millert 525: int regexpr(void)
1.1 kstailey 526: {
1.9 hugh 527: int c, openclass = 0;
1.14 millert 528: static char *buf = NULL;
1.1 kstailey 529: static int bufsz = 500;
1.20 millert 530: char *bp, *cstart;
1.1 kstailey 531:
1.15 millert 532: if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.4 millert 533: FATAL("out of space for rex expr");
1.1 kstailey 534: bp = buf;
1.20 millert 535: for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
1.11 millert 536: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
1.4 millert 537: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 538: if (c == '\n') {
1.13 millert 539: *bp = '\0';
1.14 millert 540: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 541: unput('\n');
542: break;
543: } else if (c == '\\') {
1.14 millert 544: *bp++ = '\\';
1.1 kstailey 545: *bp++ = input();
546: } else {
1.20 millert 547: /*
548: * POSIX requires a slash in a regexp to be escaped,
549: * other awks don't require it to be escaped inside
550: * a character class.
551: */
552: if (!do_posix) {
553: if (c == '[') {
1.22 millert 554: if (openclass == 0 || peek() == ':') {
555: if (++openclass == 1)
556: cstart = bp;
557: }
1.20 millert 558: } else if (c == ']' && openclass > 0) {
559: /*
560: * A ']' as the first char in a
561: * class is treated literally.
562: */
563: if (cstart != bp - 1 &&
564: (cstart != bp - 2 || bp[-1] != '^'))
565: openclass--;
566: }
567: }
1.1 kstailey 568: *bp++ = c;
569: }
570: }
571: *bp = 0;
1.8 millert 572: if (c == 0)
573: SYNTAX("non-terminated regular expression %.10s...", buf);
1.1 kstailey 574: yylval.s = tostring(buf);
575: unput('/');
576: RET(REGEXPR);
577: }
578:
579: /* low-level lexical stuff, sort of inherited from lex */
580:
/* Circular record of the characters handed out by input(). */
char	ebuf[300];
char	*ep = ebuf;		/* next slot in ebuf to be written */

/* Stack of characters returned by unput(), re-read first by input(). */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed character */

FILE	*yyin = NULL;
1.1 kstailey 586:
587: int input(void) /* get next lexical input character */
588: {
589: int c;
590: extern char *lexprog;
591:
592: if (yysptr > yysbuf)
1.8 millert 593: c = (uschar)*--yysptr;
1.1 kstailey 594: else if (lexprog != NULL) { /* awk '...' */
1.8 millert 595: if ((c = (uschar)*lexprog) != 0)
1.1 kstailey 596: lexprog++;
597: } else /* awk -f ... */
598: c = pgetc();
1.13 millert 599: if (c == EOF)
1.1 kstailey 600: c = 0;
601: if (ep >= ebuf + sizeof ebuf)
602: ep = ebuf;
1.13 millert 603: *ep = c;
604: if (c != 0) {
605: ep++;
606: }
607: return (c);
1.1 kstailey 608: }
609:
610: void unput(int c) /* put lexical character back on input */
611: {
1.18 millert 612: if (c == '\n')
613: lineno--;
1.1 kstailey 614: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 615: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 616: *yysptr++ = c;
617: if (--ep < ebuf)
618: ep = ebuf + sizeof(ebuf) - 1;
619: }
620:
/*
 * unputstr - put the string s back on input.
 *
 * The characters are pushed last-to-first with unput(), so that input()
 * hands them out again in their original order.
 */
void unputstr(const char *s)
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
627: }