Annotation of src/usr.bin/awk/lex.c, Revision 1.29
1.29 ! millert 1: /* $OpenBSD: lex.c,v 1.28 2022/09/01 15:21:28 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
1.25 millert 31: #include "awkgram.tab.h"
1.1 kstailey 32:
33: extern YYSTYPE yylval;
1.19 millert 34: extern bool infunc;
1.1 kstailey 35:
36: int lineno = 1;
37: int bracecnt = 0;
38: int brackcnt = 0;
39: int parencnt = 0;
40:
/*
 * One entry of the reserved-word table searched by word().
 */
typedef struct Keyword {
	const char	*word;	/* spelling matched against the input word */
	int		sub;	/* stored in yylval.i when the word matches */
	int		type;	/* token type returned to the parser */
} Keyword;
46:
1.18 millert 47: const Keyword keywords[] = { /* keep sorted: binary searched */
1.1 kstailey 48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
1.10 pyr 51: { "and", FAND, BLTIN },
1.1 kstailey 52: { "atan2", FATAN, BLTIN },
53: { "break", BREAK, BREAK },
54: { "close", CLOSE, CLOSE },
1.10 pyr 55: { "compl", FCOMPL, BLTIN },
1.1 kstailey 56: { "continue", CONTINUE, CONTINUE },
57: { "cos", FCOS, BLTIN },
58: { "delete", DELETE, DELETE },
59: { "do", DO, DO },
60: { "else", ELSE, ELSE },
61: { "exit", EXIT, EXIT },
62: { "exp", FEXP, BLTIN },
63: { "fflush", FFLUSH, BLTIN },
64: { "for", FOR, FOR },
65: { "func", FUNC, FUNC },
66: { "function", FUNC, FUNC },
1.21 millert 67: { "gensub", GENSUB, GENSUB },
1.1 kstailey 68: { "getline", GETLINE, GETLINE },
69: { "gsub", GSUB, GSUB },
70: { "if", IF, IF },
71: { "in", IN, IN },
72: { "index", INDEX, INDEX },
73: { "int", FINT, BLTIN },
74: { "length", FLENGTH, BLTIN },
75: { "log", FLOG, BLTIN },
1.10 pyr 76: { "lshift", FLSHIFT, BLTIN },
1.1 kstailey 77: { "match", MATCHFCN, MATCHFCN },
1.26 millert 78: { "mktime", FMKTIME, BLTIN },
1.1 kstailey 79: { "next", NEXT, NEXT },
80: { "nextfile", NEXTFILE, NEXTFILE },
1.10 pyr 81: { "or", FFOR, BLTIN },
1.1 kstailey 82: { "print", PRINT, PRINT },
83: { "printf", PRINTF, PRINTF },
84: { "rand", FRAND, BLTIN },
85: { "return", RETURN, RETURN },
1.10 pyr 86: { "rshift", FRSHIFT, BLTIN },
1.1 kstailey 87: { "sin", FSIN, BLTIN },
88: { "split", SPLIT, SPLIT },
89: { "sprintf", SPRINTF, SPRINTF },
90: { "sqrt", FSQRT, BLTIN },
91: { "srand", FSRAND, BLTIN },
1.21 millert 92: { "strftime", FSTRFTIME, BLTIN },
1.1 kstailey 93: { "sub", SUB, SUB },
94: { "substr", SUBSTR, SUBSTR },
95: { "system", FSYSTEM, BLTIN },
1.21 millert 96: { "systime", FSYSTIME, BLTIN },
1.1 kstailey 97: { "tolower", FTOLOWER, BLTIN },
98: { "toupper", FTOUPPER, BLTIN },
99: { "while", WHILE, WHILE },
1.10 pyr 100: { "xor", FXOR, BLTIN },
1.1 kstailey 101: };
102:
/*
 * RET(x): return token x from the enclosing function.  When dbg is set
 * the token's name (from tokname) is printed first.
 * The expansion is a braced block, not a do/while, and x is evaluated
 * twice, so pass only side-effect-free expressions.
 */
#define	RET(x)	{ \
	if (dbg) \
		printf("lex %s\n", tokname(x)); \
	return (x); \
}
1.7 deraadt 104:
/*
 * peek: report the next input character without consuming it.
 * Implemented as input() followed by unput(), so it has unput()'s side
 * effects: in particular, peeking at a '\n' decrements lineno.
 * Returns 0 at end of input.
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
111:
1.18 millert 112: static int gettok(char **pbuf, int *psz) /* get next input token */
1.1 kstailey 113: {
1.6 millert 114: int c, retc;
1.1 kstailey 115: char *buf = *pbuf;
116: int sz = *psz;
117: char *bp = buf;
118:
119: c = input();
120: if (c == 0)
121: return 0;
122: buf[0] = c;
123: buf[1] = 0;
124: if (!isalnum(c) && c != '.' && c != '_')
125: return c;
126:
127: *bp++ = c;
128: if (isalpha(c) || c == '_') { /* it's a varname */
129: for ( ; (c = input()) != 0; ) {
130: if (bp-buf >= sz)
1.11 millert 131: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
1.4 millert 132: FATAL( "out of space for name %.10s...", buf );
1.1 kstailey 133: if (isalnum(c) || c == '_')
134: *bp++ = c;
135: else {
136: *bp = 0;
137: unput(c);
138: break;
139: }
140: }
1.4 millert 141: *bp = 0;
1.6 millert 142: retc = 'a'; /* alphanumeric */
1.11 millert 143: } else { /* maybe it's a number, but could be . */
1.1 kstailey 144: char *rem;
145: /* read input until can't be a number */
146: for ( ; (c = input()) != 0; ) {
147: if (bp-buf >= sz)
1.11 millert 148: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
1.4 millert 149: FATAL( "out of space for number %.10s...", buf );
1.14 millert 150: if (isdigit(c) || c == 'e' || c == 'E'
1.1 kstailey 151: || c == '.' || c == '+' || c == '-')
152: *bp++ = c;
153: else {
154: unput(c);
155: break;
156: }
157: }
1.2 millert 158: *bp = 0;
1.1 kstailey 159: strtod(buf, &rem); /* parse the number */
1.6 millert 160: if (rem == buf) { /* it wasn't a valid number at all */
1.11 millert 161: buf[1] = 0; /* return one character as token */
1.23 millert 162: retc = (uschar)buf[0]; /* character is its own type */
1.11 millert 163: unputstr(rem+1); /* put rest back for later */
1.6 millert 164: } else { /* some prefix was a number */
1.11 millert 165: unputstr(rem); /* put rest back for later */
166: rem[0] = 0; /* truncate buf after number part */
167: retc = '0'; /* type is number */
1.6 millert 168: }
1.1 kstailey 169: }
170: *pbuf = buf;
171: *psz = sz;
1.6 millert 172: return retc;
1.1 kstailey 173: }
174:
/* Helpers defined further down; each returns a token type. */
int	word(char *w);		/* classify an identifier or keyword */
int	string(void);		/* scan the rest of a "..." literal */
int	regexpr(void);		/* scan the rest of a /.../ regular expression */

/* One-shot requests honoured at the start of the next yylex() call. */
bool	sc = false;		/* true => return a } right now */
bool	reg = false;		/* true => return a REGEXPR now */
1.1 kstailey 180:
1.3 millert 181: int yylex(void)
1.1 kstailey 182: {
1.3 millert 183: int c;
1.14 millert 184: static char *buf = NULL;
1.11 millert 185: static int bufsize = 5; /* BUG: setting this small causes core dump! */
1.1 kstailey 186:
1.27 millert 187: if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
1.4 millert 188: FATAL( "out of space in yylex" );
1.1 kstailey 189: if (sc) {
1.16 millert 190: sc = false;
1.1 kstailey 191: RET('}');
192: }
193: if (reg) {
1.16 millert 194: reg = false;
1.1 kstailey 195: return regexpr();
196: }
197: for (;;) {
198: c = gettok(&buf, &bufsize);
199: if (c == 0)
200: return 0;
201: if (isalpha(c) || c == '_')
202: return word(buf);
1.6 millert 203: if (isdigit(c)) {
1.17 millert 204: char *cp = tostring(buf);
1.27 millert 205: double result;
206:
207: if (is_number(cp, & result))
208: yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
209: else
210: yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
1.17 millert 211: free(cp);
1.1 kstailey 212: /* should this also have STR set? */
213: RET(NUMBER);
214: }
1.14 millert 215:
1.1 kstailey 216: yylval.i = c;
217: switch (c) {
218: case '\n': /* {EOL} */
1.13 millert 219: lineno++;
1.1 kstailey 220: RET(NL);
221: case '\r': /* assume \n is coming */
222: case ' ': /* {WS}+ */
223: case '\t':
224: break;
225: case '#': /* #.* strip comments */
226: while ((c = input()) != '\n' && c != 0)
227: ;
228: unput(c);
1.19 millert 229: /*
230: * Next line is a hack, itcompensates for
231: * unput's treatment of \n.
232: */
233: lineno++;
1.1 kstailey 234: break;
235: case ';':
236: RET(';');
237: case '\\':
238: if (peek() == '\n') {
1.3 millert 239: input();
1.13 millert 240: lineno++;
1.1 kstailey 241: } else if (peek() == '\r') {
242: input(); input(); /* \n */
243: lineno++;
244: } else {
245: RET(c);
246: }
247: break;
248: case '&':
249: if (peek() == '&') {
250: input(); RET(AND);
1.14 millert 251: } else
1.1 kstailey 252: RET('&');
253: case '|':
254: if (peek() == '|') {
255: input(); RET(BOR);
256: } else
257: RET('|');
258: case '!':
259: if (peek() == '=') {
260: input(); yylval.i = NE; RET(NE);
261: } else if (peek() == '~') {
262: input(); yylval.i = NOTMATCH; RET(MATCHOP);
263: } else
264: RET(NOT);
265: case '~':
266: yylval.i = MATCH;
267: RET(MATCHOP);
268: case '<':
269: if (peek() == '=') {
270: input(); yylval.i = LE; RET(LE);
271: } else {
272: yylval.i = LT; RET(LT);
273: }
274: case '=':
275: if (peek() == '=') {
276: input(); yylval.i = EQ; RET(EQ);
277: } else {
278: yylval.i = ASSIGN; RET(ASGNOP);
279: }
280: case '>':
281: if (peek() == '=') {
282: input(); yylval.i = GE; RET(GE);
283: } else if (peek() == '>') {
284: input(); yylval.i = APPEND; RET(APPEND);
285: } else {
286: yylval.i = GT; RET(GT);
287: }
288: case '+':
289: if (peek() == '+') {
290: input(); yylval.i = INCR; RET(INCR);
291: } else if (peek() == '=') {
292: input(); yylval.i = ADDEQ; RET(ASGNOP);
293: } else
294: RET('+');
295: case '-':
296: if (peek() == '-') {
297: input(); yylval.i = DECR; RET(DECR);
298: } else if (peek() == '=') {
299: input(); yylval.i = SUBEQ; RET(ASGNOP);
300: } else
301: RET('-');
302: case '*':
303: if (peek() == '=') { /* *= */
304: input(); yylval.i = MULTEQ; RET(ASGNOP);
305: } else if (peek() == '*') { /* ** or **= */
306: input(); /* eat 2nd * */
307: if (peek() == '=') {
308: input(); yylval.i = POWEQ; RET(ASGNOP);
309: } else {
310: RET(POWER);
311: }
312: } else
313: RET('*');
314: case '/':
1.3 millert 315: RET('/');
1.1 kstailey 316: case '%':
317: if (peek() == '=') {
318: input(); yylval.i = MODEQ; RET(ASGNOP);
319: } else
320: RET('%');
321: case '^':
322: if (peek() == '=') {
323: input(); yylval.i = POWEQ; RET(ASGNOP);
324: } else
325: RET(POWER);
1.5 millert 326:
1.1 kstailey 327: case '$':
328: /* BUG: awkward, if not wrong */
329: c = gettok(&buf, &bufsize);
1.5 millert 330: if (isalpha(c)) {
1.1 kstailey 331: if (strcmp(buf, "NF") == 0) { /* very special */
332: unputstr("(NF)");
1.5 millert 333: RET(INDIRECT);
334: }
335: c = peek();
336: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
337: unputstr(buf);
1.1 kstailey 338: RET(INDIRECT);
339: }
340: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
341: RET(IVAR);
1.6 millert 342: } else if (c == 0) { /* */
343: SYNTAX( "unexpected end of input after $" );
344: RET(';');
1.1 kstailey 345: } else {
346: unputstr(buf);
347: RET(INDIRECT);
348: }
1.14 millert 349:
1.1 kstailey 350: case '}':
351: if (--bracecnt < 0)
1.4 millert 352: SYNTAX( "extra }" );
1.16 millert 353: sc = true;
1.1 kstailey 354: RET(';');
355: case ']':
356: if (--brackcnt < 0)
1.4 millert 357: SYNTAX( "extra ]" );
1.1 kstailey 358: RET(']');
359: case ')':
360: if (--parencnt < 0)
1.4 millert 361: SYNTAX( "extra )" );
1.1 kstailey 362: RET(')');
363: case '{':
364: bracecnt++;
365: RET('{');
366: case '[':
367: brackcnt++;
368: RET('[');
369: case '(':
370: parencnt++;
371: RET('(');
1.14 millert 372:
1.1 kstailey 373: case '"':
374: return string(); /* BUG: should be like tran.c ? */
1.14 millert 375:
1.1 kstailey 376: default:
377: RET(c);
378: }
379: }
380: }
381:
1.3 millert 382: int string(void)
1.1 kstailey 383: {
384: int c, n;
385: char *s, *bp;
1.14 millert 386: static char *buf = NULL;
1.1 kstailey 387: static int bufsz = 500;
388:
1.27 millert 389: if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 390: FATAL("out of space for strings");
1.1 kstailey 391: for (bp = buf; (c = input()) != '"'; ) {
1.11 millert 392: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
1.4 millert 393: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 394: switch (c) {
395: case '\n':
396: case '\r':
397: case 0:
1.13 millert 398: *bp = '\0';
1.4 millert 399: SYNTAX( "non-terminated string %.10s...", buf );
1.6 millert 400: if (c == 0) /* hopeless */
401: FATAL( "giving up" );
1.13 millert 402: lineno++;
1.1 kstailey 403: break;
404: case '\\':
405: c = input();
406: switch (c) {
1.19 millert 407: case '\n': break;
1.1 kstailey 408: case '"': *bp++ = '"'; break;
1.14 millert 409: case 'n': *bp++ = '\n'; break;
1.1 kstailey 410: case 't': *bp++ = '\t'; break;
411: case 'f': *bp++ = '\f'; break;
412: case 'r': *bp++ = '\r'; break;
413: case 'b': *bp++ = '\b'; break;
414: case 'v': *bp++ = '\v'; break;
1.18 millert 415: case 'a': *bp++ = '\a'; break;
1.1 kstailey 416: case '\\': *bp++ = '\\'; break;
417:
418: case '0': case '1': case '2': /* octal: \d \dd \ddd */
419: case '3': case '4': case '5': case '6': case '7':
420: n = c - '0';
421: if ((c = peek()) >= '0' && c < '8') {
422: n = 8 * n + input() - '0';
423: if ((c = peek()) >= '0' && c < '8')
424: n = 8 * n + input() - '0';
425: }
426: *bp++ = n;
427: break;
428:
429: case 'x': /* hex \x0-9a-fA-F + */
1.29 ! millert 430: {
! 431: int i;
! 432:
! 433: n = 0;
! 434: for (i = 1; i <= 2; i++) {
! 435: c = input();
! 436: if (c == 0)
! 437: break;
! 438: if (isxdigit(c)) {
! 439: c = tolower(c);
! 440: n *= 16;
! 441: if (isdigit(c))
! 442: n += (c - '0');
! 443: else
! 444: n += 10 + (c - 'a');
! 445: } else
1.1 kstailey 446: break;
447: }
1.29 ! millert 448: if (n)
! 449: *bp++ = n;
! 450: else
! 451: unput(c);
1.1 kstailey 452: break;
453: }
454:
1.14 millert 455: default:
1.1 kstailey 456: *bp++ = c;
457: break;
458: }
459: break;
460: default:
461: *bp++ = c;
462: break;
463: }
464: }
1.14 millert 465: *bp = 0;
1.1 kstailey 466: s = tostring(buf);
1.17 millert 467: *bp++ = ' '; *bp++ = '\0';
1.1 kstailey 468: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
1.17 millert 469: free(s);
1.1 kstailey 470: RET(STRING);
471: }
472:
473:
1.18 millert 474: static int binsearch(char *w, const Keyword *kp, int n)
1.1 kstailey 475: {
476: int cond, low, mid, high;
477:
478: low = 0;
479: high = n - 1;
480: while (low <= high) {
481: mid = (low + high) / 2;
482: if ((cond = strcmp(w, kp[mid].word)) < 0)
483: high = mid - 1;
484: else if (cond > 0)
485: low = mid + 1;
486: else
487: return mid;
488: }
489: return -1;
490: }
491:
1.14 millert 492: int word(char *w)
1.1 kstailey 493: {
1.18 millert 494: const Keyword *kp;
1.1 kstailey 495: int c, n;
496:
497: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
498: if (n != -1) { /* found in table */
1.14 millert 499: kp = keywords + n;
1.1 kstailey 500: yylval.i = kp->sub;
501: switch (kp->type) { /* special handling */
1.11 millert 502: case BLTIN:
503: if (kp->sub == FSYSTEM && safe)
1.4 millert 504: SYNTAX( "system is unsafe" );
1.1 kstailey 505: RET(kp->type);
506: case FUNC:
507: if (infunc)
1.4 millert 508: SYNTAX( "illegal nested function" );
1.1 kstailey 509: RET(kp->type);
510: case RETURN:
511: if (!infunc)
1.4 millert 512: SYNTAX( "return not in function" );
1.1 kstailey 513: RET(kp->type);
514: case VARNF:
515: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
516: RET(VARNF);
517: default:
518: RET(kp->type);
519: }
520: }
521: c = peek(); /* look for '(' */
522: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
523: yylval.i = n;
524: RET(ARG);
525: } else {
526: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
527: if (c == '(') {
528: RET(CALL);
529: } else {
530: RET(VAR);
531: }
532: }
533: }
534:
1.6 millert 535: void startreg(void) /* next call to yylex will return a regular expression */
1.1 kstailey 536: {
1.16 millert 537: reg = true;
1.1 kstailey 538: }
539:
1.3 millert 540: int regexpr(void)
1.1 kstailey 541: {
1.9 hugh 542: int c, openclass = 0;
1.14 millert 543: static char *buf = NULL;
1.1 kstailey 544: static int bufsz = 500;
1.20 millert 545: char *bp, *cstart;
1.1 kstailey 546:
1.27 millert 547: if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
1.28 millert 548: FATAL("out of space for reg expr");
1.1 kstailey 549: bp = buf;
1.20 millert 550: for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
1.11 millert 551: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
1.4 millert 552: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 553: if (c == '\n') {
1.13 millert 554: *bp = '\0';
1.14 millert 555: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 556: unput('\n');
557: break;
558: } else if (c == '\\') {
1.14 millert 559: *bp++ = '\\';
1.1 kstailey 560: *bp++ = input();
561: } else {
1.20 millert 562: /*
563: * POSIX requires a slash in a regexp to be escaped,
564: * other awks don't require it to be escaped inside
565: * a character class.
566: */
567: if (!do_posix) {
568: if (c == '[') {
1.24 millert 569: int nextc = peek();
570: if (openclass == 0 || nextc == ':' ||
571: nextc == '.' || nextc == '=') {
1.22 millert 572: if (++openclass == 1)
573: cstart = bp;
574: }
1.20 millert 575: } else if (c == ']' && openclass > 0) {
576: /*
577: * A ']' as the first char in a
578: * class is treated literally.
579: */
580: if (cstart != bp - 1 &&
581: (cstart != bp - 2 || bp[-1] != '^'))
582: openclass--;
583: }
584: }
1.1 kstailey 585: *bp++ = c;
586: }
587: }
588: *bp = 0;
1.8 millert 589: if (c == 0)
590: SYNTAX("non-terminated regular expression %.10s...", buf);
1.28 millert 591: yylval.s = buf;
1.1 kstailey 592: unput('/');
593: RET(REGEXPR);
594: }
595:
/* low-level lexical stuff, sort of inherited from lex */

/* Ring of the most recently read characters; input() stores, unput() backs up. */
char	ebuf[300];
char	*ep = ebuf;		/* slot the next character read goes into */

/* Pushback stack: unput() pushes, input() pops before reading anything new. */
char	yysbuf[100];
char	*yysptr = yysbuf;	/* one past the top of the stack */

FILE	*yyin = NULL;		/* not used in this part of the file; presumably
				 * the program file read by pgetc() */
1.1 kstailey 603:
604: int input(void) /* get next lexical input character */
605: {
606: int c;
607: extern char *lexprog;
608:
609: if (yysptr > yysbuf)
1.8 millert 610: c = (uschar)*--yysptr;
1.1 kstailey 611: else if (lexprog != NULL) { /* awk '...' */
1.8 millert 612: if ((c = (uschar)*lexprog) != 0)
1.1 kstailey 613: lexprog++;
614: } else /* awk -f ... */
615: c = pgetc();
1.13 millert 616: if (c == EOF)
1.1 kstailey 617: c = 0;
618: if (ep >= ebuf + sizeof ebuf)
619: ep = ebuf;
1.13 millert 620: *ep = c;
621: if (c != 0) {
622: ep++;
623: }
624: return (c);
1.1 kstailey 625: }
626:
627: void unput(int c) /* put lexical character back on input */
628: {
1.18 millert 629: if (c == '\n')
630: lineno--;
1.1 kstailey 631: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 632: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 633: *yysptr++ = c;
634: if (--ep < ebuf)
635: ep = ebuf + sizeof(ebuf) - 1;
636: }
637:
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that input() returns them in
 * their original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)
{
	int left = strlen(s);

	while (--left >= 0)
		unput(s[left]);
}
644: }