Annotation of src/usr.bin/awk/lex.c, Revision 1.19
1.19 ! millert 1: /* $OpenBSD: lex.c,v 1.18 2020/06/10 21:05:02 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
31: #include "ytab.h"
32:
33: extern YYSTYPE yylval;
1.19 ! millert 34: extern bool infunc;
1.1 kstailey 35:
/* Lexer-wide counters; all have external linkage. */
int	lineno = 1;	/* current source line number, counted from 1 */
int	bracecnt = 0,	/* '{' seen minus '}' seen by yylex() */
	brackcnt = 0,	/* '[' seen minus ']' seen by yylex() */
	parencnt = 0;	/* '(' seen minus ')' seen by yylex() */
40:
/*
 * One entry of the reserved-word table searched by word().
 * Field order matters: the table uses it for its initializers.
 */
struct Keyword {
	const char *word;	/* spelling of the reserved word */
	int sub;		/* value word() stores in yylval.i on a match */
	int type;		/* token type word() returns for this entry */
};
typedef struct Keyword Keyword;
46:
1.18 millert 47: const Keyword keywords[] = { /* keep sorted: binary searched */
1.1 kstailey 48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
1.10 pyr 51: { "and", FAND, BLTIN },
1.1 kstailey 52: { "atan2", FATAN, BLTIN },
53: { "break", BREAK, BREAK },
54: { "close", CLOSE, CLOSE },
1.10 pyr 55: { "compl", FCOMPL, BLTIN },
1.1 kstailey 56: { "continue", CONTINUE, CONTINUE },
57: { "cos", FCOS, BLTIN },
58: { "delete", DELETE, DELETE },
59: { "do", DO, DO },
60: { "else", ELSE, ELSE },
61: { "exit", EXIT, EXIT },
62: { "exp", FEXP, BLTIN },
63: { "fflush", FFLUSH, BLTIN },
64: { "for", FOR, FOR },
65: { "func", FUNC, FUNC },
66: { "function", FUNC, FUNC },
67: { "getline", GETLINE, GETLINE },
68: { "gsub", GSUB, GSUB },
69: { "if", IF, IF },
70: { "in", IN, IN },
71: { "index", INDEX, INDEX },
72: { "int", FINT, BLTIN },
73: { "length", FLENGTH, BLTIN },
74: { "log", FLOG, BLTIN },
1.10 pyr 75: { "lshift", FLSHIFT, BLTIN },
1.1 kstailey 76: { "match", MATCHFCN, MATCHFCN },
77: { "next", NEXT, NEXT },
78: { "nextfile", NEXTFILE, NEXTFILE },
1.10 pyr 79: { "or", FFOR, BLTIN },
1.1 kstailey 80: { "print", PRINT, PRINT },
81: { "printf", PRINTF, PRINTF },
82: { "rand", FRAND, BLTIN },
83: { "return", RETURN, RETURN },
1.10 pyr 84: { "rshift", FRSHIFT, BLTIN },
1.1 kstailey 85: { "sin", FSIN, BLTIN },
86: { "split", SPLIT, SPLIT },
87: { "sprintf", SPRINTF, SPRINTF },
88: { "sqrt", FSQRT, BLTIN },
89: { "srand", FSRAND, BLTIN },
90: { "sub", SUB, SUB },
91: { "substr", SUBSTR, SUBSTR },
92: { "system", FSYSTEM, BLTIN },
93: { "tolower", FTOLOWER, BLTIN },
94: { "toupper", FTOUPPER, BLTIN },
95: { "while", WHILE, WHILE },
1.10 pyr 96: { "xor", FXOR, BLTIN },
1.1 kstailey 97: };
98:
/*
 * RET(x): return token x from the enclosing function, first printing
 * its name (via tokname) when dbg is set.
 *
 * The body is wrapped in do { } while (0) so that "RET(x);" is exactly
 * one statement and can be used as the unbraced body of an if/else.
 * x is evaluated twice when dbg is set, so it must have no side effects.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return(x); } while (0)
1.7 deraadt 100:
/*
 * peek: return the next input character without consuming it.
 * The character is read with input() and immediately handed back to
 * unput(), so any side effect unput() has (it adjusts lineno for '\n')
 * also happens here.
 */
static int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
107:
/*
 * gettok: read the next raw token from the input into the caller's
 * growable buffer.
 *
 * pbuf, psz	address of the buffer pointer and of its size; the buffer
 *		must hold at least 2 bytes.  Both are rewritten on the
 *		name/number paths because adjbuf() may have moved or
 *		enlarged the buffer.
 *
 * Returns
 *	0	end of input
 *	'a'	buffer holds a name: [A-Za-z_][A-Za-z0-9_]*
 *	'0'	buffer holds the text of a number
 *	c	any other single character c; buffer holds "c"
 */
static int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating
			 * NUL; otherwise a name that exactly fills the buffer
			 * at end of input makes "*bp = 0" below write one
			 * byte past the allocation.
			 */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same invariant as above: character + NUL must fit */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: rem+1 points
			 * at buf[1], so zeroing buf[1] first would push back
			 * an empty string and silently drop the characters.
			 */
			unputstr(rem+1); /* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
170:
/* Helpers defined later in this file but called from yylex(). */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags tested and cleared at the top of yylex(). */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
1.1 kstailey 176:
1.3 millert 177: int yylex(void)
1.1 kstailey 178: {
1.3 millert 179: int c;
1.14 millert 180: static char *buf = NULL;
1.11 millert 181: static int bufsize = 5; /* BUG: setting this small causes core dump! */
1.1 kstailey 182:
1.15 millert 183: if (buf == NULL && (buf = malloc(bufsize)) == NULL)
1.4 millert 184: FATAL( "out of space in yylex" );
1.1 kstailey 185: if (sc) {
1.16 millert 186: sc = false;
1.1 kstailey 187: RET('}');
188: }
189: if (reg) {
1.16 millert 190: reg = false;
1.1 kstailey 191: return regexpr();
192: }
193: for (;;) {
194: c = gettok(&buf, &bufsize);
195: if (c == 0)
196: return 0;
197: if (isalpha(c) || c == '_')
198: return word(buf);
1.6 millert 199: if (isdigit(c)) {
1.17 millert 200: char *cp = tostring(buf);
201: yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
202: free(cp);
1.1 kstailey 203: /* should this also have STR set? */
204: RET(NUMBER);
205: }
1.14 millert 206:
1.1 kstailey 207: yylval.i = c;
208: switch (c) {
209: case '\n': /* {EOL} */
1.13 millert 210: lineno++;
1.1 kstailey 211: RET(NL);
212: case '\r': /* assume \n is coming */
213: case ' ': /* {WS}+ */
214: case '\t':
215: break;
216: case '#': /* #.* strip comments */
217: while ((c = input()) != '\n' && c != 0)
218: ;
219: unput(c);
1.19 ! millert 220: /*
! 221: * Next line is a hack, itcompensates for
! 222: * unput's treatment of \n.
! 223: */
! 224: lineno++;
1.1 kstailey 225: break;
226: case ';':
227: RET(';');
228: case '\\':
229: if (peek() == '\n') {
1.3 millert 230: input();
1.13 millert 231: lineno++;
1.1 kstailey 232: } else if (peek() == '\r') {
233: input(); input(); /* \n */
234: lineno++;
235: } else {
236: RET(c);
237: }
238: break;
239: case '&':
240: if (peek() == '&') {
241: input(); RET(AND);
1.14 millert 242: } else
1.1 kstailey 243: RET('&');
244: case '|':
245: if (peek() == '|') {
246: input(); RET(BOR);
247: } else
248: RET('|');
249: case '!':
250: if (peek() == '=') {
251: input(); yylval.i = NE; RET(NE);
252: } else if (peek() == '~') {
253: input(); yylval.i = NOTMATCH; RET(MATCHOP);
254: } else
255: RET(NOT);
256: case '~':
257: yylval.i = MATCH;
258: RET(MATCHOP);
259: case '<':
260: if (peek() == '=') {
261: input(); yylval.i = LE; RET(LE);
262: } else {
263: yylval.i = LT; RET(LT);
264: }
265: case '=':
266: if (peek() == '=') {
267: input(); yylval.i = EQ; RET(EQ);
268: } else {
269: yylval.i = ASSIGN; RET(ASGNOP);
270: }
271: case '>':
272: if (peek() == '=') {
273: input(); yylval.i = GE; RET(GE);
274: } else if (peek() == '>') {
275: input(); yylval.i = APPEND; RET(APPEND);
276: } else {
277: yylval.i = GT; RET(GT);
278: }
279: case '+':
280: if (peek() == '+') {
281: input(); yylval.i = INCR; RET(INCR);
282: } else if (peek() == '=') {
283: input(); yylval.i = ADDEQ; RET(ASGNOP);
284: } else
285: RET('+');
286: case '-':
287: if (peek() == '-') {
288: input(); yylval.i = DECR; RET(DECR);
289: } else if (peek() == '=') {
290: input(); yylval.i = SUBEQ; RET(ASGNOP);
291: } else
292: RET('-');
293: case '*':
294: if (peek() == '=') { /* *= */
295: input(); yylval.i = MULTEQ; RET(ASGNOP);
296: } else if (peek() == '*') { /* ** or **= */
297: input(); /* eat 2nd * */
298: if (peek() == '=') {
299: input(); yylval.i = POWEQ; RET(ASGNOP);
300: } else {
301: RET(POWER);
302: }
303: } else
304: RET('*');
305: case '/':
1.3 millert 306: RET('/');
1.1 kstailey 307: case '%':
308: if (peek() == '=') {
309: input(); yylval.i = MODEQ; RET(ASGNOP);
310: } else
311: RET('%');
312: case '^':
313: if (peek() == '=') {
314: input(); yylval.i = POWEQ; RET(ASGNOP);
315: } else
316: RET(POWER);
1.5 millert 317:
1.1 kstailey 318: case '$':
319: /* BUG: awkward, if not wrong */
320: c = gettok(&buf, &bufsize);
1.5 millert 321: if (isalpha(c)) {
1.1 kstailey 322: if (strcmp(buf, "NF") == 0) { /* very special */
323: unputstr("(NF)");
1.5 millert 324: RET(INDIRECT);
325: }
326: c = peek();
327: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
328: unputstr(buf);
1.1 kstailey 329: RET(INDIRECT);
330: }
331: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
332: RET(IVAR);
1.6 millert 333: } else if (c == 0) { /* */
334: SYNTAX( "unexpected end of input after $" );
335: RET(';');
1.1 kstailey 336: } else {
337: unputstr(buf);
338: RET(INDIRECT);
339: }
1.14 millert 340:
1.1 kstailey 341: case '}':
342: if (--bracecnt < 0)
1.4 millert 343: SYNTAX( "extra }" );
1.16 millert 344: sc = true;
1.1 kstailey 345: RET(';');
346: case ']':
347: if (--brackcnt < 0)
1.4 millert 348: SYNTAX( "extra ]" );
1.1 kstailey 349: RET(']');
350: case ')':
351: if (--parencnt < 0)
1.4 millert 352: SYNTAX( "extra )" );
1.1 kstailey 353: RET(')');
354: case '{':
355: bracecnt++;
356: RET('{');
357: case '[':
358: brackcnt++;
359: RET('[');
360: case '(':
361: parencnt++;
362: RET('(');
1.14 millert 363:
1.1 kstailey 364: case '"':
365: return string(); /* BUG: should be like tran.c ? */
1.14 millert 366:
1.1 kstailey 367: default:
368: RET(c);
369: }
370: }
371: }
372:
1.3 millert 373: int string(void)
1.1 kstailey 374: {
375: int c, n;
376: char *s, *bp;
1.14 millert 377: static char *buf = NULL;
1.1 kstailey 378: static int bufsz = 500;
379:
1.15 millert 380: if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.4 millert 381: FATAL("out of space for strings");
1.1 kstailey 382: for (bp = buf; (c = input()) != '"'; ) {
1.11 millert 383: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
1.4 millert 384: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 385: switch (c) {
386: case '\n':
387: case '\r':
388: case 0:
1.13 millert 389: *bp = '\0';
1.4 millert 390: SYNTAX( "non-terminated string %.10s...", buf );
1.6 millert 391: if (c == 0) /* hopeless */
392: FATAL( "giving up" );
1.13 millert 393: lineno++;
1.1 kstailey 394: break;
395: case '\\':
396: c = input();
397: switch (c) {
1.19 ! millert 398: case '\n': break;
1.1 kstailey 399: case '"': *bp++ = '"'; break;
1.14 millert 400: case 'n': *bp++ = '\n'; break;
1.1 kstailey 401: case 't': *bp++ = '\t'; break;
402: case 'f': *bp++ = '\f'; break;
403: case 'r': *bp++ = '\r'; break;
404: case 'b': *bp++ = '\b'; break;
405: case 'v': *bp++ = '\v'; break;
1.18 millert 406: case 'a': *bp++ = '\a'; break;
1.1 kstailey 407: case '\\': *bp++ = '\\'; break;
408:
409: case '0': case '1': case '2': /* octal: \d \dd \ddd */
410: case '3': case '4': case '5': case '6': case '7':
411: n = c - '0';
412: if ((c = peek()) >= '0' && c < '8') {
413: n = 8 * n + input() - '0';
414: if ((c = peek()) >= '0' && c < '8')
415: n = 8 * n + input() - '0';
416: }
417: *bp++ = n;
418: break;
419:
420: case 'x': /* hex \x0-9a-fA-F + */
421: { char xbuf[100], *px;
422: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
423: if (isdigit(c)
424: || (c >= 'a' && c <= 'f')
425: || (c >= 'A' && c <= 'F'))
426: *px++ = c;
427: else
428: break;
429: }
430: *px = 0;
431: unput(c);
1.12 millert 432: sscanf(xbuf, "%x", (unsigned int *) &n);
1.1 kstailey 433: *bp++ = n;
434: break;
435: }
436:
1.14 millert 437: default:
1.1 kstailey 438: *bp++ = c;
439: break;
440: }
441: break;
442: default:
443: *bp++ = c;
444: break;
445: }
446: }
1.14 millert 447: *bp = 0;
1.1 kstailey 448: s = tostring(buf);
1.17 millert 449: *bp++ = ' '; *bp++ = '\0';
1.1 kstailey 450: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
1.17 millert 451: free(s);
1.1 kstailey 452: RET(STRING);
453: }
454:
455:
1.18 millert 456: static int binsearch(char *w, const Keyword *kp, int n)
1.1 kstailey 457: {
458: int cond, low, mid, high;
459:
460: low = 0;
461: high = n - 1;
462: while (low <= high) {
463: mid = (low + high) / 2;
464: if ((cond = strcmp(w, kp[mid].word)) < 0)
465: high = mid - 1;
466: else if (cond > 0)
467: low = mid + 1;
468: else
469: return mid;
470: }
471: return -1;
472: }
473:
1.14 millert 474: int word(char *w)
1.1 kstailey 475: {
1.18 millert 476: const Keyword *kp;
1.1 kstailey 477: int c, n;
478:
479: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
480: if (n != -1) { /* found in table */
1.14 millert 481: kp = keywords + n;
1.1 kstailey 482: yylval.i = kp->sub;
483: switch (kp->type) { /* special handling */
1.11 millert 484: case BLTIN:
485: if (kp->sub == FSYSTEM && safe)
1.4 millert 486: SYNTAX( "system is unsafe" );
1.1 kstailey 487: RET(kp->type);
488: case FUNC:
489: if (infunc)
1.4 millert 490: SYNTAX( "illegal nested function" );
1.1 kstailey 491: RET(kp->type);
492: case RETURN:
493: if (!infunc)
1.4 millert 494: SYNTAX( "return not in function" );
1.1 kstailey 495: RET(kp->type);
496: case VARNF:
497: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
498: RET(VARNF);
499: default:
500: RET(kp->type);
501: }
502: }
503: c = peek(); /* look for '(' */
504: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
505: yylval.i = n;
506: RET(ARG);
507: } else {
508: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
509: if (c == '(') {
510: RET(CALL);
511: } else {
512: RET(VAR);
513: }
514: }
515: }
516:
1.6 millert 517: void startreg(void) /* next call to yylex will return a regular expression */
1.1 kstailey 518: {
1.16 millert 519: reg = true;
1.1 kstailey 520: }
521:
1.3 millert 522: int regexpr(void)
1.1 kstailey 523: {
1.9 hugh 524: int c, openclass = 0;
1.14 millert 525: static char *buf = NULL;
1.1 kstailey 526: static int bufsz = 500;
527: char *bp;
528:
1.15 millert 529: if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1.4 millert 530: FATAL("out of space for rex expr");
1.1 kstailey 531: bp = buf;
1.9 hugh 532: for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
1.11 millert 533: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
1.4 millert 534: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 535: if (c == '\n') {
1.13 millert 536: *bp = '\0';
1.14 millert 537: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 538: unput('\n');
539: break;
540: } else if (c == '\\') {
1.14 millert 541: *bp++ = '\\';
1.1 kstailey 542: *bp++ = input();
543: } else {
1.9 hugh 544: if (c == '[')
545: openclass = 1;
546: else if (c == ']')
547: openclass = 0;
1.1 kstailey 548: *bp++ = c;
549: }
550: }
551: *bp = 0;
1.8 millert 552: if (c == 0)
553: SYNTAX("non-terminated regular expression %.10s...", buf);
1.1 kstailey 554: yylval.s = tostring(buf);
555: unput('/');
556: RET(REGEXPR);
557: }
558:
559: /* low-level lexical stuff, sort of inherited from lex */
560:
/* Ring of the most recently read characters: input() stores, unput() steps back. */
char	ebuf[300], *ep = ebuf;

/* Pushback stack: unput() pushes at yysptr, input() pops from it first. */
char	yysbuf[100], *yysptr = yysbuf;

/* NOTE(review): not referenced by the code in this file; presumably the
 * program file that pgetc() reads from -- confirm against its definition. */
FILE	*yyin = NULL;
1.1 kstailey 566:
567: int input(void) /* get next lexical input character */
568: {
569: int c;
570: extern char *lexprog;
571:
572: if (yysptr > yysbuf)
1.8 millert 573: c = (uschar)*--yysptr;
1.1 kstailey 574: else if (lexprog != NULL) { /* awk '...' */
1.8 millert 575: if ((c = (uschar)*lexprog) != 0)
1.1 kstailey 576: lexprog++;
577: } else /* awk -f ... */
578: c = pgetc();
1.13 millert 579: if (c == EOF)
1.1 kstailey 580: c = 0;
581: if (ep >= ebuf + sizeof ebuf)
582: ep = ebuf;
1.13 millert 583: *ep = c;
584: if (c != 0) {
585: ep++;
586: }
587: return (c);
1.1 kstailey 588: }
589:
590: void unput(int c) /* put lexical character back on input */
591: {
1.18 millert 592: if (c == '\n')
593: lineno--;
1.1 kstailey 594: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 595: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 596: *yysptr++ = c;
597: if (--ep < ebuf)
598: ep = ebuf + sizeof(ebuf) - 1;
599: }
600:
/*
 * unputstr: push the string s back on the input so that subsequent
 * input() calls return its characters in their original order.
 * Characters are pushed last-to-first because the pushback buffer is
 * a stack.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	size_t i;

	/* count down in size_t: no narrowing of strlen() and no -1 index */
	for (i = strlen(s); i > 0; i--)
		unput(s[i-1]);
}