Annotation of src/usr.bin/awk/lex.c, Revision 1.7
1.7 ! deraadt 1: /* $OpenBSD: lex.c,v 1.6 2002/12/19 21:24:28 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
31: #include "ytab.h"
32:
extern YYSTYPE	yylval;	/* defined elsewhere; this file stores each token's value in it */
extern int	infunc;	/* defined elsewhere; tested here as "inside a function definition" -- confirm with the grammar */

int	lineno	= 1;	/* current input line: input() adds one per '\n', unput() takes it back */
int	bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" when it goes negative */
int	brackcnt  = 0;	/* same for '[' and ']' */
int	parencnt = 0;	/* same for '(' and ')' */
40:
/* One entry of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling, compared with strcmp() in binsearch() */
	int	sub;		/* copied into yylval.i by word() when the name matches */
	int	type;		/* value word() switches on and returns through RET() */
} Keyword;
46:
/*
 * Reserved words and built-in function names.
 * Columns are { word, sub, type } as described for struct Keyword.
 * binsearch() compares with strcmp(), so entries must stay in strcmp()
 * order (upper-case names sort before lower-case ones).
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
};
92:
/*
 * RET(x): return token x from the enclosing function.
 *
 * DEBUG is defined unconditionally just below, so the tracing form is the
 * one in use: when dbg is nonzero the token's name (from tokname()) is
 * printed before returning.
 *
 * The tracing form is wrapped in do { ... } while (0) so that "RET(x);" is
 * exactly one statement (safe in an unbraced if/else), and x is evaluated
 * only once.
 */
#define	DEBUG
#ifdef	DEBUG
#define	RET(x)	do { \
	int ret_tok_ = (x); \
	if (dbg) \
		printf("lex %s\n", tokname(ret_tok_)); \
	return ret_tok_; \
} while (0)
#else
#define	RET(x)	return(x)
#endif
1.7 ! deraadt 99:
! 100: int peek(void);
! 101: int gettok(char **, int *);
! 102: int binsearch(char *, Keyword *, int);
1.1 kstailey 103:
/* peek: return the next input character without consuming it. */
int peek(void)
{
	const int next = input();

	unput(next);	/* hand it straight back so the next input() sees it again */
	return next;
}
110:
/*
 * gettok: read the next raw token from the input into *pbuf.
 *
 * pbuf/psz are the caller's buffer and its size.  The buffer may be
 * enlarged through adjbuf(); the (possibly new) pointer and size are
 * stored back through pbuf/psz on the name and number paths.
 *
 * Returns
 *   0    at end of input;
 *   'a'  when a name ([A-Za-z_][A-Za-z0-9_]*) was read into the buffer;
 *   '0'  when a number was read into the buffer;
 *   otherwise the single character that was read, which is also left in
 *   the buffer as a one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Reserve room for this character AND the NUL that is
			 * stored after the loop; otherwise end of input with a
			 * full buffer would write one byte past the end.
			 * (Assumes adjbuf() grows the buffer to at least the
			 * requested length -- adjbuf is defined elsewhere.)
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character plus NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Return the first character as the token and push
			 * back only what follows it.  Pushing back the whole
			 * buffer would make the lexer read that same first
			 * character again on every call.
			 */
			unputstr(rem+1);	/* must precede the truncation below */
			buf[1] = 0;		/* so return one character as token */
			retc = buf[0];		/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);		/* put rest back for later */
			rem[0] = 0;		/* so truncate where failure started */
			retc = '0';		/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
172:
int	word(char *w);	/* classify a name read by gettok() */
int	string(void);	/* lex the remainder of a "..." literal */
int	regexpr(void);	/* lex the body of a /.../ regular expression */

int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
178:
1.3 millert 179: int yylex(void)
1.1 kstailey 180: {
1.3 millert 181: int c;
1.1 kstailey 182: static char *buf = 0;
183: static int bufsize = 500;
184:
185: if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
1.4 millert 186: FATAL( "out of space in yylex" );
1.1 kstailey 187: if (sc) {
188: sc = 0;
189: RET('}');
190: }
191: if (reg) {
192: reg = 0;
193: return regexpr();
194: }
195: for (;;) {
196: c = gettok(&buf, &bufsize);
197: if (c == 0)
198: return 0;
199: if (isalpha(c) || c == '_')
200: return word(buf);
1.6 millert 201: if (isdigit(c)) {
1.1 kstailey 202: yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
203: /* should this also have STR set? */
204: RET(NUMBER);
205: }
206:
207: yylval.i = c;
208: switch (c) {
209: case '\n': /* {EOL} */
210: RET(NL);
211: case '\r': /* assume \n is coming */
212: case ' ': /* {WS}+ */
213: case '\t':
214: break;
215: case '#': /* #.* strip comments */
216: while ((c = input()) != '\n' && c != 0)
217: ;
218: unput(c);
219: break;
220: case ';':
221: RET(';');
222: case '\\':
223: if (peek() == '\n') {
1.3 millert 224: input();
1.1 kstailey 225: } else if (peek() == '\r') {
226: input(); input(); /* \n */
227: lineno++;
228: } else {
229: RET(c);
230: }
231: break;
232: case '&':
233: if (peek() == '&') {
234: input(); RET(AND);
235: } else
236: RET('&');
237: case '|':
238: if (peek() == '|') {
239: input(); RET(BOR);
240: } else
241: RET('|');
242: case '!':
243: if (peek() == '=') {
244: input(); yylval.i = NE; RET(NE);
245: } else if (peek() == '~') {
246: input(); yylval.i = NOTMATCH; RET(MATCHOP);
247: } else
248: RET(NOT);
249: case '~':
250: yylval.i = MATCH;
251: RET(MATCHOP);
252: case '<':
253: if (peek() == '=') {
254: input(); yylval.i = LE; RET(LE);
255: } else {
256: yylval.i = LT; RET(LT);
257: }
258: case '=':
259: if (peek() == '=') {
260: input(); yylval.i = EQ; RET(EQ);
261: } else {
262: yylval.i = ASSIGN; RET(ASGNOP);
263: }
264: case '>':
265: if (peek() == '=') {
266: input(); yylval.i = GE; RET(GE);
267: } else if (peek() == '>') {
268: input(); yylval.i = APPEND; RET(APPEND);
269: } else {
270: yylval.i = GT; RET(GT);
271: }
272: case '+':
273: if (peek() == '+') {
274: input(); yylval.i = INCR; RET(INCR);
275: } else if (peek() == '=') {
276: input(); yylval.i = ADDEQ; RET(ASGNOP);
277: } else
278: RET('+');
279: case '-':
280: if (peek() == '-') {
281: input(); yylval.i = DECR; RET(DECR);
282: } else if (peek() == '=') {
283: input(); yylval.i = SUBEQ; RET(ASGNOP);
284: } else
285: RET('-');
286: case '*':
287: if (peek() == '=') { /* *= */
288: input(); yylval.i = MULTEQ; RET(ASGNOP);
289: } else if (peek() == '*') { /* ** or **= */
290: input(); /* eat 2nd * */
291: if (peek() == '=') {
292: input(); yylval.i = POWEQ; RET(ASGNOP);
293: } else {
294: RET(POWER);
295: }
296: } else
297: RET('*');
298: case '/':
1.3 millert 299: RET('/');
1.1 kstailey 300: case '%':
301: if (peek() == '=') {
302: input(); yylval.i = MODEQ; RET(ASGNOP);
303: } else
304: RET('%');
305: case '^':
306: if (peek() == '=') {
307: input(); yylval.i = POWEQ; RET(ASGNOP);
308: } else
309: RET(POWER);
1.5 millert 310:
1.1 kstailey 311: case '$':
312: /* BUG: awkward, if not wrong */
313: c = gettok(&buf, &bufsize);
1.5 millert 314: if (isalpha(c)) {
1.1 kstailey 315: if (strcmp(buf, "NF") == 0) { /* very special */
316: unputstr("(NF)");
1.5 millert 317: RET(INDIRECT);
318: }
319: c = peek();
320: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321: unputstr(buf);
1.1 kstailey 322: RET(INDIRECT);
323: }
324: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325: RET(IVAR);
1.6 millert 326: } else if (c == 0) { /* */
327: SYNTAX( "unexpected end of input after $" );
328: RET(';');
1.1 kstailey 329: } else {
330: unputstr(buf);
331: RET(INDIRECT);
332: }
333:
334: case '}':
335: if (--bracecnt < 0)
1.4 millert 336: SYNTAX( "extra }" );
1.1 kstailey 337: sc = 1;
338: RET(';');
339: case ']':
340: if (--brackcnt < 0)
1.4 millert 341: SYNTAX( "extra ]" );
1.1 kstailey 342: RET(']');
343: case ')':
344: if (--parencnt < 0)
1.4 millert 345: SYNTAX( "extra )" );
1.1 kstailey 346: RET(')');
347: case '{':
348: bracecnt++;
349: RET('{');
350: case '[':
351: brackcnt++;
352: RET('[');
353: case '(':
354: parencnt++;
355: RET('(');
356:
357: case '"':
358: return string(); /* BUG: should be like tran.c ? */
359:
360: default:
361: RET(c);
362: }
363: }
364: }
365:
1.3 millert 366: int string(void)
1.1 kstailey 367: {
368: int c, n;
369: char *s, *bp;
370: static char *buf = 0;
371: static int bufsz = 500;
372:
373: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 374: FATAL("out of space for strings");
1.1 kstailey 375: for (bp = buf; (c = input()) != '"'; ) {
376: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
1.4 millert 377: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 378: switch (c) {
379: case '\n':
380: case '\r':
381: case 0:
1.4 millert 382: SYNTAX( "non-terminated string %.10s...", buf );
1.1 kstailey 383: lineno++;
1.6 millert 384: if (c == 0) /* hopeless */
385: FATAL( "giving up" );
1.1 kstailey 386: break;
387: case '\\':
388: c = input();
389: switch (c) {
390: case '"': *bp++ = '"'; break;
391: case 'n': *bp++ = '\n'; break;
392: case 't': *bp++ = '\t'; break;
393: case 'f': *bp++ = '\f'; break;
394: case 'r': *bp++ = '\r'; break;
395: case 'b': *bp++ = '\b'; break;
396: case 'v': *bp++ = '\v'; break;
1.3 millert 397: case 'a': *bp++ = '\007'; break;
1.1 kstailey 398: case '\\': *bp++ = '\\'; break;
399:
400: case '0': case '1': case '2': /* octal: \d \dd \ddd */
401: case '3': case '4': case '5': case '6': case '7':
402: n = c - '0';
403: if ((c = peek()) >= '0' && c < '8') {
404: n = 8 * n + input() - '0';
405: if ((c = peek()) >= '0' && c < '8')
406: n = 8 * n + input() - '0';
407: }
408: *bp++ = n;
409: break;
410:
411: case 'x': /* hex \x0-9a-fA-F + */
412: { char xbuf[100], *px;
413: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
414: if (isdigit(c)
415: || (c >= 'a' && c <= 'f')
416: || (c >= 'A' && c <= 'F'))
417: *px++ = c;
418: else
419: break;
420: }
421: *px = 0;
422: unput(c);
423: sscanf(xbuf, "%x", &n);
424: *bp++ = n;
425: break;
426: }
427:
428: default:
429: *bp++ = c;
430: break;
431: }
432: break;
433: default:
434: *bp++ = c;
435: break;
436: }
437: }
438: *bp = 0;
439: s = tostring(buf);
440: *bp++ = ' '; *bp++ = 0;
441: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
442: RET(STRING);
443: }
444:
445:
446: int binsearch(char *w, Keyword *kp, int n)
447: {
448: int cond, low, mid, high;
449:
450: low = 0;
451: high = n - 1;
452: while (low <= high) {
453: mid = (low + high) / 2;
454: if ((cond = strcmp(w, kp[mid].word)) < 0)
455: high = mid - 1;
456: else if (cond > 0)
457: low = mid + 1;
458: else
459: return mid;
460: }
461: return -1;
462: }
463:
464: int word(char *w)
465: {
466: Keyword *kp;
467: int c, n;
468:
469: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
470: kp = keywords + n;
471: if (n != -1) { /* found in table */
472: yylval.i = kp->sub;
473: switch (kp->type) { /* special handling */
474: case FSYSTEM:
475: if (safe)
1.4 millert 476: SYNTAX( "system is unsafe" );
1.1 kstailey 477: RET(kp->type);
478: case FUNC:
479: if (infunc)
1.4 millert 480: SYNTAX( "illegal nested function" );
1.1 kstailey 481: RET(kp->type);
482: case RETURN:
483: if (!infunc)
1.4 millert 484: SYNTAX( "return not in function" );
1.1 kstailey 485: RET(kp->type);
486: case VARNF:
487: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
488: RET(VARNF);
489: default:
490: RET(kp->type);
491: }
492: }
493: c = peek(); /* look for '(' */
494: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
495: yylval.i = n;
496: RET(ARG);
497: } else {
498: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
499: if (c == '(') {
500: RET(CALL);
501: } else {
502: RET(VAR);
503: }
504: }
505: }
506:
1.6 millert 507: void startreg(void) /* next call to yylex will return a regular expression */
1.1 kstailey 508: {
509: reg = 1;
510: }
511:
1.3 millert 512: int regexpr(void)
1.1 kstailey 513: {
514: int c;
515: static char *buf = 0;
516: static int bufsz = 500;
517: char *bp;
518:
519: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 520: FATAL("out of space for rex expr");
1.1 kstailey 521: bp = buf;
522: for ( ; (c = input()) != '/' && c != 0; ) {
523: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
1.4 millert 524: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 525: if (c == '\n') {
1.4 millert 526: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 527: unput('\n');
528: break;
529: } else if (c == '\\') {
530: *bp++ = '\\';
531: *bp++ = input();
532: } else {
533: *bp++ = c;
534: }
535: }
536: *bp = 0;
537: yylval.s = tostring(buf);
538: unput('/');
539: RET(REGEXPR);
540: }
541:
542: /* low-level lexical stuff, sort of inherited from lex */
543:
char	ebuf[300];	/* circular record of the characters most recently returned by input() */
char	*ep = ebuf;	/* next slot of ebuf to be written; wraps in input(), stepped back in unput() */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the newest pushed-back char; equal to yysbuf when empty */
FILE	*yyin = 0;	/* not referenced anywhere else in this file */
549:
550: int input(void) /* get next lexical input character */
551: {
552: int c;
553: extern char *lexprog;
554:
555: if (yysptr > yysbuf)
556: c = *--yysptr;
557: else if (lexprog != NULL) { /* awk '...' */
558: if ((c = *lexprog) != 0)
559: lexprog++;
560: } else /* awk -f ... */
561: c = pgetc();
562: if (c == '\n')
563: lineno++;
564: else if (c == EOF)
565: c = 0;
566: if (ep >= ebuf + sizeof ebuf)
567: ep = ebuf;
568: return *ep++ = c;
569: }
570:
571: void unput(int c) /* put lexical character back on input */
572: {
573: if (c == '\n')
574: lineno--;
575: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 576: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 577: *yysptr++ = c;
578: if (--ep < ebuf)
579: ep = ebuf + sizeof(ebuf) - 1;
580: }
581:
/*
 * unputstr: push the whole string s back onto the input.  Characters are
 * pushed last-to-first so that input() returns them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}