Annotation of src/usr.bin/awk/lex.c, Revision 1.5
1.5 ! millert 1: /* $OpenBSD: lex.c,v 1.4 1999/12/08 23:09:45 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
31: #include "ytab.h"
32:
33: extern YYSTYPE yylval;
34: extern int infunc;
35:
36: int lineno = 1;
37: int bracecnt = 0;
38: int brackcnt = 0;
39: int parencnt = 0;
40:
/* One entry of the reserved-word table consulted by word(). */
typedef struct Keyword {
	char	*word;		/* spelling of the reserved word */
	int	sub;		/* value stored in yylval.i when the word is matched */
	int	type;		/* token type handed back to the parser */
} Keyword;
46:
47: Keyword keywords[] ={ /* keep sorted: binary searched */
48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
51: { "atan2", FATAN, BLTIN },
52: { "break", BREAK, BREAK },
53: { "close", CLOSE, CLOSE },
54: { "continue", CONTINUE, CONTINUE },
55: { "cos", FCOS, BLTIN },
56: { "delete", DELETE, DELETE },
57: { "do", DO, DO },
58: { "else", ELSE, ELSE },
59: { "exit", EXIT, EXIT },
60: { "exp", FEXP, BLTIN },
61: { "fflush", FFLUSH, BLTIN },
62: { "for", FOR, FOR },
63: { "func", FUNC, FUNC },
64: { "function", FUNC, FUNC },
65: { "getline", GETLINE, GETLINE },
66: { "gsub", GSUB, GSUB },
67: { "if", IF, IF },
68: { "in", IN, IN },
69: { "index", INDEX, INDEX },
70: { "int", FINT, BLTIN },
71: { "length", FLENGTH, BLTIN },
72: { "log", FLOG, BLTIN },
73: { "match", MATCHFCN, MATCHFCN },
74: { "next", NEXT, NEXT },
75: { "nextfile", NEXTFILE, NEXTFILE },
76: { "print", PRINT, PRINT },
77: { "printf", PRINTF, PRINTF },
78: { "rand", FRAND, BLTIN },
79: { "return", RETURN, RETURN },
80: { "sin", FSIN, BLTIN },
81: { "split", SPLIT, SPLIT },
82: { "sprintf", SPRINTF, SPRINTF },
83: { "sqrt", FSQRT, BLTIN },
84: { "srand", FSRAND, BLTIN },
85: { "sub", SUB, SUB },
86: { "substr", SUBSTR, SUBSTR },
87: { "system", FSYSTEM, BLTIN },
88: { "tolower", FTOLOWER, BLTIN },
89: { "toupper", FTOUPPER, BLTIN },
90: { "while", WHILE, WHILE },
91: };
92:
/*
 * RET(x): return token x from the current function.  When DEBUG is
 * defined and dbg is nonzero, the token name is printed first.
 *
 * The debug form is wrapped in do { } while (0) so that "RET(x);" is a
 * single statement (safe in an unbraced if/else), and x is evaluated
 * exactly once.
 */
#define	DEBUG
#ifdef	DEBUG
#define	RET(x)	do { \
		int ret_tok_ = (x); \
		if (dbg) \
			printf("lex %s\n", tokname(ret_tok_)); \
		return ret_tok_; \
	} while (0)
#else
#define	RET(x)	return (x)
#endif
99:
/* Return the next input character without consuming it. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back so the next input() sees it again */
	return next;
}
106:
/*
 * gettok: read the next raw token into the caller's buffer.
 *
 * pbuf/psz point at the caller's buffer and its size; both are updated
 * if the buffer has to be enlarged via adjbuf().
 *
 * Returns 0 at end of input.  For a character that cannot start a name
 * or a number, returns that character (the buffer then holds just that
 * character).  Otherwise collects a name ([A-Za-z_][A-Za-z0-9_]*) or the
 * longest prefix strtod() accepts as a number, leaves its text in the
 * buffer and returns the first character of that text.
 *
 * NOTE(review): when strtod() accepts nothing (text starting with '.'
 * that is not a number), the whole text is pushed back, the buffer
 * becomes empty and 0 is returned, which yylex() treats as end of input.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* <ctype.h> functions are only defined for unsigned char values and EOF */
	if (!isalnum((unsigned char) c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char) c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for c AND the terminating NUL written after the loop */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char) c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
	} else {	/* it's a number */
		char *rem;

		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep room for c AND the terminating NUL written after the loop */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char) c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		unputstr(rem);		/* put rest back for later */
		rem[0] = 0;		/* truncate buf after the numeric part */
	}
	*pbuf = buf;
	*psz = sz;
	return buf[0];
}
161:
/* Helpers used by yylex() and defined further down. */
int	word(char *);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
167:
1.3 millert 168: int yylex(void)
1.1 kstailey 169: {
1.3 millert 170: int c;
1.1 kstailey 171: static char *buf = 0;
172: static int bufsize = 500;
173:
174: if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
1.4 millert 175: FATAL( "out of space in yylex" );
1.1 kstailey 176: if (sc) {
177: sc = 0;
178: RET('}');
179: }
180: if (reg) {
181: reg = 0;
182: return regexpr();
183: }
184: for (;;) {
185: c = gettok(&buf, &bufsize);
186: if (c == 0)
187: return 0;
188: if (isalpha(c) || c == '_')
189: return word(buf);
190: if (isdigit(c) || c == '.') {
191: yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
192: /* should this also have STR set? */
193: RET(NUMBER);
194: }
195:
196: yylval.i = c;
197: switch (c) {
198: case '\n': /* {EOL} */
199: RET(NL);
200: case '\r': /* assume \n is coming */
201: case ' ': /* {WS}+ */
202: case '\t':
203: break;
204: case '#': /* #.* strip comments */
205: while ((c = input()) != '\n' && c != 0)
206: ;
207: unput(c);
208: break;
209: case ';':
210: RET(';');
211: case '\\':
212: if (peek() == '\n') {
1.3 millert 213: input();
1.1 kstailey 214: } else if (peek() == '\r') {
215: input(); input(); /* \n */
216: lineno++;
217: } else {
218: RET(c);
219: }
220: break;
221: case '&':
222: if (peek() == '&') {
223: input(); RET(AND);
224: } else
225: RET('&');
226: case '|':
227: if (peek() == '|') {
228: input(); RET(BOR);
229: } else
230: RET('|');
231: case '!':
232: if (peek() == '=') {
233: input(); yylval.i = NE; RET(NE);
234: } else if (peek() == '~') {
235: input(); yylval.i = NOTMATCH; RET(MATCHOP);
236: } else
237: RET(NOT);
238: case '~':
239: yylval.i = MATCH;
240: RET(MATCHOP);
241: case '<':
242: if (peek() == '=') {
243: input(); yylval.i = LE; RET(LE);
244: } else {
245: yylval.i = LT; RET(LT);
246: }
247: case '=':
248: if (peek() == '=') {
249: input(); yylval.i = EQ; RET(EQ);
250: } else {
251: yylval.i = ASSIGN; RET(ASGNOP);
252: }
253: case '>':
254: if (peek() == '=') {
255: input(); yylval.i = GE; RET(GE);
256: } else if (peek() == '>') {
257: input(); yylval.i = APPEND; RET(APPEND);
258: } else {
259: yylval.i = GT; RET(GT);
260: }
261: case '+':
262: if (peek() == '+') {
263: input(); yylval.i = INCR; RET(INCR);
264: } else if (peek() == '=') {
265: input(); yylval.i = ADDEQ; RET(ASGNOP);
266: } else
267: RET('+');
268: case '-':
269: if (peek() == '-') {
270: input(); yylval.i = DECR; RET(DECR);
271: } else if (peek() == '=') {
272: input(); yylval.i = SUBEQ; RET(ASGNOP);
273: } else
274: RET('-');
275: case '*':
276: if (peek() == '=') { /* *= */
277: input(); yylval.i = MULTEQ; RET(ASGNOP);
278: } else if (peek() == '*') { /* ** or **= */
279: input(); /* eat 2nd * */
280: if (peek() == '=') {
281: input(); yylval.i = POWEQ; RET(ASGNOP);
282: } else {
283: RET(POWER);
284: }
285: } else
286: RET('*');
287: case '/':
1.3 millert 288: RET('/');
1.1 kstailey 289: case '%':
290: if (peek() == '=') {
291: input(); yylval.i = MODEQ; RET(ASGNOP);
292: } else
293: RET('%');
294: case '^':
295: if (peek() == '=') {
296: input(); yylval.i = POWEQ; RET(ASGNOP);
297: } else
298: RET(POWER);
1.5 ! millert 299:
1.1 kstailey 300: case '$':
301: /* BUG: awkward, if not wrong */
302: c = gettok(&buf, &bufsize);
1.5 ! millert 303: if (isalpha(c)) {
1.1 kstailey 304: if (strcmp(buf, "NF") == 0) { /* very special */
305: unputstr("(NF)");
1.5 ! millert 306: RET(INDIRECT);
! 307: }
! 308: c = peek();
! 309: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
! 310: unputstr(buf);
1.1 kstailey 311: RET(INDIRECT);
312: }
313: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
314: RET(IVAR);
315: } else {
316: unputstr(buf);
317: RET(INDIRECT);
318: }
319:
320: case '}':
321: if (--bracecnt < 0)
1.4 millert 322: SYNTAX( "extra }" );
1.1 kstailey 323: sc = 1;
324: RET(';');
325: case ']':
326: if (--brackcnt < 0)
1.4 millert 327: SYNTAX( "extra ]" );
1.1 kstailey 328: RET(']');
329: case ')':
330: if (--parencnt < 0)
1.4 millert 331: SYNTAX( "extra )" );
1.1 kstailey 332: RET(')');
333: case '{':
334: bracecnt++;
335: RET('{');
336: case '[':
337: brackcnt++;
338: RET('[');
339: case '(':
340: parencnt++;
341: RET('(');
342:
343: case '"':
344: return string(); /* BUG: should be like tran.c ? */
345:
346: default:
347: RET(c);
348: }
349: }
350: }
351:
1.3 millert 352: int string(void)
1.1 kstailey 353: {
354: int c, n;
355: char *s, *bp;
356: static char *buf = 0;
357: static int bufsz = 500;
358:
359: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 360: FATAL("out of space for strings");
1.1 kstailey 361: for (bp = buf; (c = input()) != '"'; ) {
362: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
1.4 millert 363: FATAL("out of space for string %.10s...", buf);
1.1 kstailey 364: switch (c) {
365: case '\n':
366: case '\r':
367: case 0:
1.4 millert 368: SYNTAX( "non-terminated string %.10s...", buf );
1.1 kstailey 369: lineno++;
370: break;
371: case '\\':
372: c = input();
373: switch (c) {
374: case '"': *bp++ = '"'; break;
375: case 'n': *bp++ = '\n'; break;
376: case 't': *bp++ = '\t'; break;
377: case 'f': *bp++ = '\f'; break;
378: case 'r': *bp++ = '\r'; break;
379: case 'b': *bp++ = '\b'; break;
380: case 'v': *bp++ = '\v'; break;
1.3 millert 381: case 'a': *bp++ = '\007'; break;
1.1 kstailey 382: case '\\': *bp++ = '\\'; break;
383:
384: case '0': case '1': case '2': /* octal: \d \dd \ddd */
385: case '3': case '4': case '5': case '6': case '7':
386: n = c - '0';
387: if ((c = peek()) >= '0' && c < '8') {
388: n = 8 * n + input() - '0';
389: if ((c = peek()) >= '0' && c < '8')
390: n = 8 * n + input() - '0';
391: }
392: *bp++ = n;
393: break;
394:
395: case 'x': /* hex \x0-9a-fA-F + */
396: { char xbuf[100], *px;
397: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
398: if (isdigit(c)
399: || (c >= 'a' && c <= 'f')
400: || (c >= 'A' && c <= 'F'))
401: *px++ = c;
402: else
403: break;
404: }
405: *px = 0;
406: unput(c);
407: sscanf(xbuf, "%x", &n);
408: *bp++ = n;
409: break;
410: }
411:
412: default:
413: *bp++ = c;
414: break;
415: }
416: break;
417: default:
418: *bp++ = c;
419: break;
420: }
421: }
422: *bp = 0;
423: s = tostring(buf);
424: *bp++ = ' '; *bp++ = 0;
425: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
426: RET(STRING);
427: }
428:
429:
430: int binsearch(char *w, Keyword *kp, int n)
431: {
432: int cond, low, mid, high;
433:
434: low = 0;
435: high = n - 1;
436: while (low <= high) {
437: mid = (low + high) / 2;
438: if ((cond = strcmp(w, kp[mid].word)) < 0)
439: high = mid - 1;
440: else if (cond > 0)
441: low = mid + 1;
442: else
443: return mid;
444: }
445: return -1;
446: }
447:
448: int word(char *w)
449: {
450: Keyword *kp;
451: int c, n;
452:
453: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
454: kp = keywords + n;
455: if (n != -1) { /* found in table */
456: yylval.i = kp->sub;
457: switch (kp->type) { /* special handling */
458: case FSYSTEM:
459: if (safe)
1.4 millert 460: SYNTAX( "system is unsafe" );
1.1 kstailey 461: RET(kp->type);
462: case FUNC:
463: if (infunc)
1.4 millert 464: SYNTAX( "illegal nested function" );
1.1 kstailey 465: RET(kp->type);
466: case RETURN:
467: if (!infunc)
1.4 millert 468: SYNTAX( "return not in function" );
1.1 kstailey 469: RET(kp->type);
470: case VARNF:
471: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
472: RET(VARNF);
473: default:
474: RET(kp->type);
475: }
476: }
477: c = peek(); /* look for '(' */
478: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
479: yylval.i = n;
480: RET(ARG);
481: } else {
482: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
483: if (c == '(') {
484: RET(CALL);
485: } else {
486: RET(VAR);
487: }
488: }
489: }
490:
491: void startreg(void) /* next call to yyles will return a regular expression */
492: {
493: reg = 1;
494: }
495:
1.3 millert 496: int regexpr(void)
1.1 kstailey 497: {
498: int c;
499: static char *buf = 0;
500: static int bufsz = 500;
501: char *bp;
502:
503: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
1.4 millert 504: FATAL("out of space for rex expr");
1.1 kstailey 505: bp = buf;
506: for ( ; (c = input()) != '/' && c != 0; ) {
507: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
1.4 millert 508: FATAL("out of space for reg expr %.10s...", buf);
1.1 kstailey 509: if (c == '\n') {
1.4 millert 510: SYNTAX( "newline in regular expression %.10s...", buf );
1.1 kstailey 511: unput('\n');
512: break;
513: } else if (c == '\\') {
514: *bp++ = '\\';
515: *bp++ = input();
516: } else {
517: *bp++ = c;
518: }
519: }
520: *bp = 0;
521: yylval.s = tostring(buf);
522: unput('/');
523: RET(REGEXPR);
524: }
525:
526: /* low-level lexical stuff, sort of inherited from lex */
527:
char	ebuf[300];		/* circular record of the characters input() has returned */
char	*ep = ebuf;		/* where input() stores the next character in ebuf */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = 0;
533:
534: int input(void) /* get next lexical input character */
535: {
536: int c;
537: extern char *lexprog;
538:
539: if (yysptr > yysbuf)
540: c = *--yysptr;
541: else if (lexprog != NULL) { /* awk '...' */
542: if ((c = *lexprog) != 0)
543: lexprog++;
544: } else /* awk -f ... */
545: c = pgetc();
546: if (c == '\n')
547: lineno++;
548: else if (c == EOF)
549: c = 0;
550: if (ep >= ebuf + sizeof ebuf)
551: ep = ebuf;
552: return *ep++ = c;
553: }
554:
555: void unput(int c) /* put lexical character back on input */
556: {
557: if (c == '\n')
558: lineno--;
559: if (yysptr >= yysbuf + sizeof(yysbuf))
1.4 millert 560: FATAL("pushed back too much: %.20s...", yysbuf);
1.1 kstailey 561: *yysptr++ = c;
562: if (--ep < ebuf)
563: ep = ebuf + sizeof(ebuf) - 1;
564: }
565:
/*
 * unputstr: push the whole string s back onto the input.  Characters
 * are pushed last-to-first so that input() returns them in their
 * original order.
 */
void unputstr(char *s)	/* put a string back on input */
{
	char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
572: }