Annotation of src/usr.bin/awk/lex.c, Revision 1.3
1.3 ! millert 1: /* $OpenBSD: lex.c,v 1.2 1999/04/18 17:06:30 millert Exp $ */
1.1 kstailey 2: /****************************************************************
3: Copyright (C) Lucent Technologies 1997
4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
24: ****************************************************************/
25:
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <ctype.h>
30: #include "awk.h"
31: #include "ytab.h"
32:
33: extern YYSTYPE yylval;
34: extern int infunc;
35:
36: int lineno = 1;
37: int bracecnt = 0;
38: int brackcnt = 0;
39: int parencnt = 0;
40:
/* One row of the reserved-word table that binsearch() looks through. */
typedef struct Keyword {
	char	*word;	/* spelling compared with strcmp() */
	int	sub;	/* value word() stores into yylval.i on a match */
	int	type;	/* value word() switches on and returns */
} Keyword;
46:
47: Keyword keywords[] ={ /* keep sorted: binary searched */
48: { "BEGIN", XBEGIN, XBEGIN },
49: { "END", XEND, XEND },
50: { "NF", VARNF, VARNF },
51: { "atan2", FATAN, BLTIN },
52: { "break", BREAK, BREAK },
53: { "close", CLOSE, CLOSE },
54: { "continue", CONTINUE, CONTINUE },
55: { "cos", FCOS, BLTIN },
56: { "delete", DELETE, DELETE },
57: { "do", DO, DO },
58: { "else", ELSE, ELSE },
59: { "exit", EXIT, EXIT },
60: { "exp", FEXP, BLTIN },
61: { "fflush", FFLUSH, BLTIN },
62: { "for", FOR, FOR },
63: { "func", FUNC, FUNC },
64: { "function", FUNC, FUNC },
65: { "getline", GETLINE, GETLINE },
66: { "gsub", GSUB, GSUB },
67: { "if", IF, IF },
68: { "in", IN, IN },
69: { "index", INDEX, INDEX },
70: { "int", FINT, BLTIN },
71: { "length", FLENGTH, BLTIN },
72: { "log", FLOG, BLTIN },
73: { "match", MATCHFCN, MATCHFCN },
74: { "next", NEXT, NEXT },
75: { "nextfile", NEXTFILE, NEXTFILE },
76: { "print", PRINT, PRINT },
77: { "printf", PRINTF, PRINTF },
78: { "rand", FRAND, BLTIN },
79: { "return", RETURN, RETURN },
80: { "sin", FSIN, BLTIN },
81: { "split", SPLIT, SPLIT },
82: { "sprintf", SPRINTF, SPRINTF },
83: { "sqrt", FSQRT, BLTIN },
84: { "srand", FSRAND, BLTIN },
85: { "sub", SUB, SUB },
86: { "substr", SUBSTR, SUBSTR },
87: { "system", FSYSTEM, BLTIN },
88: { "tolower", FTOLOWER, BLTIN },
89: { "toupper", FTOUPPER, BLTIN },
90: { "while", WHILE, WHILE },
91: };
92:
/*
 * RET(x) returns token x from the enclosing function.  DEBUG is defined
 * right here, so the tracing variant is the one that gets compiled: when
 * dbg is nonzero it prints the token's name (via tokname) before returning.
 */
#define	DEBUG
#ifdef	DEBUG
#define	RET(x)	{ if (dbg) printf("lex %s\n", tokname(x)); return(x); }
#else
#define	RET(x)	return(x)
#endif
99:
/*
 * peek - report the next input character without consuming it.
 * Reads one character with input() and immediately pushes it back
 * with unput(); returns that character (0 at end of input).
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
106:
107: int gettok(char **pbuf, int *psz) /* get next input token */
108: {
109: int c;
110: char *buf = *pbuf;
111: int sz = *psz;
112: char *bp = buf;
113:
114: c = input();
115: if (c == 0)
116: return 0;
117: buf[0] = c;
118: buf[1] = 0;
119: if (!isalnum(c) && c != '.' && c != '_')
120: return c;
121:
122: *bp++ = c;
123: if (isalpha(c) || c == '_') { /* it's a varname */
124: for ( ; (c = input()) != 0; ) {
125: if (bp-buf >= sz)
126: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
127: ERROR "out of space for name %.10s...", buf FATAL;
128: if (isalnum(c) || c == '_')
129: *bp++ = c;
130: else {
131: *bp = 0;
132: unput(c);
133: break;
134: }
135: }
136: } else { /* it's a number */
137: char *rem;
138: /* read input until can't be a number */
139: for ( ; (c = input()) != 0; ) {
140: if (bp-buf >= sz)
141: if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
142: ERROR "out of space for number %.10s...", buf FATAL;
143: if (isdigit(c) || c == 'e' || c == 'E'
144: || c == '.' || c == '+' || c == '-')
145: *bp++ = c;
146: else {
147: unput(c);
148: break;
149: }
150: }
1.2 millert 151: *bp = 0;
1.1 kstailey 152: strtod(buf, &rem); /* parse the number */
153: unputstr(rem); /* put rest back for later */
154: rem[0] = 0;
155: }
156: *pbuf = buf;
157: *psz = sz;
158: return buf[0];
159: }
160:
/* Helpers defined further down in this file. */
int word(char *);
int string(void);
int regexpr(void);

/* One-shot flags examined (and cleared) at the top of yylex(). */
int sc = 0;	/* 1 => return a } right now */
int reg = 0;	/* 1 => return a REGEXPR now */
166:
1.3 ! millert 167: int yylex(void)
1.1 kstailey 168: {
1.3 ! millert 169: int c;
1.1 kstailey 170: static char *buf = 0;
171: static int bufsize = 500;
172:
173: if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
174: ERROR "out of space in yylex" FATAL;
175: if (sc) {
176: sc = 0;
177: RET('}');
178: }
179: if (reg) {
180: reg = 0;
181: return regexpr();
182: }
183: for (;;) {
184: c = gettok(&buf, &bufsize);
185: if (c == 0)
186: return 0;
187: if (isalpha(c) || c == '_')
188: return word(buf);
189: if (isdigit(c) || c == '.') {
190: yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
191: /* should this also have STR set? */
192: RET(NUMBER);
193: }
194:
195: yylval.i = c;
196: switch (c) {
197: case '\n': /* {EOL} */
198: RET(NL);
199: case '\r': /* assume \n is coming */
200: case ' ': /* {WS}+ */
201: case '\t':
202: break;
203: case '#': /* #.* strip comments */
204: while ((c = input()) != '\n' && c != 0)
205: ;
206: unput(c);
207: break;
208: case ';':
209: RET(';');
210: case '\\':
211: if (peek() == '\n') {
1.3 ! millert 212: input();
1.1 kstailey 213: } else if (peek() == '\r') {
214: input(); input(); /* \n */
215: lineno++;
216: } else {
217: RET(c);
218: }
219: break;
220: case '&':
221: if (peek() == '&') {
222: input(); RET(AND);
223: } else
224: RET('&');
225: case '|':
226: if (peek() == '|') {
227: input(); RET(BOR);
228: } else
229: RET('|');
230: case '!':
231: if (peek() == '=') {
232: input(); yylval.i = NE; RET(NE);
233: } else if (peek() == '~') {
234: input(); yylval.i = NOTMATCH; RET(MATCHOP);
235: } else
236: RET(NOT);
237: case '~':
238: yylval.i = MATCH;
239: RET(MATCHOP);
240: case '<':
241: if (peek() == '=') {
242: input(); yylval.i = LE; RET(LE);
243: } else {
244: yylval.i = LT; RET(LT);
245: }
246: case '=':
247: if (peek() == '=') {
248: input(); yylval.i = EQ; RET(EQ);
249: } else {
250: yylval.i = ASSIGN; RET(ASGNOP);
251: }
252: case '>':
253: if (peek() == '=') {
254: input(); yylval.i = GE; RET(GE);
255: } else if (peek() == '>') {
256: input(); yylval.i = APPEND; RET(APPEND);
257: } else {
258: yylval.i = GT; RET(GT);
259: }
260: case '+':
261: if (peek() == '+') {
262: input(); yylval.i = INCR; RET(INCR);
263: } else if (peek() == '=') {
264: input(); yylval.i = ADDEQ; RET(ASGNOP);
265: } else
266: RET('+');
267: case '-':
268: if (peek() == '-') {
269: input(); yylval.i = DECR; RET(DECR);
270: } else if (peek() == '=') {
271: input(); yylval.i = SUBEQ; RET(ASGNOP);
272: } else
273: RET('-');
274: case '*':
275: if (peek() == '=') { /* *= */
276: input(); yylval.i = MULTEQ; RET(ASGNOP);
277: } else if (peek() == '*') { /* ** or **= */
278: input(); /* eat 2nd * */
279: if (peek() == '=') {
280: input(); yylval.i = POWEQ; RET(ASGNOP);
281: } else {
282: RET(POWER);
283: }
284: } else
285: RET('*');
286: case '/':
1.3 ! millert 287: RET('/');
1.1 kstailey 288: case '%':
289: if (peek() == '=') {
290: input(); yylval.i = MODEQ; RET(ASGNOP);
291: } else
292: RET('%');
293: case '^':
294: if (peek() == '=') {
295: input(); yylval.i = POWEQ; RET(ASGNOP);
296: } else
297: RET(POWER);
298:
299: case '$':
300: /* BUG: awkward, if not wrong */
301: c = gettok(&buf, &bufsize);
1.3 ! millert 302: if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
1.1 kstailey 303: unputstr(buf);
304: RET(INDIRECT);
305: } else if (isalpha(c)) {
306: if (strcmp(buf, "NF") == 0) { /* very special */
307: unputstr("(NF)");
308: RET(INDIRECT);
309: }
310: yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
311: RET(IVAR);
312: } else {
313: unputstr(buf);
314: RET(INDIRECT);
315: }
316:
317: case '}':
318: if (--bracecnt < 0)
319: ERROR "extra }" SYNTAX;
320: sc = 1;
321: RET(';');
322: case ']':
323: if (--brackcnt < 0)
324: ERROR "extra ]" SYNTAX;
325: RET(']');
326: case ')':
327: if (--parencnt < 0)
328: ERROR "extra )" SYNTAX;
329: RET(')');
330: case '{':
331: bracecnt++;
332: RET('{');
333: case '[':
334: brackcnt++;
335: RET('[');
336: case '(':
337: parencnt++;
338: RET('(');
339:
340: case '"':
341: return string(); /* BUG: should be like tran.c ? */
342:
343: default:
344: RET(c);
345: }
346: }
347: }
348:
1.3 ! millert 349: int string(void)
1.1 kstailey 350: {
351: int c, n;
352: char *s, *bp;
353: static char *buf = 0;
354: static int bufsz = 500;
355:
356: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
357: ERROR "out of space for strings" FATAL;
358: for (bp = buf; (c = input()) != '"'; ) {
359: if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
360: ERROR "out of space for string %.10s...", buf FATAL;
361: switch (c) {
362: case '\n':
363: case '\r':
364: case 0:
365: ERROR "non-terminated string %.10s...", buf SYNTAX;
366: lineno++;
367: break;
368: case '\\':
369: c = input();
370: switch (c) {
371: case '"': *bp++ = '"'; break;
372: case 'n': *bp++ = '\n'; break;
373: case 't': *bp++ = '\t'; break;
374: case 'f': *bp++ = '\f'; break;
375: case 'r': *bp++ = '\r'; break;
376: case 'b': *bp++ = '\b'; break;
377: case 'v': *bp++ = '\v'; break;
1.3 ! millert 378: case 'a': *bp++ = '\007'; break;
1.1 kstailey 379: case '\\': *bp++ = '\\'; break;
380:
381: case '0': case '1': case '2': /* octal: \d \dd \ddd */
382: case '3': case '4': case '5': case '6': case '7':
383: n = c - '0';
384: if ((c = peek()) >= '0' && c < '8') {
385: n = 8 * n + input() - '0';
386: if ((c = peek()) >= '0' && c < '8')
387: n = 8 * n + input() - '0';
388: }
389: *bp++ = n;
390: break;
391:
392: case 'x': /* hex \x0-9a-fA-F + */
393: { char xbuf[100], *px;
394: for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
395: if (isdigit(c)
396: || (c >= 'a' && c <= 'f')
397: || (c >= 'A' && c <= 'F'))
398: *px++ = c;
399: else
400: break;
401: }
402: *px = 0;
403: unput(c);
404: sscanf(xbuf, "%x", &n);
405: *bp++ = n;
406: break;
407: }
408:
409: default:
410: *bp++ = c;
411: break;
412: }
413: break;
414: default:
415: *bp++ = c;
416: break;
417: }
418: }
419: *bp = 0;
420: s = tostring(buf);
421: *bp++ = ' '; *bp++ = 0;
422: yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
423: RET(STRING);
424: }
425:
426:
427: int binsearch(char *w, Keyword *kp, int n)
428: {
429: int cond, low, mid, high;
430:
431: low = 0;
432: high = n - 1;
433: while (low <= high) {
434: mid = (low + high) / 2;
435: if ((cond = strcmp(w, kp[mid].word)) < 0)
436: high = mid - 1;
437: else if (cond > 0)
438: low = mid + 1;
439: else
440: return mid;
441: }
442: return -1;
443: }
444:
445: int word(char *w)
446: {
447: Keyword *kp;
448: int c, n;
449:
450: n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
451: kp = keywords + n;
452: if (n != -1) { /* found in table */
453: yylval.i = kp->sub;
454: switch (kp->type) { /* special handling */
455: case FSYSTEM:
456: if (safe)
457: ERROR "system is unsafe" SYNTAX;
458: RET(kp->type);
459: case FUNC:
460: if (infunc)
461: ERROR "illegal nested function" SYNTAX;
462: RET(kp->type);
463: case RETURN:
464: if (!infunc)
465: ERROR "return not in function" SYNTAX;
466: RET(kp->type);
467: case VARNF:
468: yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
469: RET(VARNF);
470: default:
471: RET(kp->type);
472: }
473: }
474: c = peek(); /* look for '(' */
475: if (c != '(' && infunc && (n=isarg(w)) >= 0) {
476: yylval.i = n;
477: RET(ARG);
478: } else {
479: yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
480: if (c == '(') {
481: RET(CALL);
482: } else {
483: RET(VAR);
484: }
485: }
486: }
487:
488: void startreg(void) /* next call to yyles will return a regular expression */
489: {
490: reg = 1;
491: }
492:
1.3 ! millert 493: int regexpr(void)
1.1 kstailey 494: {
495: int c;
496: static char *buf = 0;
497: static int bufsz = 500;
498: char *bp;
499:
500: if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
501: ERROR "out of space for rex expr" FATAL;
502: bp = buf;
503: for ( ; (c = input()) != '/' && c != 0; ) {
504: if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
505: ERROR "out of space for reg expr %.10s...", buf FATAL;
506: if (c == '\n') {
507: ERROR "newline in regular expression %.10s...", buf SYNTAX;
508: unput('\n');
509: break;
510: } else if (c == '\\') {
511: *bp++ = '\\';
512: *bp++ = input();
513: } else {
514: *bp++ = c;
515: }
516: }
517: *bp = 0;
518: yylval.s = tostring(buf);
519: unput('/');
520: RET(REGEXPR);
521: }
522:
523: /* low-level lexical stuff, sort of inherited from lex */
524:
/* Ring of the most recently read characters: input() stores, unput() steps back. */
char	ebuf[300];
char	*ep = ebuf;		/* next slot input() will write */

/* Stack of pushed-back characters: unput() pushes, input() pops. */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the top of the stack */

FILE	*yyin = 0;
530:
531: int input(void) /* get next lexical input character */
532: {
533: int c;
534: extern char *lexprog;
535:
536: if (yysptr > yysbuf)
537: c = *--yysptr;
538: else if (lexprog != NULL) { /* awk '...' */
539: if ((c = *lexprog) != 0)
540: lexprog++;
541: } else /* awk -f ... */
542: c = pgetc();
543: if (c == '\n')
544: lineno++;
545: else if (c == EOF)
546: c = 0;
547: if (ep >= ebuf + sizeof ebuf)
548: ep = ebuf;
549: return *ep++ = c;
550: }
551:
552: void unput(int c) /* put lexical character back on input */
553: {
554: if (c == '\n')
555: lineno--;
556: if (yysptr >= yysbuf + sizeof(yysbuf))
557: ERROR "pushed back too much: %.20s...", yysbuf FATAL;
558: *yysptr++ = c;
559: if (--ep < ebuf)
560: ep = ebuf + sizeof(ebuf) - 1;
561: }
562:
/*
 * unputstr - put a string back on input.
 * The characters are pushed last-to-first so that input() hands them
 * back in their original order.
 */
void unputstr(char *s)
{
	int left = strlen(s);

	while (--left >= 0)
		unput(s[left]);
}
569: }