Annotation of src/usr.bin/grep/util.c, Revision 1.34
1.34 ! otto 1: /* $OpenBSD: util.c,v 1.33 2006/11/17 02:01:29 jaredy Exp $ */
1.3 deraadt 2:
1.1 deraadt 3: /*-
4: * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28:
29: #include <sys/types.h>
30: #include <sys/stat.h>
31:
32: #include <ctype.h>
33: #include <err.h>
34: #include <errno.h>
35: #include <fts.h>
36: #include <regex.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <zlib.h>
42:
43: #include "grep.h"
44:
45: /*
46: * Process a file line by line...
47: */
48:
49: static int linesqueued;
1.4 tedu 50: static int procline(str_t *l, int);
1.23 millert 51: static int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch);
1.6 tedu 52: static int grep_cmp(const unsigned char *, const unsigned char *, size_t);
53: static void grep_revstr(unsigned char *, int);
1.1 deraadt 54:
1.2 deraadt 55: int
1.1 deraadt 56: grep_tree(char **argv)
57: {
1.10 deraadt 58: FTS *fts;
59: FTSENT *p;
60: int c, fts_flags;
1.1 deraadt 61:
62: c = fts_flags = 0;
63:
64: if (Hflag)
65: fts_flags = FTS_COMFOLLOW;
66: if (Pflag)
67: fts_flags = FTS_PHYSICAL;
68: if (Sflag)
69: fts_flags = FTS_LOGICAL;
70:
71: fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
72:
1.11 millert 73: if (!(fts = fts_open(argv, fts_flags, NULL)))
1.14 millert 74: err(2, NULL);
1.1 deraadt 75: while ((p = fts_read(fts)) != NULL) {
76: switch (p->fts_info) {
77: case FTS_DNR:
78: break;
79: case FTS_ERR:
1.14 millert 80: errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
1.1 deraadt 81: break;
82: case FTS_DP:
83: break;
84: default:
85: c += procfile(p->fts_path);
86: break;
87: }
88: }
1.34 ! otto 89: if (errno)
! 90: err(2, "fts_read");
1.1 deraadt 91:
92: return c;
93: }
94:
95: int
96: procfile(char *fn)
97: {
98: str_t ln;
99: file_t *f;
1.4 tedu 100: int c, t, z, nottext;
1.1 deraadt 101:
102: if (fn == NULL) {
103: fn = "(standard input)";
104: f = grep_fdopen(STDIN_FILENO, "r");
105: } else {
106: f = grep_open(fn, "r");
107: }
108: if (f == NULL) {
109: if (!sflag)
110: warn("%s", fn);
111: return 0;
112: }
1.4 tedu 113:
114: nottext = grep_bin_file(f);
115: if (nottext && binbehave == BIN_FILE_SKIP) {
1.1 deraadt 116: grep_close(f);
117: return 0;
118: }
119:
120: ln.file = fn;
121: ln.line_no = 0;
1.20 espie 122: ln.len = 0;
1.1 deraadt 123: linesqueued = 0;
1.33 jaredy 124: tail = 0;
1.1 deraadt 125: ln.off = -1;
126:
127: if (Bflag > 0)
128: initqueue();
1.27 otto 129: for (c = 0; c == 0 || !(lflag || qflag); ) {
1.1 deraadt 130: ln.off += ln.len + 1;
131: if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
132: break;
133: if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
134: --ln.len;
135: ln.line_no++;
136:
137: z = tail;
1.2 deraadt 138:
1.4 tedu 139: if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
1.1 deraadt 140: enqueue(&ln);
141: linesqueued++;
142: }
143: c += t;
144: }
145: if (Bflag > 0)
146: clearqueue();
147: grep_close(f);
148:
149: if (cflag) {
150: if (!hflag)
151: printf("%s:", ln.file);
152: printf("%u\n", c);
153: }
154: if (lflag && c != 0)
155: printf("%s\n", fn);
156: if (Lflag && c == 0)
157: printf("%s\n", fn);
1.4 tedu 158: if (c && !cflag && !lflag && !Lflag &&
1.7 tedu 159: binbehave == BIN_FILE_BIN && nottext && !qflag)
1.4 tedu 160: printf("Binary file %s matches\n", fn);
161:
1.1 deraadt 162: return c;
163: }
164:
165:
166: /*
167: * Process an individual line in a file. Return non-zero if it matches.
168: */
169:
170: #define isword(x) (isalnum(x) || (x) == '_')
171:
172: static int
1.4 tedu 173: procline(str_t *l, int nottext)
1.1 deraadt 174: {
175: regmatch_t pmatch;
1.15 dhartmei 176: int c, i, r;
1.1 deraadt 177:
178: if (matchall) {
179: c = !vflag;
180: goto print;
181: }
1.2 deraadt 182:
1.1 deraadt 183: for (c = i = 0; i < patterns; i++) {
1.22 millert 184: if (fg_pattern[i].pattern) {
1.6 tedu 185: r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
1.9 millert 186: l->len, &pmatch);
1.22 millert 187: } else {
188: pmatch.rm_so = 0;
189: pmatch.rm_eo = l->len;
1.9 millert 190: r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
1.22 millert 191: }
192: if (r == 0 && xflag) {
193: if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
194: r = REG_NOMATCH;
1.1 deraadt 195: }
1.15 dhartmei 196: if (r == 0) {
1.1 deraadt 197: c++;
198: break;
199: }
200: }
1.15 dhartmei 201: if (vflag)
202: c = !c;
1.2 deraadt 203:
1.1 deraadt 204: print:
1.4 tedu 205: if (c && binbehave == BIN_FILE_BIN && nottext)
206: return c; /* Binary file */
207:
1.1 deraadt 208: if ((tail > 0 || c) && !cflag && !qflag) {
209: if (c) {
1.5 deraadt 210: if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
211: (Aflag || Bflag))
1.1 deraadt 212: printf("--\n");
213: first = 1;
214: tail = Aflag;
215: if (Bflag > 0)
216: printqueue();
217: linesqueued = 0;
218: printline(l, ':');
219: } else {
220: printline(l, '-');
221: tail--;
222: }
223: }
224: return c;
225: }
226:
1.31 otto 227: void
1.25 millert 228: fgrepcomp(fastgrep_t *fg, const char *pattern)
229: {
230: int i;
231:
232: /* Initialize. */
233: fg->patternLen = strlen(pattern);
234: fg->bol = 0;
235: fg->eol = 0;
236: fg->wmatch = wflag;
237: fg->reversedSearch = 0;
238:
239: /*
240: * Make a copy and upper case it for later if in -i mode,
241: * else just copy the pointer.
242: */
243: if (iflag) {
244: fg->pattern = grep_malloc(fg->patternLen + 1);
245: for (i = 0; i < fg->patternLen; i++)
246: fg->pattern[i] = toupper(pattern[i]);
247: fg->pattern[fg->patternLen] = '\0';
248: } else
1.28 deraadt 249: fg->pattern = (unsigned char *)pattern; /* really const */
1.25 millert 250:
251: /* Preprocess pattern. */
252: for (i = 0; i <= UCHAR_MAX; i++)
253: fg->qsBc[i] = fg->patternLen;
254: for (i = 1; i < fg->patternLen; i++) {
255: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
256: /*
257: * If case is ignored, make the jump apply to both upper and
258: * lower cased characters. As the pattern is stored in upper
259: * case, apply the same to the lower case equivalents.
260: */
261: if (iflag)
262: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
263: }
264: }
265:
266: /*
267: * Returns: -1 on failure, 0 on success
268: */
269: int
1.6 tedu 270: fastcomp(fastgrep_t *fg, const char *pattern)
271: {
272: int i;
273: int bol = 0;
274: int eol = 0;
275: int shiftPatternLen;
276: int hasDot = 0;
277: int firstHalfDot = -1;
278: int firstLastHalfDot = -1;
279: int lastHalfDot = 0;
280:
281: /* Initialize. */
1.28 deraadt 282: fg->patternLen = strlen(pattern);
1.6 tedu 283: fg->bol = 0;
284: fg->eol = 0;
1.22 millert 285: fg->wmatch = 0;
1.6 tedu 286: fg->reversedSearch = 0;
287:
288: /* Remove end-of-line character ('$'). */
289: if (pattern[fg->patternLen - 1] == '$') {
290: eol++;
291: fg->eol = 1;
292: fg->patternLen--;
293: }
294:
295: /* Remove beginning-of-line character ('^'). */
296: if (pattern[0] == '^') {
297: bol++;
298: fg->bol = 1;
299: fg->patternLen--;
300: }
301:
1.22 millert 302: /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
1.30 otto 303: if (wflag) {
304: /* basic re's use \( \), extended re's ( ) */
305: int extra = Eflag ? 1 : 2;
306: fg->patternLen -= 14 + 2 * extra;
307: fg->wmatch = 7 + extra;
308: } else if (fg->patternLen >= 14 &&
1.22 millert 309: strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
1.24 millert 310: strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
1.22 millert 311: fg->patternLen -= 14;
312: fg->wmatch = 7;
313: }
314:
1.6 tedu 315: /*
1.22 millert 316: * Copy pattern minus '^' and '$' characters as well as word
317: * match character classes at the beginning and ending of the
318: * string respectively.
1.6 tedu 319: */
1.22 millert 320: fg->pattern = grep_malloc(fg->patternLen + 1);
321: memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
322: fg->pattern[fg->patternLen] = '\0';
1.6 tedu 323:
324: /* Look for ways to cheat...er...avoid the full regex engine. */
325: for (i = 0; i < fg->patternLen; i++)
326: {
327: /* Can still cheat? */
328: if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
329: (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
330: (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
331: (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
332: (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
333: /* As long as it is good, upper case it for later. */
334: if (iflag)
335: fg->pattern[i] = toupper(fg->pattern[i]);
336: } else if (fg->pattern[i] == '.') {
337: hasDot = i;
338: if (i < fg->patternLen / 2) {
1.19 otto 339: if (firstHalfDot < 0)
1.6 tedu 340: /* Closest dot to the beginning */
341: firstHalfDot = i;
342: } else {
343: /* Closest dot to the end of the pattern. */
344: lastHalfDot = i;
345: if (firstLastHalfDot < 0)
346: firstLastHalfDot = i;
347: }
348: } else {
349: /* Free memory and let others know this is empty. */
350: free(fg->pattern);
351: fg->pattern = NULL;
352: return (-1);
353: }
354: }
355:
356: /*
357: * Determine if a reverse search would be faster based on the placement
358: * of the dots.
359: */
360: if ((!(lflag || cflag)) && ((!(bol || eol)) &&
361: ((lastHalfDot) && ((firstHalfDot < 0) ||
362: ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
363: fg->reversedSearch = 1;
364: hasDot = fg->patternLen - (firstHalfDot < 0 ?
365: firstLastHalfDot : firstHalfDot) - 1;
366: grep_revstr(fg->pattern, fg->patternLen);
367: }
368:
369: /*
370: * Normal Quick Search would require a shift based on the position the
371: * next character after the comparison is within the pattern. With
372: * wildcards, the position of the last dot effects the maximum shift
373: * distance.
374: * The closer to the end the wild card is the slower the search. A
1.10 deraadt 375: * reverse version of this algorithm would be useful for wildcards near
1.6 tedu 376: * the end of the string.
377: *
378: * Examples:
379: * Pattern Max shift
380: * ------- ---------
381: * this 5
382: * .his 4
383: * t.is 3
384: * th.s 2
385: * thi. 1
386: */
387:
388: /* Adjust the shift based on location of the last dot ('.'). */
389: shiftPatternLen = fg->patternLen - hasDot;
390:
391: /* Preprocess pattern. */
392: for (i = 0; i <= UCHAR_MAX; i++)
393: fg->qsBc[i] = shiftPatternLen;
394: for (i = hasDot + 1; i < fg->patternLen; i++) {
395: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
396: /*
397: * If case is ignored, make the jump apply to both upper and
398: * lower cased characters. As the pattern is stored in upper
399: * case, apply the same to the lower case equivalents.
400: */
401: if (iflag)
402: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
403: }
404:
405: /*
406: * Put pattern back to normal after pre-processing to allow for easy
407: * comparisons later.
408: */
409: if (fg->reversedSearch)
410: grep_revstr(fg->pattern, fg->patternLen);
411:
412: return (0);
413: }
414:
1.26 otto 415: /*
416: * Word boundaries using regular expressions are defined as the point
417: * of transition from a non-word char to a word char, or vice versa.
418: * This means that grep -w +a and grep -w a+ never match anything,
419: * because they lack a starting or ending transition, but grep -w a+b
420: * does match a line containing a+b.
421: */
1.22 millert 422: #define wmatch(d, l, s, e) \
1.26 otto 423: ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
424: e > s && isword(d[s]) && isword(d[e-1]))
1.22 millert 425:
1.9 millert 426: static int
1.23 millert 427: grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch)
1.6 tedu 428: {
429: int j;
430: int rtrnVal = REG_NOMATCH;
431:
1.9 millert 432: pmatch->rm_so = -1;
433: pmatch->rm_eo = -1;
434:
1.6 tedu 435: /* No point in going farther if we do not have enough data. */
436: if (dataLen < fg->patternLen)
437: return (rtrnVal);
438:
439: /* Only try once at the beginning or ending of the line. */
440: if (fg->bol || fg->eol) {
441: /* Simple text comparison. */
442: /* Verify data is >= pattern length before searching on it. */
443: if (dataLen >= fg->patternLen) {
444: /* Determine where in data to start search at. */
445: if (fg->eol)
446: j = dataLen - fg->patternLen;
447: else
448: j = 0;
449: if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
1.22 millert 450: if (grep_cmp(fg->pattern, data + j,
451: fg->patternLen) == -1) {
1.9 millert 452: pmatch->rm_so = j;
453: pmatch->rm_eo = j + fg->patternLen;
1.22 millert 454: if (!fg->wmatch || wmatch(data, dataLen,
455: pmatch->rm_so, pmatch->rm_eo))
456: rtrnVal = 0;
1.9 millert 457: }
1.6 tedu 458: }
459: } else if (fg->reversedSearch) {
460: /* Quick Search algorithm. */
1.17 millert 461: j = dataLen;
462: do {
1.6 tedu 463: if (grep_cmp(fg->pattern, data + j - fg->patternLen,
464: fg->patternLen) == -1) {
1.9 millert 465: pmatch->rm_so = j - fg->patternLen;
466: pmatch->rm_eo = j;
1.22 millert 467: if (!fg->wmatch || wmatch(data, dataLen,
468: pmatch->rm_so, pmatch->rm_eo)) {
469: rtrnVal = 0;
470: break;
471: }
1.6 tedu 472: }
1.17 millert 473: /* Shift if within bounds, otherwise, we are done. */
474: if (j == fg->patternLen)
475: break;
476: j -= fg->qsBc[data[j - fg->patternLen - 1]];
477: } while (j >= fg->patternLen);
1.6 tedu 478: } else {
479: /* Quick Search algorithm. */
480: j = 0;
481: do {
482: if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
1.9 millert 483: pmatch->rm_so = j;
484: pmatch->rm_eo = j + fg->patternLen;
1.32 jaredy 485: if (fg->patternLen == 0 || !fg->wmatch ||
486: wmatch(data, dataLen, pmatch->rm_so,
487: pmatch->rm_eo)) {
1.22 millert 488: rtrnVal = 0;
489: break;
490: }
1.6 tedu 491: }
492:
493: /* Shift if within bounds, otherwise, we are done. */
494: if (j + fg->patternLen == dataLen)
495: break;
496: else
497: j += fg->qsBc[data[j + fg->patternLen]];
498: } while (j <= (dataLen - fg->patternLen));
499: }
500:
501: return (rtrnVal);
502: }
503:
504:
1.1 deraadt 505: void *
506: grep_malloc(size_t size)
507: {
1.10 deraadt 508: void *ptr;
1.1 deraadt 509:
510: if ((ptr = malloc(size)) == NULL)
1.14 millert 511: err(2, "malloc");
1.1 deraadt 512: return ptr;
513: }
514:
515: void *
516: grep_realloc(void *ptr, size_t size)
517: {
518: if ((ptr = realloc(ptr, size)) == NULL)
1.14 millert 519: err(2, "realloc");
1.6 tedu 520: return ptr;
521: }
522:
523: /*
524: * Returns: i >= 0 on failure (position that it failed)
525: * -1 on success
526: */
1.18 avsm 527: static int
1.9 millert 528: grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len)
1.6 tedu 529: {
530: int i;
531:
532: for (i = 0; i < len; i++) {
1.25 millert 533: if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
534: || (iflag && pattern[i] == toupper(data[i])))
1.6 tedu 535: continue;
536: return (i);
537: }
538:
539: return (-1);
540: }
541:
542: static void
543: grep_revstr(unsigned char *str, int len)
544: {
545: int i;
546: char c;
547:
548: for (i = 0; i < len / 2; i++) {
549: c = str[i];
550: str[i] = str[len - i - 1];
551: str[len - i - 1] = c;
552: }
1.1 deraadt 553: }
554:
555: void
556: printline(str_t *line, int sep)
557: {
558: int n;
1.2 deraadt 559:
1.1 deraadt 560: n = 0;
561: if (!hflag) {
562: fputs(line->file, stdout);
563: ++n;
564: }
565: if (nflag) {
566: if (n)
567: putchar(sep);
568: printf("%d", line->line_no);
569: ++n;
570: }
571: if (bflag) {
572: if (n)
573: putchar(sep);
1.21 otto 574: printf("%lld", (long long)line->off);
575: ++n;
1.1 deraadt 576: }
577: if (n)
578: putchar(sep);
579: fwrite(line->dat, line->len, 1, stdout);
580: putchar('\n');
581: }