Annotation of src/usr.bin/grep/util.c, Revision 1.38
1.38 ! eric 1: /* $OpenBSD: util.c,v 1.37 2010/04/05 03:03:55 tedu Exp $ */
1.3 deraadt 2:
1.1 deraadt 3: /*-
4: * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28:
29: #include <sys/types.h>
30: #include <sys/stat.h>
31:
32: #include <ctype.h>
33: #include <err.h>
34: #include <errno.h>
35: #include <fts.h>
36: #include <regex.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <zlib.h>
42:
43: #include "grep.h"
44:
45: /*
46: * Process a file line by line...
47: */
48:
49: static int linesqueued;
1.4 tedu 50: static int procline(str_t *l, int);
1.23 millert 51: static int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch);
1.6 tedu 52: static int grep_cmp(const unsigned char *, const unsigned char *, size_t);
53: static void grep_revstr(unsigned char *, int);
1.1 deraadt 54:
/*
 * Walk the file hierarchies named in argv and run procfile() on every
 * entry that fts does not report as an unreadable directory (FTS_DNR),
 * a post-order directory visit (FTS_DP) or an error (FTS_ERR).
 *
 * Returns the sum of the procfile() results.  Exits with status 2 when
 * the traversal cannot be opened, when an entry is reported as FTS_ERR,
 * or when fts_read() stops with errno set.
 */
int
grep_tree(char **argv)
{
	FTS *fts;
	FTSENT *p;
	int c, fts_flags;

	c = 0;

	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;

	if (!(fts = fts_open(argv, fts_flags, NULL)))
		err(2, NULL);
	while ((p = fts_read(fts)) != NULL) {
		switch (p->fts_info) {
		case FTS_DNR:
		case FTS_DP:
			/* Nothing to search for these entries. */
			break;
		case FTS_ERR:
			errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
			break;
		default:
			c += procfile(p->fts_path);
			break;
		}
	}
	/*
	 * fts_read() returns NULL both at the end of the walk and on
	 * failure; a non-zero errno is what identifies the failure case.
	 * Check it before fts_close(), which may itself modify errno.
	 */
	if (errno)
		err(2, "fts_read");

	/* Release the traversal state allocated by fts_open(). */
	(void)fts_close(fts);

	return c;
}
87:
88: int
89: procfile(char *fn)
90: {
91: str_t ln;
92: file_t *f;
1.4 tedu 93: int c, t, z, nottext;
1.1 deraadt 94:
95: if (fn == NULL) {
96: fn = "(standard input)";
97: f = grep_fdopen(STDIN_FILENO, "r");
98: } else {
99: f = grep_open(fn, "r");
100: }
101: if (f == NULL) {
102: if (!sflag)
103: warn("%s", fn);
104: return 0;
105: }
1.4 tedu 106:
107: nottext = grep_bin_file(f);
108: if (nottext && binbehave == BIN_FILE_SKIP) {
1.1 deraadt 109: grep_close(f);
110: return 0;
111: }
112:
113: ln.file = fn;
114: ln.line_no = 0;
1.20 espie 115: ln.len = 0;
1.1 deraadt 116: linesqueued = 0;
1.33 jaredy 117: tail = 0;
1.1 deraadt 118: ln.off = -1;
119:
120: if (Bflag > 0)
121: initqueue();
1.27 otto 122: for (c = 0; c == 0 || !(lflag || qflag); ) {
1.1 deraadt 123: ln.off += ln.len + 1;
124: if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
125: break;
126: if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
127: --ln.len;
128: ln.line_no++;
129:
130: z = tail;
1.2 deraadt 131:
1.4 tedu 132: if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
1.1 deraadt 133: enqueue(&ln);
134: linesqueued++;
135: }
136: c += t;
137: }
138: if (Bflag > 0)
139: clearqueue();
140: grep_close(f);
141:
142: if (cflag) {
143: if (!hflag)
144: printf("%s:", ln.file);
145: printf("%u\n", c);
146: }
147: if (lflag && c != 0)
148: printf("%s\n", fn);
149: if (Lflag && c == 0)
150: printf("%s\n", fn);
1.4 tedu 151: if (c && !cflag && !lflag && !Lflag &&
1.7 tedu 152: binbehave == BIN_FILE_BIN && nottext && !qflag)
1.4 tedu 153: printf("Binary file %s matches\n", fn);
154:
1.1 deraadt 155: return c;
156: }
157:
158:
159: /*
160: * Process an individual line in a file. Return non-zero if it matches.
161: */
162:
163: #define isword(x) (isalnum(x) || (x) == '_')
164:
165: static int
1.4 tedu 166: procline(str_t *l, int nottext)
1.1 deraadt 167: {
168: regmatch_t pmatch;
1.15 dhartmei 169: int c, i, r;
1.1 deraadt 170:
171: if (matchall) {
172: c = !vflag;
173: goto print;
174: }
1.2 deraadt 175:
1.1 deraadt 176: for (c = i = 0; i < patterns; i++) {
1.22 millert 177: if (fg_pattern[i].pattern) {
1.6 tedu 178: r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
1.9 millert 179: l->len, &pmatch);
1.22 millert 180: } else {
181: pmatch.rm_so = 0;
182: pmatch.rm_eo = l->len;
1.9 millert 183: r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
1.22 millert 184: }
185: if (r == 0 && xflag) {
186: if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
187: r = REG_NOMATCH;
1.1 deraadt 188: }
1.15 dhartmei 189: if (r == 0) {
1.1 deraadt 190: c++;
191: break;
192: }
193: }
1.15 dhartmei 194: if (vflag)
195: c = !c;
1.2 deraadt 196:
1.1 deraadt 197: print:
1.4 tedu 198: if (c && binbehave == BIN_FILE_BIN && nottext)
199: return c; /* Binary file */
200:
1.1 deraadt 201: if ((tail > 0 || c) && !cflag && !qflag) {
202: if (c) {
1.5 deraadt 203: if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
204: (Aflag || Bflag))
1.1 deraadt 205: printf("--\n");
206: first = 1;
207: tail = Aflag;
208: if (Bflag > 0)
209: printqueue();
210: linesqueued = 0;
211: printline(l, ':');
212: } else {
213: printline(l, '-');
214: tail--;
215: }
216: }
217: return c;
218: }
219:
1.31 otto 220: void
1.25 millert 221: fgrepcomp(fastgrep_t *fg, const char *pattern)
222: {
223: int i;
224:
225: /* Initialize. */
226: fg->patternLen = strlen(pattern);
227: fg->bol = 0;
228: fg->eol = 0;
229: fg->wmatch = wflag;
230: fg->reversedSearch = 0;
231:
232: /*
233: * Make a copy and upper case it for later if in -i mode,
234: * else just copy the pointer.
235: */
236: if (iflag) {
237: fg->pattern = grep_malloc(fg->patternLen + 1);
238: for (i = 0; i < fg->patternLen; i++)
239: fg->pattern[i] = toupper(pattern[i]);
240: fg->pattern[fg->patternLen] = '\0';
241: } else
1.28 deraadt 242: fg->pattern = (unsigned char *)pattern; /* really const */
1.25 millert 243:
244: /* Preprocess pattern. */
245: for (i = 0; i <= UCHAR_MAX; i++)
246: fg->qsBc[i] = fg->patternLen;
247: for (i = 1; i < fg->patternLen; i++) {
248: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
249: /*
250: * If case is ignored, make the jump apply to both upper and
251: * lower cased characters. As the pattern is stored in upper
252: * case, apply the same to the lower case equivalents.
253: */
254: if (iflag)
255: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
256: }
257: }
258:
259: /*
260: * Returns: -1 on failure, 0 on success
261: */
262: int
1.6 tedu 263: fastcomp(fastgrep_t *fg, const char *pattern)
264: {
265: int i;
266: int bol = 0;
267: int eol = 0;
268: int shiftPatternLen;
269: int hasDot = 0;
270: int firstHalfDot = -1;
271: int firstLastHalfDot = -1;
272: int lastHalfDot = 0;
273:
274: /* Initialize. */
1.28 deraadt 275: fg->patternLen = strlen(pattern);
1.6 tedu 276: fg->bol = 0;
277: fg->eol = 0;
1.22 millert 278: fg->wmatch = 0;
1.6 tedu 279: fg->reversedSearch = 0;
280:
281: /* Remove end-of-line character ('$'). */
1.38 ! eric 282: if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
1.6 tedu 283: eol++;
284: fg->eol = 1;
285: fg->patternLen--;
286: }
287:
288: /* Remove beginning-of-line character ('^'). */
289: if (pattern[0] == '^') {
290: bol++;
291: fg->bol = 1;
292: fg->patternLen--;
293: }
294:
1.22 millert 295: /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
1.30 otto 296: if (wflag) {
297: /* basic re's use \( \), extended re's ( ) */
298: int extra = Eflag ? 1 : 2;
299: fg->patternLen -= 14 + 2 * extra;
300: fg->wmatch = 7 + extra;
301: } else if (fg->patternLen >= 14 &&
1.22 millert 302: strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
1.24 millert 303: strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
1.22 millert 304: fg->patternLen -= 14;
305: fg->wmatch = 7;
306: }
307:
1.6 tedu 308: /*
1.22 millert 309: * Copy pattern minus '^' and '$' characters as well as word
310: * match character classes at the beginning and ending of the
311: * string respectively.
1.6 tedu 312: */
1.22 millert 313: fg->pattern = grep_malloc(fg->patternLen + 1);
314: memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
315: fg->pattern[fg->patternLen] = '\0';
1.6 tedu 316:
317: /* Look for ways to cheat...er...avoid the full regex engine. */
318: for (i = 0; i < fg->patternLen; i++)
319: {
320: /* Can still cheat? */
321: if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
322: (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
323: (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
324: (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
325: /* As long as it is good, upper case it for later. */
326: if (iflag)
327: fg->pattern[i] = toupper(fg->pattern[i]);
328: } else if (fg->pattern[i] == '.') {
329: hasDot = i;
330: if (i < fg->patternLen / 2) {
1.19 otto 331: if (firstHalfDot < 0)
1.6 tedu 332: /* Closest dot to the beginning */
333: firstHalfDot = i;
334: } else {
335: /* Closest dot to the end of the pattern. */
336: lastHalfDot = i;
337: if (firstLastHalfDot < 0)
338: firstLastHalfDot = i;
339: }
340: } else {
341: /* Free memory and let others know this is empty. */
342: free(fg->pattern);
343: fg->pattern = NULL;
344: return (-1);
345: }
346: }
347:
348: /*
349: * Determine if a reverse search would be faster based on the placement
350: * of the dots.
351: */
352: if ((!(lflag || cflag)) && ((!(bol || eol)) &&
353: ((lastHalfDot) && ((firstHalfDot < 0) ||
354: ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
355: fg->reversedSearch = 1;
356: hasDot = fg->patternLen - (firstHalfDot < 0 ?
357: firstLastHalfDot : firstHalfDot) - 1;
358: grep_revstr(fg->pattern, fg->patternLen);
359: }
360:
361: /*
362: * Normal Quick Search would require a shift based on the position the
363: * next character after the comparison is within the pattern. With
364: * wildcards, the position of the last dot effects the maximum shift
365: * distance.
366: * The closer to the end the wild card is the slower the search. A
1.10 deraadt 367: * reverse version of this algorithm would be useful for wildcards near
1.6 tedu 368: * the end of the string.
369: *
370: * Examples:
371: * Pattern Max shift
372: * ------- ---------
373: * this 5
374: * .his 4
375: * t.is 3
376: * th.s 2
377: * thi. 1
378: */
379:
380: /* Adjust the shift based on location of the last dot ('.'). */
381: shiftPatternLen = fg->patternLen - hasDot;
382:
383: /* Preprocess pattern. */
384: for (i = 0; i <= UCHAR_MAX; i++)
385: fg->qsBc[i] = shiftPatternLen;
386: for (i = hasDot + 1; i < fg->patternLen; i++) {
387: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
388: /*
389: * If case is ignored, make the jump apply to both upper and
390: * lower cased characters. As the pattern is stored in upper
391: * case, apply the same to the lower case equivalents.
392: */
393: if (iflag)
394: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
395: }
396:
397: /*
398: * Put pattern back to normal after pre-processing to allow for easy
399: * comparisons later.
400: */
401: if (fg->reversedSearch)
402: grep_revstr(fg->pattern, fg->patternLen);
403:
404: return (0);
405: }
406:
1.26 otto 407: /*
408: * Word boundaries using regular expressions are defined as the point
409: * of transition from a non-word char to a word char, or vice versa.
410: * This means that grep -w +a and grep -w a+ never match anything,
411: * because they lack a starting or ending transition, but grep -w a+b
412: * does match a line containing a+b.
413: */
1.22 millert 414: #define wmatch(d, l, s, e) \
1.26 otto 415: ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
416: e > s && isword(d[s]) && isword(d[e-1]))
1.22 millert 417:
1.9 millert 418: static int
1.23 millert 419: grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch)
1.6 tedu 420: {
421: int j;
422: int rtrnVal = REG_NOMATCH;
423:
1.9 millert 424: pmatch->rm_so = -1;
425: pmatch->rm_eo = -1;
426:
1.6 tedu 427: /* No point in going farther if we do not have enough data. */
428: if (dataLen < fg->patternLen)
429: return (rtrnVal);
430:
431: /* Only try once at the beginning or ending of the line. */
432: if (fg->bol || fg->eol) {
433: /* Simple text comparison. */
434: /* Verify data is >= pattern length before searching on it. */
435: if (dataLen >= fg->patternLen) {
436: /* Determine where in data to start search at. */
437: if (fg->eol)
438: j = dataLen - fg->patternLen;
439: else
440: j = 0;
441: if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
1.22 millert 442: if (grep_cmp(fg->pattern, data + j,
443: fg->patternLen) == -1) {
1.9 millert 444: pmatch->rm_so = j;
445: pmatch->rm_eo = j + fg->patternLen;
1.22 millert 446: if (!fg->wmatch || wmatch(data, dataLen,
447: pmatch->rm_so, pmatch->rm_eo))
448: rtrnVal = 0;
1.9 millert 449: }
1.6 tedu 450: }
451: } else if (fg->reversedSearch) {
452: /* Quick Search algorithm. */
1.17 millert 453: j = dataLen;
454: do {
1.6 tedu 455: if (grep_cmp(fg->pattern, data + j - fg->patternLen,
456: fg->patternLen) == -1) {
1.9 millert 457: pmatch->rm_so = j - fg->patternLen;
458: pmatch->rm_eo = j;
1.22 millert 459: if (!fg->wmatch || wmatch(data, dataLen,
460: pmatch->rm_so, pmatch->rm_eo)) {
461: rtrnVal = 0;
462: break;
463: }
1.6 tedu 464: }
1.17 millert 465: /* Shift if within bounds, otherwise, we are done. */
466: if (j == fg->patternLen)
467: break;
468: j -= fg->qsBc[data[j - fg->patternLen - 1]];
469: } while (j >= fg->patternLen);
1.6 tedu 470: } else {
471: /* Quick Search algorithm. */
472: j = 0;
473: do {
474: if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
1.9 millert 475: pmatch->rm_so = j;
476: pmatch->rm_eo = j + fg->patternLen;
1.32 jaredy 477: if (fg->patternLen == 0 || !fg->wmatch ||
478: wmatch(data, dataLen, pmatch->rm_so,
479: pmatch->rm_eo)) {
1.22 millert 480: rtrnVal = 0;
481: break;
482: }
1.6 tedu 483: }
484:
485: /* Shift if within bounds, otherwise, we are done. */
486: if (j + fg->patternLen == dataLen)
487: break;
488: else
489: j += fg->qsBc[data[j + fg->patternLen]];
490: } while (j <= (dataLen - fg->patternLen));
491: }
492:
493: return (rtrnVal);
494: }
495:
496:
/*
 * malloc() wrapper: returns the new allocation, or terminates the program
 * with exit status 2 when malloc() returns NULL.
 */
void *
grep_malloc(size_t size)
{
	void *mem = malloc(size);

	if (mem == NULL)
		err(2, "malloc");
	return mem;
}
506:
/*
 * calloc() wrapper: returns the new allocation, or terminates the program
 * with exit status 2 when calloc() returns NULL.
 */
void *
grep_calloc(size_t nmemb, size_t size)
{
	void *mem = calloc(nmemb, size);

	if (mem == NULL)
		err(2, "calloc");
	return mem;
}
516:
/*
 * realloc() wrapper: returns the resized allocation, or terminates the
 * program with exit status 2 when realloc() returns NULL.
 */
void *
grep_realloc(void *ptr, size_t size)
{
	void *resized = realloc(ptr, size);

	if (resized == NULL)
		err(2, "realloc");
	return resized;
}
524:
525: /*
526: * Returns: i >= 0 on failure (position that it failed)
527: * -1 on success
528: */
1.18 avsm 529: static int
1.9 millert 530: grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len)
1.6 tedu 531: {
532: int i;
533:
534: for (i = 0; i < len; i++) {
1.25 millert 535: if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
536: || (iflag && pattern[i] == toupper(data[i])))
1.6 tedu 537: continue;
538: return (i);
539: }
540:
541: return (-1);
542: }
543:
/*
 * Reverse the first len bytes of str in place.
 *
 * The swap temporary has the same unsigned char type as the buffer, so
 * bytes above 0x7f are never passed through a (possibly signed) char.
 * A len of 0 or 1 leaves the buffer untouched.
 */
static void
grep_revstr(unsigned char *str, int len)
{
	int i;
	unsigned char c;

	for (i = 0; i < len / 2; i++) {
		c = str[i];
		str[i] = str[len - i - 1];
		str[len - i - 1] = c;
	}
}
556:
557: void
558: printline(str_t *line, int sep)
559: {
560: int n;
1.2 deraadt 561:
1.1 deraadt 562: n = 0;
563: if (!hflag) {
564: fputs(line->file, stdout);
565: ++n;
566: }
567: if (nflag) {
568: if (n)
569: putchar(sep);
570: printf("%d", line->line_no);
571: ++n;
572: }
573: if (bflag) {
574: if (n)
575: putchar(sep);
1.21 otto 576: printf("%lld", (long long)line->off);
577: ++n;
1.1 deraadt 578: }
579: if (n)
580: putchar(sep);
581: fwrite(line->dat, line->len, 1, stdout);
582: putchar('\n');
583: }