Annotation of src/usr.bin/grep/util.c, Revision 1.31
1.31 ! otto 1: /* $OpenBSD: util.c,v 1.30 2005/04/03 19:12:40 otto Exp $ */
1.3 deraadt 2:
1.1 deraadt 3: /*-
4: * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28:
29: #include <sys/types.h>
30: #include <sys/stat.h>
31:
32: #include <ctype.h>
33: #include <err.h>
34: #include <errno.h>
35: #include <fts.h>
36: #include <regex.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <zlib.h>
42:
43: #include "grep.h"
44:
45: /*
46: * Process a file line by line...
47: */
48:
49: static int linesqueued;
1.4 tedu 50: static int procline(str_t *l, int);
1.23 millert 51: static int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch);
1.6 tedu 52: static int grep_cmp(const unsigned char *, const unsigned char *, size_t);
53: static void grep_revstr(unsigned char *, int);
1.1 deraadt 54:
1.2 deraadt 55: int
1.1 deraadt 56: grep_tree(char **argv)
57: {
1.10 deraadt 58: FTS *fts;
59: FTSENT *p;
60: int c, fts_flags;
1.1 deraadt 61:
62: c = fts_flags = 0;
63:
64: if (Hflag)
65: fts_flags = FTS_COMFOLLOW;
66: if (Pflag)
67: fts_flags = FTS_PHYSICAL;
68: if (Sflag)
69: fts_flags = FTS_LOGICAL;
70:
71: fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
72:
1.11 millert 73: if (!(fts = fts_open(argv, fts_flags, NULL)))
1.14 millert 74: err(2, NULL);
1.1 deraadt 75: while ((p = fts_read(fts)) != NULL) {
76: switch (p->fts_info) {
77: case FTS_DNR:
78: break;
79: case FTS_ERR:
1.14 millert 80: errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
1.1 deraadt 81: break;
82: case FTS_DP:
83: break;
84: default:
85: c += procfile(p->fts_path);
86: break;
87: }
88: }
89:
90: return c;
91: }
92:
93: int
94: procfile(char *fn)
95: {
96: str_t ln;
97: file_t *f;
1.4 tedu 98: int c, t, z, nottext;
1.1 deraadt 99:
100: if (fn == NULL) {
101: fn = "(standard input)";
102: f = grep_fdopen(STDIN_FILENO, "r");
103: } else {
104: f = grep_open(fn, "r");
105: }
106: if (f == NULL) {
107: if (!sflag)
108: warn("%s", fn);
109: return 0;
110: }
1.4 tedu 111:
112: nottext = grep_bin_file(f);
113: if (nottext && binbehave == BIN_FILE_SKIP) {
1.1 deraadt 114: grep_close(f);
115: return 0;
116: }
117:
118: ln.file = fn;
119: ln.line_no = 0;
1.20 espie 120: ln.len = 0;
1.1 deraadt 121: linesqueued = 0;
122: ln.off = -1;
123:
124: if (Bflag > 0)
125: initqueue();
1.27 otto 126: for (c = 0; c == 0 || !(lflag || qflag); ) {
1.1 deraadt 127: ln.off += ln.len + 1;
128: if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
129: break;
130: if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
131: --ln.len;
132: ln.line_no++;
133:
134: z = tail;
1.2 deraadt 135:
1.4 tedu 136: if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
1.1 deraadt 137: enqueue(&ln);
138: linesqueued++;
139: }
140: c += t;
141: }
142: if (Bflag > 0)
143: clearqueue();
144: grep_close(f);
145:
146: if (cflag) {
147: if (!hflag)
148: printf("%s:", ln.file);
149: printf("%u\n", c);
150: }
151: if (lflag && c != 0)
152: printf("%s\n", fn);
153: if (Lflag && c == 0)
154: printf("%s\n", fn);
1.4 tedu 155: if (c && !cflag && !lflag && !Lflag &&
1.7 tedu 156: binbehave == BIN_FILE_BIN && nottext && !qflag)
1.4 tedu 157: printf("Binary file %s matches\n", fn);
158:
1.1 deraadt 159: return c;
160: }
161:
162:
163: /*
164: * Process an individual line in a file. Return non-zero if it matches.
165: */
166:
167: #define isword(x) (isalnum(x) || (x) == '_')
168:
169: static int
1.4 tedu 170: procline(str_t *l, int nottext)
1.1 deraadt 171: {
172: regmatch_t pmatch;
1.15 dhartmei 173: int c, i, r;
1.1 deraadt 174:
175: if (matchall) {
176: c = !vflag;
177: goto print;
178: }
1.2 deraadt 179:
1.1 deraadt 180: for (c = i = 0; i < patterns; i++) {
1.22 millert 181: if (fg_pattern[i].pattern) {
1.6 tedu 182: r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
1.9 millert 183: l->len, &pmatch);
1.22 millert 184: } else {
185: pmatch.rm_so = 0;
186: pmatch.rm_eo = l->len;
1.9 millert 187: r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
1.22 millert 188: }
189: if (r == 0 && xflag) {
190: if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
191: r = REG_NOMATCH;
1.1 deraadt 192: }
1.15 dhartmei 193: if (r == 0) {
1.1 deraadt 194: c++;
195: break;
196: }
197: }
1.15 dhartmei 198: if (vflag)
199: c = !c;
1.2 deraadt 200:
1.1 deraadt 201: print:
1.4 tedu 202: if (c && binbehave == BIN_FILE_BIN && nottext)
203: return c; /* Binary file */
204:
1.1 deraadt 205: if ((tail > 0 || c) && !cflag && !qflag) {
206: if (c) {
1.5 deraadt 207: if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
208: (Aflag || Bflag))
1.1 deraadt 209: printf("--\n");
210: first = 1;
211: tail = Aflag;
212: if (Bflag > 0)
213: printqueue();
214: linesqueued = 0;
215: printline(l, ':');
216: } else {
217: printline(l, '-');
218: tail--;
219: }
220: }
221: return c;
222: }
223:
1.31 ! otto 224: void
1.25 millert 225: fgrepcomp(fastgrep_t *fg, const char *pattern)
226: {
227: int i;
228:
229: /* Initialize. */
230: fg->patternLen = strlen(pattern);
231: fg->bol = 0;
232: fg->eol = 0;
233: fg->wmatch = wflag;
234: fg->reversedSearch = 0;
235:
236: /*
237: * Make a copy and upper case it for later if in -i mode,
238: * else just copy the pointer.
239: */
240: if (iflag) {
241: fg->pattern = grep_malloc(fg->patternLen + 1);
242: for (i = 0; i < fg->patternLen; i++)
243: fg->pattern[i] = toupper(pattern[i]);
244: fg->pattern[fg->patternLen] = '\0';
245: } else
1.28 deraadt 246: fg->pattern = (unsigned char *)pattern; /* really const */
1.25 millert 247:
248: /* Preprocess pattern. */
249: for (i = 0; i <= UCHAR_MAX; i++)
250: fg->qsBc[i] = fg->patternLen;
251: for (i = 1; i < fg->patternLen; i++) {
252: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
253: /*
254: * If case is ignored, make the jump apply to both upper and
255: * lower cased characters. As the pattern is stored in upper
256: * case, apply the same to the lower case equivalents.
257: */
258: if (iflag)
259: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
260: }
261: }
262:
263: /*
264: * Returns: -1 on failure, 0 on success
265: */
266: int
1.6 tedu 267: fastcomp(fastgrep_t *fg, const char *pattern)
268: {
269: int i;
270: int bol = 0;
271: int eol = 0;
272: int shiftPatternLen;
273: int hasDot = 0;
274: int firstHalfDot = -1;
275: int firstLastHalfDot = -1;
276: int lastHalfDot = 0;
277:
278: /* Initialize. */
1.28 deraadt 279: fg->patternLen = strlen(pattern);
1.6 tedu 280: fg->bol = 0;
281: fg->eol = 0;
1.22 millert 282: fg->wmatch = 0;
1.6 tedu 283: fg->reversedSearch = 0;
284:
285: /* Remove end-of-line character ('$'). */
286: if (pattern[fg->patternLen - 1] == '$') {
287: eol++;
288: fg->eol = 1;
289: fg->patternLen--;
290: }
291:
292: /* Remove beginning-of-line character ('^'). */
293: if (pattern[0] == '^') {
294: bol++;
295: fg->bol = 1;
296: fg->patternLen--;
297: }
298:
1.22 millert 299: /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
1.30 otto 300: if (wflag) {
301: /* basic re's use \( \), extended re's ( ) */
302: int extra = Eflag ? 1 : 2;
303: fg->patternLen -= 14 + 2 * extra;
304: fg->wmatch = 7 + extra;
305: } else if (fg->patternLen >= 14 &&
1.22 millert 306: strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
1.24 millert 307: strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
1.22 millert 308: fg->patternLen -= 14;
309: fg->wmatch = 7;
310: }
311:
1.6 tedu 312: /*
1.22 millert 313: * Copy pattern minus '^' and '$' characters as well as word
314: * match character classes at the beginning and ending of the
315: * string respectively.
1.6 tedu 316: */
1.22 millert 317: fg->pattern = grep_malloc(fg->patternLen + 1);
318: memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
319: fg->pattern[fg->patternLen] = '\0';
1.6 tedu 320:
321: /* Look for ways to cheat...er...avoid the full regex engine. */
322: for (i = 0; i < fg->patternLen; i++)
323: {
324: /* Can still cheat? */
325: if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
326: (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
327: (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
328: (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
329: (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
330: /* As long as it is good, upper case it for later. */
331: if (iflag)
332: fg->pattern[i] = toupper(fg->pattern[i]);
333: } else if (fg->pattern[i] == '.') {
334: hasDot = i;
335: if (i < fg->patternLen / 2) {
1.19 otto 336: if (firstHalfDot < 0)
1.6 tedu 337: /* Closest dot to the beginning */
338: firstHalfDot = i;
339: } else {
340: /* Closest dot to the end of the pattern. */
341: lastHalfDot = i;
342: if (firstLastHalfDot < 0)
343: firstLastHalfDot = i;
344: }
345: } else {
346: /* Free memory and let others know this is empty. */
347: free(fg->pattern);
348: fg->pattern = NULL;
349: return (-1);
350: }
351: }
352:
353: /*
354: * Determine if a reverse search would be faster based on the placement
355: * of the dots.
356: */
357: if ((!(lflag || cflag)) && ((!(bol || eol)) &&
358: ((lastHalfDot) && ((firstHalfDot < 0) ||
359: ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
360: fg->reversedSearch = 1;
361: hasDot = fg->patternLen - (firstHalfDot < 0 ?
362: firstLastHalfDot : firstHalfDot) - 1;
363: grep_revstr(fg->pattern, fg->patternLen);
364: }
365:
366: /*
367: * Normal Quick Search would require a shift based on the position the
368: * next character after the comparison is within the pattern. With
369: * wildcards, the position of the last dot effects the maximum shift
370: * distance.
371: * The closer to the end the wild card is the slower the search. A
1.10 deraadt 372: * reverse version of this algorithm would be useful for wildcards near
1.6 tedu 373: * the end of the string.
374: *
375: * Examples:
376: * Pattern Max shift
377: * ------- ---------
378: * this 5
379: * .his 4
380: * t.is 3
381: * th.s 2
382: * thi. 1
383: */
384:
385: /* Adjust the shift based on location of the last dot ('.'). */
386: shiftPatternLen = fg->patternLen - hasDot;
387:
388: /* Preprocess pattern. */
389: for (i = 0; i <= UCHAR_MAX; i++)
390: fg->qsBc[i] = shiftPatternLen;
391: for (i = hasDot + 1; i < fg->patternLen; i++) {
392: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
393: /*
394: * If case is ignored, make the jump apply to both upper and
395: * lower cased characters. As the pattern is stored in upper
396: * case, apply the same to the lower case equivalents.
397: */
398: if (iflag)
399: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
400: }
401:
402: /*
403: * Put pattern back to normal after pre-processing to allow for easy
404: * comparisons later.
405: */
406: if (fg->reversedSearch)
407: grep_revstr(fg->pattern, fg->patternLen);
408:
409: return (0);
410: }
411:
1.26 otto 412: /*
413: * Word boundaries using regular expressions are defined as the point
414: * of transition from a non-word char to a word char, or vice versa.
415: * This means that grep -w +a and grep -w a+ never match anything,
416: * because they lack a starting or ending transition, but grep -w a+b
417: * does match a line containing a+b.
418: */
1.22 millert 419: #define wmatch(d, l, s, e) \
1.26 otto 420: ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
421: e > s && isword(d[s]) && isword(d[e-1]))
1.22 millert 422:
1.9 millert 423: static int
1.23 millert 424: grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch)
1.6 tedu 425: {
426: int j;
427: int rtrnVal = REG_NOMATCH;
428:
1.9 millert 429: pmatch->rm_so = -1;
430: pmatch->rm_eo = -1;
431:
1.6 tedu 432: /* No point in going farther if we do not have enough data. */
433: if (dataLen < fg->patternLen)
434: return (rtrnVal);
435:
436: /* Only try once at the beginning or ending of the line. */
437: if (fg->bol || fg->eol) {
438: /* Simple text comparison. */
439: /* Verify data is >= pattern length before searching on it. */
440: if (dataLen >= fg->patternLen) {
441: /* Determine where in data to start search at. */
442: if (fg->eol)
443: j = dataLen - fg->patternLen;
444: else
445: j = 0;
446: if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
1.22 millert 447: if (grep_cmp(fg->pattern, data + j,
448: fg->patternLen) == -1) {
1.9 millert 449: pmatch->rm_so = j;
450: pmatch->rm_eo = j + fg->patternLen;
1.22 millert 451: if (!fg->wmatch || wmatch(data, dataLen,
452: pmatch->rm_so, pmatch->rm_eo))
453: rtrnVal = 0;
1.9 millert 454: }
1.6 tedu 455: }
456: } else if (fg->reversedSearch) {
457: /* Quick Search algorithm. */
1.17 millert 458: j = dataLen;
459: do {
1.6 tedu 460: if (grep_cmp(fg->pattern, data + j - fg->patternLen,
461: fg->patternLen) == -1) {
1.9 millert 462: pmatch->rm_so = j - fg->patternLen;
463: pmatch->rm_eo = j;
1.22 millert 464: if (!fg->wmatch || wmatch(data, dataLen,
465: pmatch->rm_so, pmatch->rm_eo)) {
466: rtrnVal = 0;
467: break;
468: }
1.6 tedu 469: }
1.17 millert 470: /* Shift if within bounds, otherwise, we are done. */
471: if (j == fg->patternLen)
472: break;
473: j -= fg->qsBc[data[j - fg->patternLen - 1]];
474: } while (j >= fg->patternLen);
1.6 tedu 475: } else {
476: /* Quick Search algorithm. */
477: j = 0;
478: do {
479: if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
1.9 millert 480: pmatch->rm_so = j;
481: pmatch->rm_eo = j + fg->patternLen;
1.22 millert 482: if (!fg->wmatch || wmatch(data, dataLen,
483: pmatch->rm_so, pmatch->rm_eo)) {
484: rtrnVal = 0;
485: break;
486: }
1.6 tedu 487: }
488:
489: /* Shift if within bounds, otherwise, we are done. */
490: if (j + fg->patternLen == dataLen)
491: break;
492: else
493: j += fg->qsBc[data[j + fg->patternLen]];
494: } while (j <= (dataLen - fg->patternLen));
495: }
496:
497: return (rtrnVal);
498: }
499:
500:
/*
 * malloc() wrapper that never returns NULL: when the allocation fails the
 * program exits with status 2 via err().
 */
void *
grep_malloc(size_t size)
{
	void *mem = malloc(size);

	if (mem == NULL)
		err(2, "malloc");
	return mem;
}
510:
/*
 * realloc() wrapper that never returns NULL: when the reallocation fails
 * the program exits with status 2 via err().
 */
void *
grep_realloc(void *ptr, size_t size)
{
	void *grown = realloc(ptr, size);

	if (grown == NULL)
		err(2, "realloc");
	return grown;
}
518:
519: /*
520: * Returns: i >= 0 on failure (position that it failed)
521: * -1 on success
522: */
1.18 avsm 523: static int
1.9 millert 524: grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len)
1.6 tedu 525: {
526: int i;
527:
528: for (i = 0; i < len; i++) {
1.25 millert 529: if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
530: || (iflag && pattern[i] == toupper(data[i])))
1.6 tedu 531: continue;
532: return (i);
533: }
534:
535: return (-1);
536: }
537:
/*
 * Reverse the first `len' bytes of `str' in place.  Lengths below two
 * leave the buffer untouched.
 */
static void
grep_revstr(unsigned char *str, int len)
{
	unsigned char *lo, *hi, tmp;

	if (len < 2)
		return;

	/* Swap from both ends towards the middle. */
	for (lo = str, hi = str + len - 1; lo < hi; lo++, hi--) {
		tmp = *lo;
		*lo = *hi;
		*hi = tmp;
	}
}
550:
551: void
552: printline(str_t *line, int sep)
553: {
554: int n;
1.2 deraadt 555:
1.1 deraadt 556: n = 0;
557: if (!hflag) {
558: fputs(line->file, stdout);
559: ++n;
560: }
561: if (nflag) {
562: if (n)
563: putchar(sep);
564: printf("%d", line->line_no);
565: ++n;
566: }
567: if (bflag) {
568: if (n)
569: putchar(sep);
1.21 otto 570: printf("%lld", (long long)line->off);
571: ++n;
1.1 deraadt 572: }
573: if (n)
574: putchar(sep);
575: fwrite(line->dat, line->len, 1, stdout);
576: putchar('\n');
577: }