Annotation of src/usr.bin/grep/util.c, Revision 1.42
1.42 ! aschrijv 1: /* $OpenBSD: util.c,v 1.41 2011/07/11 20:43:21 tedu Exp $ */
1.3 deraadt 2:
1.1 deraadt 3: /*-
4: * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28:
29: #include <sys/types.h>
30: #include <sys/stat.h>
31:
32: #include <ctype.h>
33: #include <err.h>
34: #include <errno.h>
35: #include <fts.h>
36: #include <regex.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <zlib.h>
42:
43: #include "grep.h"
44:
45: /*
46: * Process a file line by line...
47: */
48:
49: static int linesqueued;
1.4 tedu 50: static int procline(str_t *l, int);
1.40 tedu 51: static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch);
1.39 tedu 52: #ifndef SMALL
1.40 tedu 53: static int grep_cmp(const char *, const char *, size_t);
1.6 tedu 54: static void grep_revstr(unsigned char *, int);
1.39 tedu 55: #endif
1.1 deraadt 56:
1.2 deraadt 57: int
1.1 deraadt 58: grep_tree(char **argv)
59: {
1.10 deraadt 60: FTS *fts;
61: FTSENT *p;
62: int c, fts_flags;
1.1 deraadt 63:
1.37 tedu 64: c = 0;
1.1 deraadt 65:
1.37 tedu 66: fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
1.1 deraadt 67:
1.11 millert 68: if (!(fts = fts_open(argv, fts_flags, NULL)))
1.14 millert 69: err(2, NULL);
1.1 deraadt 70: while ((p = fts_read(fts)) != NULL) {
71: switch (p->fts_info) {
72: case FTS_DNR:
73: break;
74: case FTS_ERR:
1.14 millert 75: errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
1.1 deraadt 76: break;
77: case FTS_DP:
78: break;
79: default:
80: c += procfile(p->fts_path);
81: break;
82: }
83: }
1.34 otto 84: if (errno)
85: err(2, "fts_read");
1.1 deraadt 86:
87: return c;
88: }
89:
90: int
91: procfile(char *fn)
92: {
93: str_t ln;
94: file_t *f;
1.4 tedu 95: int c, t, z, nottext;
1.1 deraadt 96:
97: if (fn == NULL) {
98: fn = "(standard input)";
99: f = grep_fdopen(STDIN_FILENO, "r");
100: } else {
101: f = grep_open(fn, "r");
102: }
103: if (f == NULL) {
104: if (!sflag)
105: warn("%s", fn);
106: return 0;
107: }
1.4 tedu 108:
109: nottext = grep_bin_file(f);
110: if (nottext && binbehave == BIN_FILE_SKIP) {
1.1 deraadt 111: grep_close(f);
112: return 0;
113: }
114:
115: ln.file = fn;
116: ln.line_no = 0;
1.20 espie 117: ln.len = 0;
1.1 deraadt 118: linesqueued = 0;
1.33 jaredy 119: tail = 0;
1.1 deraadt 120: ln.off = -1;
121:
122: if (Bflag > 0)
123: initqueue();
1.27 otto 124: for (c = 0; c == 0 || !(lflag || qflag); ) {
1.1 deraadt 125: ln.off += ln.len + 1;
126: if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
127: break;
128: if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
129: --ln.len;
130: ln.line_no++;
131:
132: z = tail;
1.2 deraadt 133:
1.4 tedu 134: if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
1.1 deraadt 135: enqueue(&ln);
136: linesqueued++;
137: }
138: c += t;
139: }
140: if (Bflag > 0)
141: clearqueue();
142: grep_close(f);
143:
144: if (cflag) {
145: if (!hflag)
146: printf("%s:", ln.file);
147: printf("%u\n", c);
148: }
149: if (lflag && c != 0)
150: printf("%s\n", fn);
151: if (Lflag && c == 0)
152: printf("%s\n", fn);
1.4 tedu 153: if (c && !cflag && !lflag && !Lflag &&
1.7 tedu 154: binbehave == BIN_FILE_BIN && nottext && !qflag)
1.4 tedu 155: printf("Binary file %s matches\n", fn);
156:
1.1 deraadt 157: return c;
158: }
159:
160:
161: /*
162: * Process an individual line in a file. Return non-zero if it matches.
163: */
164:
165: #define isword(x) (isalnum(x) || (x) == '_')
166:
167: static int
1.4 tedu 168: procline(str_t *l, int nottext)
1.1 deraadt 169: {
170: regmatch_t pmatch;
1.15 dhartmei 171: int c, i, r;
1.42 ! aschrijv 172: regoff_t offset;
! 173:
! 174: /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
! 175: * into regoff_t */
! 176: if (l->len > SSIZE_MAX) {
! 177: errx(2, "Line is too big to process");
! 178: }
1.1 deraadt 179:
1.40 tedu 180: c = 0;
181: i = 0;
1.1 deraadt 182: if (matchall) {
1.41 tedu 183: c = 1;
1.1 deraadt 184: goto print;
185: }
1.2 deraadt 186:
1.40 tedu 187: for (i = 0; i < patterns; i++) {
188: offset = 0;
189: redo:
1.22 millert 190: if (fg_pattern[i].pattern) {
1.40 tedu 191: r = grep_search(&fg_pattern[i], l->dat + offset,
192: l->len - offset, &pmatch);
193: pmatch.rm_so += offset;
194: pmatch.rm_eo += offset;
1.22 millert 195: } else {
1.40 tedu 196: pmatch.rm_so = offset;
197: pmatch.rm_eo = l->len - offset;
1.9 millert 198: r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
1.22 millert 199: }
200: if (r == 0 && xflag) {
201: if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
202: r = REG_NOMATCH;
1.1 deraadt 203: }
1.15 dhartmei 204: if (r == 0) {
1.40 tedu 205: c = 1;
206: if (oflag)
207: goto print;
1.1 deraadt 208: break;
209: }
210: }
1.40 tedu 211: if (oflag)
212: return c;
213: print:
1.15 dhartmei 214: if (vflag)
215: c = !c;
1.2 deraadt 216:
1.4 tedu 217: if (c && binbehave == BIN_FILE_BIN && nottext)
218: return c; /* Binary file */
219:
1.1 deraadt 220: if ((tail > 0 || c) && !cflag && !qflag) {
221: if (c) {
1.5 deraadt 222: if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
223: (Aflag || Bflag))
1.1 deraadt 224: printf("--\n");
225: first = 1;
226: tail = Aflag;
227: if (Bflag > 0)
228: printqueue();
229: linesqueued = 0;
1.40 tedu 230: printline(l, ':', oflag ? &pmatch : NULL);
1.1 deraadt 231: } else {
1.40 tedu 232: printline(l, '-', oflag ? &pmatch : NULL);
1.1 deraadt 233: tail--;
234: }
235: }
1.40 tedu 236: if (oflag && !matchall) {
237: offset = pmatch.rm_eo;
238: goto redo;
239: }
1.1 deraadt 240: return c;
241: }
242:
1.39 tedu 243: #ifndef SMALL
1.31 otto 244: void
1.25 millert 245: fgrepcomp(fastgrep_t *fg, const char *pattern)
246: {
247: int i;
248:
249: /* Initialize. */
250: fg->patternLen = strlen(pattern);
251: fg->bol = 0;
252: fg->eol = 0;
253: fg->wmatch = wflag;
254: fg->reversedSearch = 0;
255:
256: /*
257: * Make a copy and upper case it for later if in -i mode,
258: * else just copy the pointer.
259: */
260: if (iflag) {
261: fg->pattern = grep_malloc(fg->patternLen + 1);
262: for (i = 0; i < fg->patternLen; i++)
263: fg->pattern[i] = toupper(pattern[i]);
264: fg->pattern[fg->patternLen] = '\0';
265: } else
1.28 deraadt 266: fg->pattern = (unsigned char *)pattern; /* really const */
1.25 millert 267:
268: /* Preprocess pattern. */
269: for (i = 0; i <= UCHAR_MAX; i++)
270: fg->qsBc[i] = fg->patternLen;
271: for (i = 1; i < fg->patternLen; i++) {
272: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
273: /*
274: * If case is ignored, make the jump apply to both upper and
275: * lower cased characters. As the pattern is stored in upper
276: * case, apply the same to the lower case equivalents.
277: */
278: if (iflag)
279: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
280: }
281: }
1.39 tedu 282: #endif
1.25 millert 283:
284: /*
285: * Returns: -1 on failure, 0 on success
286: */
287: int
1.6 tedu 288: fastcomp(fastgrep_t *fg, const char *pattern)
289: {
1.39 tedu 290: #ifdef SMALL
291: return -1;
292: #else
1.6 tedu 293: int i;
294: int bol = 0;
295: int eol = 0;
296: int shiftPatternLen;
297: int hasDot = 0;
298: int firstHalfDot = -1;
299: int firstLastHalfDot = -1;
300: int lastHalfDot = 0;
301:
302: /* Initialize. */
1.28 deraadt 303: fg->patternLen = strlen(pattern);
1.6 tedu 304: fg->bol = 0;
305: fg->eol = 0;
1.22 millert 306: fg->wmatch = 0;
1.6 tedu 307: fg->reversedSearch = 0;
308:
309: /* Remove end-of-line character ('$'). */
1.38 eric 310: if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
1.6 tedu 311: eol++;
312: fg->eol = 1;
313: fg->patternLen--;
314: }
315:
316: /* Remove beginning-of-line character ('^'). */
317: if (pattern[0] == '^') {
318: bol++;
319: fg->bol = 1;
320: fg->patternLen--;
321: }
322:
1.22 millert 323: /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
1.30 otto 324: if (wflag) {
325: /* basic re's use \( \), extended re's ( ) */
326: int extra = Eflag ? 1 : 2;
327: fg->patternLen -= 14 + 2 * extra;
328: fg->wmatch = 7 + extra;
329: } else if (fg->patternLen >= 14 &&
1.22 millert 330: strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
1.24 millert 331: strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
1.22 millert 332: fg->patternLen -= 14;
333: fg->wmatch = 7;
334: }
335:
1.6 tedu 336: /*
1.22 millert 337: * Copy pattern minus '^' and '$' characters as well as word
338: * match character classes at the beginning and ending of the
339: * string respectively.
1.6 tedu 340: */
1.22 millert 341: fg->pattern = grep_malloc(fg->patternLen + 1);
342: memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
343: fg->pattern[fg->patternLen] = '\0';
1.6 tedu 344:
345: /* Look for ways to cheat...er...avoid the full regex engine. */
346: for (i = 0; i < fg->patternLen; i++)
347: {
348: /* Can still cheat? */
349: if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
350: (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
351: (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
352: (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
353: /* As long as it is good, upper case it for later. */
354: if (iflag)
355: fg->pattern[i] = toupper(fg->pattern[i]);
356: } else if (fg->pattern[i] == '.') {
357: hasDot = i;
358: if (i < fg->patternLen / 2) {
1.19 otto 359: if (firstHalfDot < 0)
1.6 tedu 360: /* Closest dot to the beginning */
361: firstHalfDot = i;
362: } else {
363: /* Closest dot to the end of the pattern. */
364: lastHalfDot = i;
365: if (firstLastHalfDot < 0)
366: firstLastHalfDot = i;
367: }
368: } else {
369: /* Free memory and let others know this is empty. */
370: free(fg->pattern);
371: fg->pattern = NULL;
372: return (-1);
373: }
374: }
375:
376: /*
377: * Determine if a reverse search would be faster based on the placement
378: * of the dots.
379: */
380: if ((!(lflag || cflag)) && ((!(bol || eol)) &&
381: ((lastHalfDot) && ((firstHalfDot < 0) ||
382: ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
383: fg->reversedSearch = 1;
384: hasDot = fg->patternLen - (firstHalfDot < 0 ?
385: firstLastHalfDot : firstHalfDot) - 1;
386: grep_revstr(fg->pattern, fg->patternLen);
387: }
388:
389: /*
390: * Normal Quick Search would require a shift based on the position the
391: * next character after the comparison is within the pattern. With
392: * wildcards, the position of the last dot effects the maximum shift
393: * distance.
394: * The closer to the end the wild card is the slower the search. A
1.10 deraadt 395: * reverse version of this algorithm would be useful for wildcards near
1.6 tedu 396: * the end of the string.
397: *
398: * Examples:
399: * Pattern Max shift
400: * ------- ---------
401: * this 5
402: * .his 4
403: * t.is 3
404: * th.s 2
405: * thi. 1
406: */
407:
408: /* Adjust the shift based on location of the last dot ('.'). */
409: shiftPatternLen = fg->patternLen - hasDot;
410:
411: /* Preprocess pattern. */
412: for (i = 0; i <= UCHAR_MAX; i++)
413: fg->qsBc[i] = shiftPatternLen;
414: for (i = hasDot + 1; i < fg->patternLen; i++) {
415: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
416: /*
417: * If case is ignored, make the jump apply to both upper and
418: * lower cased characters. As the pattern is stored in upper
419: * case, apply the same to the lower case equivalents.
420: */
421: if (iflag)
422: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
423: }
424:
425: /*
426: * Put pattern back to normal after pre-processing to allow for easy
427: * comparisons later.
428: */
429: if (fg->reversedSearch)
430: grep_revstr(fg->pattern, fg->patternLen);
431:
432: return (0);
1.39 tedu 433: #endif
1.6 tedu 434: }
435:
1.26 otto 436: /*
437: * Word boundaries using regular expressions are defined as the point
438: * of transition from a non-word char to a word char, or vice versa.
439: * This means that grep -w +a and grep -w a+ never match anything,
440: * because they lack a starting or ending transition, but grep -w a+b
441: * does match a line containing a+b.
442: */
1.22 millert 443: #define wmatch(d, l, s, e) \
1.26 otto 444: ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
445: e > s && isword(d[s]) && isword(d[e-1]))
1.22 millert 446:
1.9 millert 447: static int
1.40 tedu 448: grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch)
1.6 tedu 449: {
1.39 tedu 450: #ifdef SMALL
451: return 0;
452: #else
1.42 ! aschrijv 453: regoff_t j;
1.6 tedu 454: int rtrnVal = REG_NOMATCH;
455:
1.9 millert 456: pmatch->rm_so = -1;
457: pmatch->rm_eo = -1;
458:
1.6 tedu 459: /* No point in going farther if we do not have enough data. */
460: if (dataLen < fg->patternLen)
461: return (rtrnVal);
462:
463: /* Only try once at the beginning or ending of the line. */
464: if (fg->bol || fg->eol) {
465: /* Simple text comparison. */
466: /* Verify data is >= pattern length before searching on it. */
467: if (dataLen >= fg->patternLen) {
468: /* Determine where in data to start search at. */
469: if (fg->eol)
470: j = dataLen - fg->patternLen;
471: else
472: j = 0;
473: if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
1.22 millert 474: if (grep_cmp(fg->pattern, data + j,
475: fg->patternLen) == -1) {
1.9 millert 476: pmatch->rm_so = j;
477: pmatch->rm_eo = j + fg->patternLen;
1.22 millert 478: if (!fg->wmatch || wmatch(data, dataLen,
479: pmatch->rm_so, pmatch->rm_eo))
480: rtrnVal = 0;
1.9 millert 481: }
1.6 tedu 482: }
483: } else if (fg->reversedSearch) {
484: /* Quick Search algorithm. */
1.17 millert 485: j = dataLen;
486: do {
1.6 tedu 487: if (grep_cmp(fg->pattern, data + j - fg->patternLen,
488: fg->patternLen) == -1) {
1.9 millert 489: pmatch->rm_so = j - fg->patternLen;
490: pmatch->rm_eo = j;
1.22 millert 491: if (!fg->wmatch || wmatch(data, dataLen,
492: pmatch->rm_so, pmatch->rm_eo)) {
493: rtrnVal = 0;
494: break;
495: }
1.6 tedu 496: }
1.17 millert 497: /* Shift if within bounds, otherwise, we are done. */
498: if (j == fg->patternLen)
499: break;
1.40 tedu 500: j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
1.17 millert 501: } while (j >= fg->patternLen);
1.6 tedu 502: } else {
503: /* Quick Search algorithm. */
504: j = 0;
505: do {
506: if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
1.9 millert 507: pmatch->rm_so = j;
508: pmatch->rm_eo = j + fg->patternLen;
1.32 jaredy 509: if (fg->patternLen == 0 || !fg->wmatch ||
510: wmatch(data, dataLen, pmatch->rm_so,
511: pmatch->rm_eo)) {
1.22 millert 512: rtrnVal = 0;
513: break;
514: }
1.6 tedu 515: }
516:
517: /* Shift if within bounds, otherwise, we are done. */
518: if (j + fg->patternLen == dataLen)
519: break;
520: else
1.40 tedu 521: j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
1.6 tedu 522: } while (j <= (dataLen - fg->patternLen));
523: }
524:
525: return (rtrnVal);
1.39 tedu 526: #endif
1.6 tedu 527: }
528:
529:
1.1 deraadt 530: void *
531: grep_malloc(size_t size)
532: {
1.10 deraadt 533: void *ptr;
1.1 deraadt 534:
535: if ((ptr = malloc(size)) == NULL)
1.14 millert 536: err(2, "malloc");
1.35 deraadt 537: return ptr;
538: }
539:
540: void *
541: grep_calloc(size_t nmemb, size_t size)
542: {
543: void *ptr;
544:
545: if ((ptr = calloc(nmemb, size)) == NULL)
546: err(2, "calloc");
1.1 deraadt 547: return ptr;
548: }
549:
550: void *
551: grep_realloc(void *ptr, size_t size)
552: {
553: if ((ptr = realloc(ptr, size)) == NULL)
1.14 millert 554: err(2, "realloc");
1.6 tedu 555: return ptr;
556: }
557:
1.39 tedu 558: #ifndef SMALL
1.6 tedu 559: /*
560: * Returns: i >= 0 on failure (position that it failed)
561: * -1 on success
562: */
1.18 avsm 563: static int
1.40 tedu 564: grep_cmp(const char *pattern, const char *data, size_t len)
1.6 tedu 565: {
566: int i;
567:
568: for (i = 0; i < len; i++) {
1.25 millert 569: if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
570: || (iflag && pattern[i] == toupper(data[i])))
1.6 tedu 571: continue;
572: return (i);
573: }
574:
575: return (-1);
576: }
577:
578: static void
579: grep_revstr(unsigned char *str, int len)
580: {
581: int i;
582: char c;
583:
584: for (i = 0; i < len / 2; i++) {
585: c = str[i];
586: str[i] = str[len - i - 1];
587: str[len - i - 1] = c;
588: }
1.1 deraadt 589: }
1.39 tedu 590: #endif
1.1 deraadt 591:
592: void
1.40 tedu 593: printline(str_t *line, int sep, regmatch_t *pmatch)
1.1 deraadt 594: {
595: int n;
1.2 deraadt 596:
1.1 deraadt 597: n = 0;
598: if (!hflag) {
599: fputs(line->file, stdout);
600: ++n;
601: }
602: if (nflag) {
603: if (n)
604: putchar(sep);
605: printf("%d", line->line_no);
606: ++n;
607: }
608: if (bflag) {
609: if (n)
610: putchar(sep);
1.21 otto 611: printf("%lld", (long long)line->off);
612: ++n;
1.1 deraadt 613: }
614: if (n)
615: putchar(sep);
1.40 tedu 616: if (pmatch)
617: fwrite(line->dat + pmatch->rm_so,
618: pmatch->rm_eo - pmatch->rm_so, 1, stdout);
619: else
620: fwrite(line->dat, line->len, 1, stdout);
1.1 deraadt 621: putchar('\n');
622: }