Annotation of src/usr.bin/grep/util.c, Revision 1.51
1.51 ! mmcc 1: /* $OpenBSD: util.c,v 1.50 2015/06/25 02:04:08 uebayasi Exp $ */
1.3 deraadt 2:
1.1 deraadt 3: /*-
4: * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28:
29: #include <sys/types.h>
30: #include <sys/stat.h>
31:
32: #include <ctype.h>
33: #include <err.h>
34: #include <errno.h>
35: #include <fts.h>
36: #include <regex.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <zlib.h>
42:
43: #include "grep.h"
44:
45: /*
46: * Process a file line by line...
47: */
48:
49: static int linesqueued;
1.4 tedu 50: static int procline(str_t *l, int);
1.40 tedu 51: static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch);
1.39 tedu 52: #ifndef SMALL
1.40 tedu 53: static int grep_cmp(const char *, const char *, size_t);
1.6 tedu 54: static void grep_revstr(unsigned char *, int);
1.39 tedu 55: #endif
1.1 deraadt 56:
1.2 deraadt 57: int
1.1 deraadt 58: grep_tree(char **argv)
59: {
1.10 deraadt 60: FTS *fts;
61: FTSENT *p;
62: int c, fts_flags;
1.1 deraadt 63:
1.37 tedu 64: c = 0;
1.1 deraadt 65:
1.37 tedu 66: fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
1.1 deraadt 67:
1.11 millert 68: if (!(fts = fts_open(argv, fts_flags, NULL)))
1.14 millert 69: err(2, NULL);
1.1 deraadt 70: while ((p = fts_read(fts)) != NULL) {
71: switch (p->fts_info) {
72: case FTS_DNR:
73: break;
74: case FTS_ERR:
1.45 millert 75: file_err = 1;
76: if(!sflag)
1.48 guenther 77: warnc(p->fts_errno, "%s", p->fts_path);
1.1 deraadt 78: break;
79: case FTS_DP:
80: break;
81: default:
82: c += procfile(p->fts_path);
83: break;
84: }
85: }
1.34 otto 86: if (errno)
87: err(2, "fts_read");
1.50 uebayasi 88: fts_close(fts);
1.1 deraadt 89: return c;
90: }
91:
92: int
93: procfile(char *fn)
94: {
95: str_t ln;
96: file_t *f;
1.4 tedu 97: int c, t, z, nottext;
1.1 deraadt 98:
99: if (fn == NULL) {
100: fn = "(standard input)";
101: f = grep_fdopen(STDIN_FILENO, "r");
102: } else {
103: f = grep_open(fn, "r");
104: }
105: if (f == NULL) {
1.45 millert 106: file_err = 1;
1.1 deraadt 107: if (!sflag)
108: warn("%s", fn);
109: return 0;
110: }
1.4 tedu 111:
112: nottext = grep_bin_file(f);
113: if (nottext && binbehave == BIN_FILE_SKIP) {
1.1 deraadt 114: grep_close(f);
115: return 0;
116: }
117:
118: ln.file = fn;
119: ln.line_no = 0;
1.20 espie 120: ln.len = 0;
1.1 deraadt 121: linesqueued = 0;
1.33 jaredy 122: tail = 0;
1.1 deraadt 123: ln.off = -1;
124:
125: if (Bflag > 0)
126: initqueue();
1.27 otto 127: for (c = 0; c == 0 || !(lflag || qflag); ) {
1.1 deraadt 128: ln.off += ln.len + 1;
129: if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
130: break;
131: if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
132: --ln.len;
133: ln.line_no++;
134:
135: z = tail;
1.2 deraadt 136:
1.4 tedu 137: if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
1.1 deraadt 138: enqueue(&ln);
139: linesqueued++;
140: }
141: c += t;
142: }
143: if (Bflag > 0)
144: clearqueue();
145: grep_close(f);
146:
147: if (cflag) {
148: if (!hflag)
149: printf("%s:", ln.file);
150: printf("%u\n", c);
151: }
152: if (lflag && c != 0)
153: printf("%s\n", fn);
154: if (Lflag && c == 0)
155: printf("%s\n", fn);
1.4 tedu 156: if (c && !cflag && !lflag && !Lflag &&
1.7 tedu 157: binbehave == BIN_FILE_BIN && nottext && !qflag)
1.4 tedu 158: printf("Binary file %s matches\n", fn);
159:
1.1 deraadt 160: return c;
161: }
162:
163:
164: /*
165: * Process an individual line in a file. Return non-zero if it matches.
166: */
167:
1.47 deraadt 168: #define isword(x) (isalnum((unsigned char)x) || (x) == '_')
1.1 deraadt 169:
170: static int
1.4 tedu 171: procline(str_t *l, int nottext)
1.1 deraadt 172: {
173: regmatch_t pmatch;
1.15 dhartmei 174: int c, i, r;
1.42 aschrijv 175: regoff_t offset;
176:
177: /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
178: * into regoff_t */
179: if (l->len > SSIZE_MAX) {
180: errx(2, "Line is too big to process");
181: }
1.1 deraadt 182:
1.40 tedu 183: c = 0;
184: i = 0;
1.1 deraadt 185: if (matchall) {
1.41 tedu 186: c = 1;
1.1 deraadt 187: goto print;
188: }
1.2 deraadt 189:
1.40 tedu 190: for (i = 0; i < patterns; i++) {
191: offset = 0;
192: redo:
1.22 millert 193: if (fg_pattern[i].pattern) {
1.40 tedu 194: r = grep_search(&fg_pattern[i], l->dat + offset,
195: l->len - offset, &pmatch);
196: pmatch.rm_so += offset;
197: pmatch.rm_eo += offset;
1.22 millert 198: } else {
1.40 tedu 199: pmatch.rm_so = offset;
1.43 otto 200: pmatch.rm_eo = l->len;
1.9 millert 201: r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
1.22 millert 202: }
203: if (r == 0 && xflag) {
204: if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
205: r = REG_NOMATCH;
1.1 deraadt 206: }
1.15 dhartmei 207: if (r == 0) {
1.40 tedu 208: c = 1;
1.44 millert 209: if (oflag && pmatch.rm_so != pmatch.rm_eo)
1.40 tedu 210: goto print;
1.1 deraadt 211: break;
212: }
213: }
1.40 tedu 214: if (oflag)
215: return c;
216: print:
1.15 dhartmei 217: if (vflag)
218: c = !c;
1.2 deraadt 219:
1.4 tedu 220: if (c && binbehave == BIN_FILE_BIN && nottext)
221: return c; /* Binary file */
222:
1.1 deraadt 223: if ((tail > 0 || c) && !cflag && !qflag) {
224: if (c) {
1.5 deraadt 225: if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
226: (Aflag || Bflag))
1.1 deraadt 227: printf("--\n");
228: first = 1;
229: tail = Aflag;
230: if (Bflag > 0)
231: printqueue();
232: linesqueued = 0;
1.40 tedu 233: printline(l, ':', oflag ? &pmatch : NULL);
1.1 deraadt 234: } else {
1.40 tedu 235: printline(l, '-', oflag ? &pmatch : NULL);
1.1 deraadt 236: tail--;
237: }
238: }
1.40 tedu 239: if (oflag && !matchall) {
240: offset = pmatch.rm_eo;
241: goto redo;
242: }
1.1 deraadt 243: return c;
244: }
245:
1.39 tedu 246: #ifndef SMALL
1.31 otto 247: void
1.47 deraadt 248: fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
1.25 millert 249: {
250: int i;
251:
252: /* Initialize. */
253: fg->patternLen = strlen(pattern);
254: fg->bol = 0;
255: fg->eol = 0;
256: fg->wmatch = wflag;
257: fg->reversedSearch = 0;
258:
259: /*
260: * Make a copy and upper case it for later if in -i mode,
261: * else just copy the pointer.
262: */
263: if (iflag) {
264: fg->pattern = grep_malloc(fg->patternLen + 1);
265: for (i = 0; i < fg->patternLen; i++)
266: fg->pattern[i] = toupper(pattern[i]);
267: fg->pattern[fg->patternLen] = '\0';
268: } else
1.28 deraadt 269: fg->pattern = (unsigned char *)pattern; /* really const */
1.25 millert 270:
271: /* Preprocess pattern. */
272: for (i = 0; i <= UCHAR_MAX; i++)
273: fg->qsBc[i] = fg->patternLen;
274: for (i = 1; i < fg->patternLen; i++) {
275: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
276: /*
277: * If case is ignored, make the jump apply to both upper and
278: * lower cased characters. As the pattern is stored in upper
279: * case, apply the same to the lower case equivalents.
280: */
281: if (iflag)
282: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
283: }
284: }
1.39 tedu 285: #endif
1.25 millert 286:
287: /*
288: * Returns: -1 on failure, 0 on success
289: */
290: int
1.6 tedu 291: fastcomp(fastgrep_t *fg, const char *pattern)
292: {
1.39 tedu 293: #ifdef SMALL
294: return -1;
295: #else
1.6 tedu 296: int i;
297: int bol = 0;
298: int eol = 0;
299: int shiftPatternLen;
300: int hasDot = 0;
301: int firstHalfDot = -1;
302: int firstLastHalfDot = -1;
303: int lastHalfDot = 0;
304:
305: /* Initialize. */
1.28 deraadt 306: fg->patternLen = strlen(pattern);
1.6 tedu 307: fg->bol = 0;
308: fg->eol = 0;
1.22 millert 309: fg->wmatch = 0;
1.6 tedu 310: fg->reversedSearch = 0;
311:
312: /* Remove end-of-line character ('$'). */
1.38 eric 313: if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
1.6 tedu 314: eol++;
315: fg->eol = 1;
316: fg->patternLen--;
317: }
318:
319: /* Remove beginning-of-line character ('^'). */
320: if (pattern[0] == '^') {
321: bol++;
322: fg->bol = 1;
323: fg->patternLen--;
324: }
325:
1.22 millert 326: /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
1.30 otto 327: if (wflag) {
328: /* basic re's use \( \), extended re's ( ) */
329: int extra = Eflag ? 1 : 2;
330: fg->patternLen -= 14 + 2 * extra;
331: fg->wmatch = 7 + extra;
332: } else if (fg->patternLen >= 14 &&
1.22 millert 333: strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
1.24 millert 334: strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
1.22 millert 335: fg->patternLen -= 14;
336: fg->wmatch = 7;
337: }
338:
1.6 tedu 339: /*
1.22 millert 340: * Copy pattern minus '^' and '$' characters as well as word
341: * match character classes at the beginning and ending of the
342: * string respectively.
1.6 tedu 343: */
1.22 millert 344: fg->pattern = grep_malloc(fg->patternLen + 1);
345: memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
346: fg->pattern[fg->patternLen] = '\0';
1.6 tedu 347:
348: /* Look for ways to cheat...er...avoid the full regex engine. */
349: for (i = 0; i < fg->patternLen; i++)
350: {
1.46 tedu 351: switch (fg->pattern[i]) {
352: case '.':
1.6 tedu 353: hasDot = i;
354: if (i < fg->patternLen / 2) {
1.19 otto 355: if (firstHalfDot < 0)
1.6 tedu 356: /* Closest dot to the beginning */
357: firstHalfDot = i;
358: } else {
359: /* Closest dot to the end of the pattern. */
360: lastHalfDot = i;
361: if (firstLastHalfDot < 0)
362: firstLastHalfDot = i;
363: }
1.46 tedu 364: break;
365: case '(': case ')':
366: case '{': case '}':
367: /* Special in BRE if preceded by '\\' */
368: case '?':
369: case '+':
370: case '|':
371: /* Not special in BRE. */
372: if (!Eflag)
373: goto nonspecial;
374: case '\\':
375: case '*':
376: case '[': case ']':
1.6 tedu 377: /* Free memory and let others know this is empty. */
378: free(fg->pattern);
379: fg->pattern = NULL;
380: return (-1);
1.46 tedu 381: default:
382: nonspecial:
383: if (iflag)
384: fg->pattern[i] = toupper(fg->pattern[i]);
385: break;
1.6 tedu 386: }
387: }
388:
389: /*
390: * Determine if a reverse search would be faster based on the placement
391: * of the dots.
392: */
393: if ((!(lflag || cflag)) && ((!(bol || eol)) &&
394: ((lastHalfDot) && ((firstHalfDot < 0) ||
395: ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
396: fg->reversedSearch = 1;
397: hasDot = fg->patternLen - (firstHalfDot < 0 ?
398: firstLastHalfDot : firstHalfDot) - 1;
399: grep_revstr(fg->pattern, fg->patternLen);
400: }
401:
402: /*
403: * Normal Quick Search would require a shift based on the position the
404: * next character after the comparison is within the pattern. With
405: * wildcards, the position of the last dot effects the maximum shift
406: * distance.
407: * The closer to the end the wild card is the slower the search. A
1.10 deraadt 408: * reverse version of this algorithm would be useful for wildcards near
1.6 tedu 409: * the end of the string.
410: *
411: * Examples:
412: * Pattern Max shift
413: * ------- ---------
414: * this 5
415: * .his 4
416: * t.is 3
417: * th.s 2
418: * thi. 1
419: */
420:
421: /* Adjust the shift based on location of the last dot ('.'). */
422: shiftPatternLen = fg->patternLen - hasDot;
423:
424: /* Preprocess pattern. */
425: for (i = 0; i <= UCHAR_MAX; i++)
426: fg->qsBc[i] = shiftPatternLen;
427: for (i = hasDot + 1; i < fg->patternLen; i++) {
428: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
429: /*
430: * If case is ignored, make the jump apply to both upper and
431: * lower cased characters. As the pattern is stored in upper
432: * case, apply the same to the lower case equivalents.
433: */
434: if (iflag)
435: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
436: }
437:
438: /*
439: * Put pattern back to normal after pre-processing to allow for easy
440: * comparisons later.
441: */
442: if (fg->reversedSearch)
443: grep_revstr(fg->pattern, fg->patternLen);
444:
445: return (0);
1.39 tedu 446: #endif
1.6 tedu 447: }
448:
1.26 otto 449: /*
450: * Word boundaries using regular expressions are defined as the point
451: * of transition from a non-word char to a word char, or vice versa.
452: * This means that grep -w +a and grep -w a+ never match anything,
453: * because they lack a starting or ending transition, but grep -w a+b
454: * does match a line containing a+b.
455: */
1.22 millert 456: #define wmatch(d, l, s, e) \
1.26 otto 457: ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
458: e > s && isword(d[s]) && isword(d[e-1]))
1.22 millert 459:
1.9 millert 460: static int
1.40 tedu 461: grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch)
1.6 tedu 462: {
1.39 tedu 463: #ifdef SMALL
464: return 0;
465: #else
1.42 aschrijv 466: regoff_t j;
1.6 tedu 467: int rtrnVal = REG_NOMATCH;
468:
1.9 millert 469: pmatch->rm_so = -1;
470: pmatch->rm_eo = -1;
471:
1.6 tedu 472: /* No point in going farther if we do not have enough data. */
473: if (dataLen < fg->patternLen)
474: return (rtrnVal);
475:
476: /* Only try once at the beginning or ending of the line. */
477: if (fg->bol || fg->eol) {
478: /* Simple text comparison. */
479: /* Verify data is >= pattern length before searching on it. */
480: if (dataLen >= fg->patternLen) {
481: /* Determine where in data to start search at. */
482: if (fg->eol)
483: j = dataLen - fg->patternLen;
484: else
485: j = 0;
486: if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
1.22 millert 487: if (grep_cmp(fg->pattern, data + j,
488: fg->patternLen) == -1) {
1.9 millert 489: pmatch->rm_so = j;
490: pmatch->rm_eo = j + fg->patternLen;
1.22 millert 491: if (!fg->wmatch || wmatch(data, dataLen,
492: pmatch->rm_so, pmatch->rm_eo))
493: rtrnVal = 0;
1.9 millert 494: }
1.6 tedu 495: }
496: } else if (fg->reversedSearch) {
497: /* Quick Search algorithm. */
1.17 millert 498: j = dataLen;
499: do {
1.6 tedu 500: if (grep_cmp(fg->pattern, data + j - fg->patternLen,
501: fg->patternLen) == -1) {
1.9 millert 502: pmatch->rm_so = j - fg->patternLen;
503: pmatch->rm_eo = j;
1.22 millert 504: if (!fg->wmatch || wmatch(data, dataLen,
505: pmatch->rm_so, pmatch->rm_eo)) {
506: rtrnVal = 0;
507: break;
508: }
1.6 tedu 509: }
1.17 millert 510: /* Shift if within bounds, otherwise, we are done. */
511: if (j == fg->patternLen)
512: break;
1.40 tedu 513: j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
1.17 millert 514: } while (j >= fg->patternLen);
1.6 tedu 515: } else {
516: /* Quick Search algorithm. */
517: j = 0;
518: do {
519: if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
1.9 millert 520: pmatch->rm_so = j;
521: pmatch->rm_eo = j + fg->patternLen;
1.32 jaredy 522: if (fg->patternLen == 0 || !fg->wmatch ||
523: wmatch(data, dataLen, pmatch->rm_so,
524: pmatch->rm_eo)) {
1.22 millert 525: rtrnVal = 0;
526: break;
527: }
1.6 tedu 528: }
529:
530: /* Shift if within bounds, otherwise, we are done. */
531: if (j + fg->patternLen == dataLen)
532: break;
533: else
1.40 tedu 534: j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
1.6 tedu 535: } while (j <= (dataLen - fg->patternLen));
536: }
537:
538: return (rtrnVal);
1.39 tedu 539: #endif
1.6 tedu 540: }
541:
542:
1.1 deraadt 543: void *
544: grep_malloc(size_t size)
545: {
1.10 deraadt 546: void *ptr;
1.1 deraadt 547:
548: if ((ptr = malloc(size)) == NULL)
1.14 millert 549: err(2, "malloc");
1.35 deraadt 550: return ptr;
551: }
552:
553: void *
554: grep_calloc(size_t nmemb, size_t size)
555: {
556: void *ptr;
557:
558: if ((ptr = calloc(nmemb, size)) == NULL)
559: err(2, "calloc");
1.1 deraadt 560: return ptr;
561: }
562:
563: void *
564: grep_realloc(void *ptr, size_t size)
565: {
566: if ((ptr = realloc(ptr, size)) == NULL)
1.14 millert 567: err(2, "realloc");
1.49 deraadt 568: return ptr;
569: }
570:
571: void *
572: grep_reallocarray(void *ptr, size_t nmemb, size_t size)
573: {
574: if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
575: err(2, "reallocarray");
1.6 tedu 576: return ptr;
577: }
578:
1.39 tedu 579: #ifndef SMALL
1.6 tedu 580: /*
581: * Returns: i >= 0 on failure (position that it failed)
582: * -1 on success
583: */
1.18 avsm 584: static int
1.40 tedu 585: grep_cmp(const char *pattern, const char *data, size_t len)
1.6 tedu 586: {
587: int i;
588:
589: for (i = 0; i < len; i++) {
1.25 millert 590: if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
591: || (iflag && pattern[i] == toupper(data[i])))
1.6 tedu 592: continue;
593: return (i);
594: }
595:
596: return (-1);
597: }
598:
599: static void
600: grep_revstr(unsigned char *str, int len)
601: {
602: int i;
603: char c;
604:
605: for (i = 0; i < len / 2; i++) {
606: c = str[i];
607: str[i] = str[len - i - 1];
608: str[len - i - 1] = c;
609: }
1.1 deraadt 610: }
1.39 tedu 611: #endif
1.1 deraadt 612:
613: void
1.40 tedu 614: printline(str_t *line, int sep, regmatch_t *pmatch)
1.1 deraadt 615: {
616: int n;
1.2 deraadt 617:
1.1 deraadt 618: n = 0;
619: if (!hflag) {
620: fputs(line->file, stdout);
621: ++n;
622: }
623: if (nflag) {
624: if (n)
625: putchar(sep);
1.51 ! mmcc 626: printf("%lld", (long long)line->line_no);
1.1 deraadt 627: ++n;
628: }
629: if (bflag) {
630: if (n)
631: putchar(sep);
1.21 otto 632: printf("%lld", (long long)line->off);
633: ++n;
1.1 deraadt 634: }
635: if (n)
636: putchar(sep);
1.40 tedu 637: if (pmatch)
638: fwrite(line->dat + pmatch->rm_so,
639: pmatch->rm_eo - pmatch->rm_so, 1, stdout);
640: else
641: fwrite(line->dat, line->len, 1, stdout);
1.1 deraadt 642: putchar('\n');
643: }