Annotation of src/usr.bin/grep/util.c, Revision 1.55
1.55 ! otto 1: /* $OpenBSD: util.c,v 1.54 2015/12/22 19:35:50 mmcc Exp $ */
1.3 deraadt 2:
1.1 deraadt 3: /*-
4: * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28:
29: #include <sys/types.h>
30: #include <sys/stat.h>
31:
32: #include <ctype.h>
33: #include <err.h>
34: #include <errno.h>
35: #include <fts.h>
36: #include <regex.h>
1.53 millert 37: #include <stdbool.h>
1.1 deraadt 38: #include <stdio.h>
39: #include <stdlib.h>
40: #include <string.h>
41: #include <unistd.h>
42: #include <zlib.h>
43:
44: #include "grep.h"
45:
46: /*
47: * Process a file line by line...
48: */
49:
/*
 * linesqueued counts the lines that procfile() has pushed onto the
 * before-context queue since it was last reset; it is reset to 0 when a
 * new file is started and whenever procline() prints the queue.
 */
50: static int linesqueued;
/* Forward declarations of the file-local helpers defined further down. */
1.4 tedu 51: static int procline(str_t *l, int);
1.40 tedu 52: static int grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch);
/* The two helpers below are only compiled when SMALL is not defined. */
1.39 tedu 53: #ifndef SMALL
1.53 millert 54: static bool grep_cmp(const char *, const char *, size_t);
1.6 tedu 55: static void grep_revstr(unsigned char *, int);
1.39 tedu 56: #endif
1.1 deraadt 57:
1.2 deraadt 58: int
1.1 deraadt 59: grep_tree(char **argv)
60: {
1.10 deraadt 61: FTS *fts;
62: FTSENT *p;
63: int c, fts_flags;
1.1 deraadt 64:
1.37 tedu 65: c = 0;
1.1 deraadt 66:
1.37 tedu 67: fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
1.1 deraadt 68:
1.11 millert 69: if (!(fts = fts_open(argv, fts_flags, NULL)))
1.14 millert 70: err(2, NULL);
1.1 deraadt 71: while ((p = fts_read(fts)) != NULL) {
72: switch (p->fts_info) {
73: case FTS_DNR:
74: break;
75: case FTS_ERR:
1.45 millert 76: file_err = 1;
77: if(!sflag)
1.48 guenther 78: warnc(p->fts_errno, "%s", p->fts_path);
1.1 deraadt 79: break;
80: case FTS_DP:
81: break;
82: default:
83: c += procfile(p->fts_path);
84: break;
85: }
86: }
1.34 otto 87: if (errno)
88: err(2, "fts_read");
1.50 uebayasi 89: fts_close(fts);
1.1 deraadt 90: return c;
91: }
92:
93: int
94: procfile(char *fn)
95: {
96: str_t ln;
97: file_t *f;
1.4 tedu 98: int c, t, z, nottext;
1.1 deraadt 99:
100: if (fn == NULL) {
101: fn = "(standard input)";
102: f = grep_fdopen(STDIN_FILENO, "r");
103: } else {
104: f = grep_open(fn, "r");
105: }
106: if (f == NULL) {
1.45 millert 107: file_err = 1;
1.1 deraadt 108: if (!sflag)
109: warn("%s", fn);
110: return 0;
111: }
1.4 tedu 112:
113: nottext = grep_bin_file(f);
114: if (nottext && binbehave == BIN_FILE_SKIP) {
1.1 deraadt 115: grep_close(f);
116: return 0;
117: }
118:
119: ln.file = fn;
120: ln.line_no = 0;
1.20 espie 121: ln.len = 0;
1.1 deraadt 122: linesqueued = 0;
1.33 jaredy 123: tail = 0;
1.1 deraadt 124: ln.off = -1;
125:
126: if (Bflag > 0)
127: initqueue();
1.27 otto 128: for (c = 0; c == 0 || !(lflag || qflag); ) {
1.1 deraadt 129: ln.off += ln.len + 1;
130: if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
131: break;
132: if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
133: --ln.len;
134: ln.line_no++;
135:
136: z = tail;
1.2 deraadt 137:
1.4 tedu 138: if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
1.1 deraadt 139: enqueue(&ln);
140: linesqueued++;
141: }
142: c += t;
143: }
144: if (Bflag > 0)
145: clearqueue();
146: grep_close(f);
147:
148: if (cflag) {
149: if (!hflag)
150: printf("%s:", ln.file);
151: printf("%u\n", c);
152: }
153: if (lflag && c != 0)
154: printf("%s\n", fn);
155: if (Lflag && c == 0)
156: printf("%s\n", fn);
1.4 tedu 157: if (c && !cflag && !lflag && !Lflag &&
1.7 tedu 158: binbehave == BIN_FILE_BIN && nottext && !qflag)
1.4 tedu 159: printf("Binary file %s matches\n", fn);
160:
1.1 deraadt 161: return c;
162: }
163:
164:
165: /*
166: * Process an individual line in a file. Return non-zero if it matches.
167: */
168:
/*
 * True if x is a word character: alphanumeric per isalnum() or '_'.
 * x is evaluated more than once, so it must not have side effects.
 */
#define isword(x)	(isalnum((unsigned char)(x)) || (x) == '_')
1.1 deraadt 170:
171: static int
1.4 tedu 172: procline(str_t *l, int nottext)
1.1 deraadt 173: {
174: regmatch_t pmatch;
1.15 dhartmei 175: int c, i, r;
1.42 aschrijv 176: regoff_t offset;
177:
178: /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
179: * into regoff_t */
180: if (l->len > SSIZE_MAX) {
181: errx(2, "Line is too big to process");
182: }
1.1 deraadt 183:
1.40 tedu 184: c = 0;
185: i = 0;
1.1 deraadt 186: if (matchall) {
1.41 tedu 187: c = 1;
1.1 deraadt 188: goto print;
189: }
1.2 deraadt 190:
1.40 tedu 191: for (i = 0; i < patterns; i++) {
192: offset = 0;
193: redo:
1.22 millert 194: if (fg_pattern[i].pattern) {
1.40 tedu 195: r = grep_search(&fg_pattern[i], l->dat + offset,
196: l->len - offset, &pmatch);
197: pmatch.rm_so += offset;
198: pmatch.rm_eo += offset;
1.22 millert 199: } else {
1.40 tedu 200: pmatch.rm_so = offset;
1.43 otto 201: pmatch.rm_eo = l->len;
1.9 millert 202: r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
1.22 millert 203: }
204: if (r == 0 && xflag) {
205: if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
206: r = REG_NOMATCH;
1.1 deraadt 207: }
1.15 dhartmei 208: if (r == 0) {
1.40 tedu 209: c = 1;
1.44 millert 210: if (oflag && pmatch.rm_so != pmatch.rm_eo)
1.40 tedu 211: goto print;
1.1 deraadt 212: break;
213: }
214: }
1.40 tedu 215: if (oflag)
216: return c;
217: print:
1.15 dhartmei 218: if (vflag)
219: c = !c;
1.2 deraadt 220:
1.4 tedu 221: if (c && binbehave == BIN_FILE_BIN && nottext)
222: return c; /* Binary file */
223:
1.1 deraadt 224: if ((tail > 0 || c) && !cflag && !qflag) {
225: if (c) {
1.5 deraadt 226: if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
227: (Aflag || Bflag))
1.1 deraadt 228: printf("--\n");
229: first = 1;
230: tail = Aflag;
231: if (Bflag > 0)
232: printqueue();
233: linesqueued = 0;
1.40 tedu 234: printline(l, ':', oflag ? &pmatch : NULL);
1.1 deraadt 235: } else {
1.40 tedu 236: printline(l, '-', oflag ? &pmatch : NULL);
1.1 deraadt 237: tail--;
238: }
239: }
1.40 tedu 240: if (oflag && !matchall) {
241: offset = pmatch.rm_eo;
242: goto redo;
243: }
1.1 deraadt 244: return c;
245: }
246:
1.39 tedu 247: #ifndef SMALL
1.31 otto 248: void
1.47 deraadt 249: fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
1.25 millert 250: {
251: int i;
252:
253: /* Initialize. */
254: fg->patternLen = strlen(pattern);
255: fg->bol = 0;
256: fg->eol = 0;
257: fg->wmatch = wflag;
258: fg->reversedSearch = 0;
259:
260: /*
261: * Make a copy and upper case it for later if in -i mode,
262: * else just copy the pointer.
263: */
264: if (iflag) {
265: fg->pattern = grep_malloc(fg->patternLen + 1);
266: for (i = 0; i < fg->patternLen; i++)
267: fg->pattern[i] = toupper(pattern[i]);
268: fg->pattern[fg->patternLen] = '\0';
269: } else
1.28 deraadt 270: fg->pattern = (unsigned char *)pattern; /* really const */
1.25 millert 271:
272: /* Preprocess pattern. */
273: for (i = 0; i <= UCHAR_MAX; i++)
274: fg->qsBc[i] = fg->patternLen;
275: for (i = 1; i < fg->patternLen; i++) {
276: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
277: /*
278: * If case is ignored, make the jump apply to both upper and
279: * lower cased characters. As the pattern is stored in upper
280: * case, apply the same to the lower case equivalents.
281: */
282: if (iflag)
283: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
284: }
285: }
1.39 tedu 286: #endif
1.25 millert 287:
288: /*
289: * Returns: -1 on failure, 0 on success
290: */
291: int
1.6 tedu 292: fastcomp(fastgrep_t *fg, const char *pattern)
293: {
1.39 tedu 294: #ifdef SMALL
295: return -1;
296: #else
1.6 tedu 297: int i;
298: int bol = 0;
299: int eol = 0;
300: int shiftPatternLen;
301: int hasDot = 0;
302: int firstHalfDot = -1;
303: int firstLastHalfDot = -1;
304: int lastHalfDot = 0;
305:
306: /* Initialize. */
1.28 deraadt 307: fg->patternLen = strlen(pattern);
1.6 tedu 308: fg->bol = 0;
309: fg->eol = 0;
1.22 millert 310: fg->wmatch = 0;
1.6 tedu 311: fg->reversedSearch = 0;
312:
313: /* Remove end-of-line character ('$'). */
1.38 eric 314: if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
1.6 tedu 315: eol++;
316: fg->eol = 1;
317: fg->patternLen--;
318: }
319:
320: /* Remove beginning-of-line character ('^'). */
321: if (pattern[0] == '^') {
322: bol++;
323: fg->bol = 1;
324: fg->patternLen--;
325: }
326:
1.22 millert 327: /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
1.30 otto 328: if (wflag) {
329: /* basic re's use \( \), extended re's ( ) */
330: int extra = Eflag ? 1 : 2;
331: fg->patternLen -= 14 + 2 * extra;
332: fg->wmatch = 7 + extra;
333: } else if (fg->patternLen >= 14 &&
1.22 millert 334: strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
1.24 millert 335: strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
1.22 millert 336: fg->patternLen -= 14;
337: fg->wmatch = 7;
338: }
339:
1.6 tedu 340: /*
1.22 millert 341: * Copy pattern minus '^' and '$' characters as well as word
342: * match character classes at the beginning and ending of the
343: * string respectively.
1.6 tedu 344: */
1.22 millert 345: fg->pattern = grep_malloc(fg->patternLen + 1);
346: memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
347: fg->pattern[fg->patternLen] = '\0';
1.6 tedu 348:
349: /* Look for ways to cheat...er...avoid the full regex engine. */
350: for (i = 0; i < fg->patternLen; i++)
351: {
1.46 tedu 352: switch (fg->pattern[i]) {
353: case '.':
1.6 tedu 354: hasDot = i;
355: if (i < fg->patternLen / 2) {
1.19 otto 356: if (firstHalfDot < 0)
1.6 tedu 357: /* Closest dot to the beginning */
358: firstHalfDot = i;
359: } else {
360: /* Closest dot to the end of the pattern. */
361: lastHalfDot = i;
362: if (firstLastHalfDot < 0)
363: firstLastHalfDot = i;
364: }
1.46 tedu 365: break;
366: case '(': case ')':
367: case '{': case '}':
368: /* Special in BRE if preceded by '\\' */
369: case '?':
370: case '+':
371: case '|':
372: /* Not special in BRE. */
373: if (!Eflag)
374: goto nonspecial;
375: case '\\':
376: case '*':
377: case '[': case ']':
1.6 tedu 378: /* Free memory and let others know this is empty. */
379: free(fg->pattern);
380: fg->pattern = NULL;
381: return (-1);
1.46 tedu 382: default:
383: nonspecial:
384: if (iflag)
385: fg->pattern[i] = toupper(fg->pattern[i]);
386: break;
1.6 tedu 387: }
388: }
389:
390: /*
391: * Determine if a reverse search would be faster based on the placement
392: * of the dots.
393: */
1.55 ! otto 394: if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
1.6 tedu 395: ((lastHalfDot) && ((firstHalfDot < 0) ||
396: ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
397: fg->reversedSearch = 1;
398: hasDot = fg->patternLen - (firstHalfDot < 0 ?
399: firstLastHalfDot : firstHalfDot) - 1;
400: grep_revstr(fg->pattern, fg->patternLen);
401: }
402:
403: /*
404: * Normal Quick Search would require a shift based on the position the
405: * next character after the comparison is within the pattern. With
406: * wildcards, the position of the last dot effects the maximum shift
407: * distance.
408: * The closer to the end the wild card is the slower the search. A
1.10 deraadt 409: * reverse version of this algorithm would be useful for wildcards near
1.6 tedu 410: * the end of the string.
411: *
412: * Examples:
413: * Pattern Max shift
414: * ------- ---------
415: * this 5
416: * .his 4
417: * t.is 3
418: * th.s 2
419: * thi. 1
420: */
421:
422: /* Adjust the shift based on location of the last dot ('.'). */
423: shiftPatternLen = fg->patternLen - hasDot;
424:
425: /* Preprocess pattern. */
426: for (i = 0; i <= UCHAR_MAX; i++)
427: fg->qsBc[i] = shiftPatternLen;
428: for (i = hasDot + 1; i < fg->patternLen; i++) {
429: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
430: /*
431: * If case is ignored, make the jump apply to both upper and
432: * lower cased characters. As the pattern is stored in upper
433: * case, apply the same to the lower case equivalents.
434: */
435: if (iflag)
436: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
437: }
438:
439: /*
440: * Put pattern back to normal after pre-processing to allow for easy
441: * comparisons later.
442: */
443: if (fg->reversedSearch)
444: grep_revstr(fg->pattern, fg->patternLen);
445:
446: return (0);
1.39 tedu 447: #endif
1.6 tedu 448: }
449:
/*
 * Word boundaries using regular expressions are defined as the point
 * of transition from a non-word char to a word char, or vice versa.
 * This means that grep -w +a and grep -w a+ never match anything,
 * because they lack a starting or ending transition, but grep -w a+b
 * does match a line containing a+b.
 *
 * wmatch(d, l, s, e) is true if the non-empty range [s, e) of the
 * l-byte buffer d starts and ends with a word character and is not
 * adjacent to a word character on either side.  Every argument is
 * evaluated more than once, so none may have side effects.
 */
#define wmatch(d, l, s, e)	\
	(((s) == 0 || !isword((d)[(s) - 1])) && \
	  ((e) == (l) || !isword((d)[(e)])) && \
	  (e) > (s) && isword((d)[(s)]) && isword((d)[(e) - 1]))
1.22 millert 460:
1.9 millert 461: static int
1.40 tedu 462: grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch)
1.6 tedu 463: {
1.39 tedu 464: #ifdef SMALL
465: return 0;
466: #else
1.42 aschrijv 467: regoff_t j;
1.6 tedu 468: int rtrnVal = REG_NOMATCH;
469:
1.9 millert 470: pmatch->rm_so = -1;
471: pmatch->rm_eo = -1;
472:
1.6 tedu 473: /* No point in going farther if we do not have enough data. */
474: if (dataLen < fg->patternLen)
475: return (rtrnVal);
476:
477: /* Only try once at the beginning or ending of the line. */
478: if (fg->bol || fg->eol) {
479: /* Simple text comparison. */
480: /* Verify data is >= pattern length before searching on it. */
481: if (dataLen >= fg->patternLen) {
482: /* Determine where in data to start search at. */
483: if (fg->eol)
484: j = dataLen - fg->patternLen;
485: else
486: j = 0;
487: if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
1.22 millert 488: if (grep_cmp(fg->pattern, data + j,
1.53 millert 489: fg->patternLen)) {
1.9 millert 490: pmatch->rm_so = j;
491: pmatch->rm_eo = j + fg->patternLen;
1.22 millert 492: if (!fg->wmatch || wmatch(data, dataLen,
493: pmatch->rm_so, pmatch->rm_eo))
494: rtrnVal = 0;
1.9 millert 495: }
1.6 tedu 496: }
497: } else if (fg->reversedSearch) {
498: /* Quick Search algorithm. */
1.17 millert 499: j = dataLen;
500: do {
1.6 tedu 501: if (grep_cmp(fg->pattern, data + j - fg->patternLen,
1.53 millert 502: fg->patternLen)) {
1.9 millert 503: pmatch->rm_so = j - fg->patternLen;
504: pmatch->rm_eo = j;
1.22 millert 505: if (!fg->wmatch || wmatch(data, dataLen,
506: pmatch->rm_so, pmatch->rm_eo)) {
507: rtrnVal = 0;
508: break;
509: }
1.6 tedu 510: }
1.17 millert 511: /* Shift if within bounds, otherwise, we are done. */
512: if (j == fg->patternLen)
513: break;
1.40 tedu 514: j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
1.17 millert 515: } while (j >= fg->patternLen);
1.6 tedu 516: } else {
517: /* Quick Search algorithm. */
518: j = 0;
519: do {
1.53 millert 520: if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
1.9 millert 521: pmatch->rm_so = j;
522: pmatch->rm_eo = j + fg->patternLen;
1.32 jaredy 523: if (fg->patternLen == 0 || !fg->wmatch ||
524: wmatch(data, dataLen, pmatch->rm_so,
525: pmatch->rm_eo)) {
1.22 millert 526: rtrnVal = 0;
527: break;
528: }
1.6 tedu 529: }
530:
531: /* Shift if within bounds, otherwise, we are done. */
532: if (j + fg->patternLen == dataLen)
533: break;
534: else
1.40 tedu 535: j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
1.6 tedu 536: } while (j <= (dataLen - fg->patternLen));
537: }
538:
539: return (rtrnVal);
1.39 tedu 540: #endif
1.6 tedu 541: }
542:
543:
/*
 * malloc() wrapper that does not return on failure: if malloc() yields
 * NULL the program is terminated through err(2, "malloc").
 */
void *
grep_malloc(size_t size)
{
	void *mem = malloc(size);

	if (mem == NULL)
		err(2, "malloc");
	return mem;
}
553:
/*
 * calloc() wrapper that does not return on failure: if calloc() yields
 * NULL the program is terminated through err(2, "calloc").
 */
void *
grep_calloc(size_t nmemb, size_t size)
{
	void *mem = calloc(nmemb, size);

	if (mem == NULL)
		err(2, "calloc");
	return mem;
}
563:
/*
 * realloc() wrapper that does not return on failure: if realloc() yields
 * NULL the program is terminated through err(2, "realloc").
 */
void *
grep_realloc(void *ptr, size_t size)
{
	void *grown = realloc(ptr, size);

	if (grown == NULL)
		err(2, "realloc");
	return grown;
}
571:
/*
 * reallocarray() wrapper that does not return on failure: if
 * reallocarray() yields NULL the program is terminated through
 * err(2, "reallocarray").
 */
void *
grep_reallocarray(void *ptr, size_t nmemb, size_t size)
{
	void *grown = reallocarray(ptr, nmemb, size);

	if (grown == NULL)
		err(2, "reallocarray");
	return grown;
}
579:
1.39 tedu 580: #ifndef SMALL
1.6 tedu 581: /*
1.53 millert 582: * Returns: true on success, false on failure
1.6 tedu 583: */
1.53 millert 584: static bool
1.40 tedu 585: grep_cmp(const char *pattern, const char *data, size_t len)
1.6 tedu 586: {
1.53 millert 587: size_t i;
1.6 tedu 588:
589: for (i = 0; i < len; i++) {
1.25 millert 590: if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
1.54 mmcc 591: || (iflag && pattern[i] == toupper((unsigned char)data[i])))
1.6 tedu 592: continue;
1.53 millert 593: return false;
1.6 tedu 594: }
595:
1.53 millert 596: return true;
1.6 tedu 597: }
598:
/*
 * Reverse the first len bytes of str in place.  Nothing is done when
 * len is less than 2.
 */
static void
grep_revstr(unsigned char *str, int len)
{
	int i;
	unsigned char c;	/* same type as the data, no narrowing */

	for (i = 0; i < len / 2; i++) {
		c = str[i];
		str[i] = str[len - i - 1];
		str[len - i - 1] = c;
	}
}
1.39 tedu 611: #endif
1.1 deraadt 612:
613: void
1.40 tedu 614: printline(str_t *line, int sep, regmatch_t *pmatch)
1.1 deraadt 615: {
616: int n;
1.2 deraadt 617:
1.1 deraadt 618: n = 0;
619: if (!hflag) {
620: fputs(line->file, stdout);
621: ++n;
622: }
623: if (nflag) {
624: if (n)
625: putchar(sep);
1.52 mmcc 626: printf("%lld", line->line_no);
1.1 deraadt 627: ++n;
628: }
629: if (bflag) {
630: if (n)
631: putchar(sep);
1.21 otto 632: printf("%lld", (long long)line->off);
633: ++n;
1.1 deraadt 634: }
635: if (n)
636: putchar(sep);
1.40 tedu 637: if (pmatch)
638: fwrite(line->dat + pmatch->rm_so,
639: pmatch->rm_eo - pmatch->rm_so, 1, stdout);
640: else
641: fwrite(line->dat, line->len, 1, stdout);
1.1 deraadt 642: putchar('\n');
643: }