Annotation of src/usr.bin/grep/util.c, Revision 1.6
1.6 ! tedu 1: /* $OpenBSD: util.c,v 1.5 2003/06/23 07:52:18 deraadt Exp $ */
1.3 deraadt 2:
1.1 deraadt 3: /*-
4: * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5: * All rights reserved.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28:
29: #include <sys/types.h>
30: #include <sys/stat.h>
31:
32: #include <ctype.h>
33: #include <err.h>
34: #include <errno.h>
35: #include <fts.h>
36: #include <regex.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <zlib.h>
42:
43: #include "grep.h"
44:
45: /*
46: * Process a file line by line...
47: */
48:
49: static int linesqueued;
1.4 tedu 50: static int procline(str_t *l, int);
1.6 ! tedu 51: static int grep_search(fastgrep_t *, unsigned char *, int);
! 52: static int grep_cmp(const unsigned char *, const unsigned char *, size_t);
! 53: static void grep_revstr(unsigned char *, int);
1.1 deraadt 54:
1.2 deraadt 55: int
1.1 deraadt 56: grep_tree(char **argv)
57: {
58: FTS *fts;
59: FTSENT *p;
60: int c, fts_flags;
61:
62: c = fts_flags = 0;
63:
64: if (Hflag)
65: fts_flags = FTS_COMFOLLOW;
66: if (Pflag)
67: fts_flags = FTS_PHYSICAL;
68: if (Sflag)
69: fts_flags = FTS_LOGICAL;
70:
71: fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
72:
73: if (!(fts = fts_open(argv, fts_flags, (int (*) ()) NULL)))
74: err(1, NULL);
75: while ((p = fts_read(fts)) != NULL) {
76: switch (p->fts_info) {
77: case FTS_DNR:
78: break;
79: case FTS_ERR:
80: errx(1, "%s: %s", p->fts_path, strerror(p->fts_errno));
81: break;
82: case FTS_DP:
83: break;
84: default:
85: c += procfile(p->fts_path);
86: break;
87: }
88: }
89:
90: return c;
91: }
92:
93: int
94: procfile(char *fn)
95: {
96: str_t ln;
97: file_t *f;
1.4 tedu 98: int c, t, z, nottext;
1.1 deraadt 99:
100: if (fn == NULL) {
101: fn = "(standard input)";
102: f = grep_fdopen(STDIN_FILENO, "r");
103: } else {
104: f = grep_open(fn, "r");
105: }
106: if (f == NULL) {
107: if (!sflag)
108: warn("%s", fn);
109: return 0;
110: }
1.4 tedu 111:
112: nottext = grep_bin_file(f);
113: if (nottext && binbehave == BIN_FILE_SKIP) {
1.1 deraadt 114: grep_close(f);
115: return 0;
116: }
117:
118: ln.file = fn;
119: ln.line_no = 0;
120: linesqueued = 0;
121: ln.off = -1;
122:
123: if (Bflag > 0)
124: initqueue();
125: for (c = 0; !(lflag && c);) {
126: ln.off += ln.len + 1;
127: if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
128: break;
129: if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
130: --ln.len;
131: ln.line_no++;
132:
133: z = tail;
1.2 deraadt 134:
1.4 tedu 135: if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
1.1 deraadt 136: enqueue(&ln);
137: linesqueued++;
138: }
139: c += t;
140: }
141: if (Bflag > 0)
142: clearqueue();
143: grep_close(f);
144:
145: if (cflag) {
146: if (!hflag)
147: printf("%s:", ln.file);
148: printf("%u\n", c);
149: }
150: if (lflag && c != 0)
151: printf("%s\n", fn);
152: if (Lflag && c == 0)
153: printf("%s\n", fn);
1.4 tedu 154: if (c && !cflag && !lflag && !Lflag &&
155: binbehave == BIN_FILE_BIN && nottext)
156: printf("Binary file %s matches\n", fn);
157:
1.1 deraadt 158: return c;
159: }
160:
161:
162: /*
163: * Process an individual line in a file. Return non-zero if it matches.
164: */
165:
166: #define isword(x) (isalnum(x) || (x) == '_')
167:
168: static int
1.4 tedu 169: procline(str_t *l, int nottext)
1.1 deraadt 170: {
171: regmatch_t pmatch;
172: int c, i, r, t;
173:
174: if (matchall) {
175: c = !vflag;
176: goto print;
177: }
1.2 deraadt 178:
1.1 deraadt 179: t = vflag ? REG_NOMATCH : 0;
180: pmatch.rm_so = 0;
181: pmatch.rm_eo = l->len;
182: for (c = i = 0; i < patterns; i++) {
1.6 ! tedu 183: if (fg_pattern[i].pattern)
! 184: r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
! 185: l->len);
! 186: else
! 187: r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags);
1.1 deraadt 188: if (r == REG_NOMATCH && t == 0)
189: continue;
190: if (r == 0) {
191: if (wflag) {
1.5 deraadt 192: if ((pmatch.rm_so != 0 &&
193: isword(l->dat[pmatch.rm_so - 1])) ||
194: (pmatch.rm_eo != l->len &&
195: isword(l->dat[pmatch.rm_eo])))
1.1 deraadt 196: r = REG_NOMATCH;
197: }
198: if (xflag) {
199: if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
200: r = REG_NOMATCH;
201: }
202: }
203: if (r == t) {
204: c++;
205: break;
206: }
207: }
1.2 deraadt 208:
1.1 deraadt 209: print:
1.4 tedu 210: if (c && binbehave == BIN_FILE_BIN && nottext)
211: return c; /* Binary file */
212:
1.1 deraadt 213: if ((tail > 0 || c) && !cflag && !qflag) {
214: if (c) {
1.5 deraadt 215: if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
216: (Aflag || Bflag))
1.1 deraadt 217: printf("--\n");
218: first = 1;
219: tail = Aflag;
220: if (Bflag > 0)
221: printqueue();
222: linesqueued = 0;
223: printline(l, ':');
224: } else {
225: printline(l, '-');
226: tail--;
227: }
228: }
229: return c;
230: }
231:
1.6 ! tedu 232: /*
! 233: * Returns: -1 on failure
! 234: * 0 on success
! 235: */
! 236: int
! 237: fastcomp(fastgrep_t *fg, const char *pattern)
! 238: {
! 239: int i;
! 240: int bol = 0;
! 241: int eol = 0;
! 242: int origPatternLen;
! 243: int shiftPatternLen;
! 244: int hasDot = 0;
! 245: int firstHalfDot = -1;
! 246: int firstLastHalfDot = -1;
! 247: int lastHalfDot = 0;
! 248:
! 249: /* Initialize. */
! 250: origPatternLen = fg->patternLen = strlen(pattern);
! 251: fg->bol = 0;
! 252: fg->eol = 0;
! 253: fg->reversedSearch = 0;
! 254:
! 255: /* Remove end-of-line character ('$'). */
! 256: if (pattern[fg->patternLen - 1] == '$') {
! 257: eol++;
! 258: fg->eol = 1;
! 259: fg->patternLen--;
! 260: boleol = 1;
! 261: }
! 262:
! 263: /* Remove beginning-of-line character ('^'). */
! 264: if (pattern[0] == '^') {
! 265: bol++;
! 266: fg->bol = 1;
! 267: fg->patternLen--;
! 268: boleol = 1;
! 269: }
! 270:
! 271: /*
! 272: * Copy pattern minus '^' and '$' characters at the beginning and ending of
! 273: * the string respectively.
! 274: */
! 275: fg->pattern = grep_strdup(pattern + bol);
! 276:
! 277: /* Look for ways to cheat...er...avoid the full regex engine. */
! 278: for (i = 0; i < fg->patternLen; i++)
! 279: {
! 280: /* Can still cheat? */
! 281: if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
! 282: (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
! 283: (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
! 284: (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
! 285: (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
! 286: /* As long as it is good, upper case it for later. */
! 287: if (iflag)
! 288: fg->pattern[i] = toupper(fg->pattern[i]);
! 289: } else if (fg->pattern[i] == '.') {
! 290: hasDot = i;
! 291: if (i < fg->patternLen / 2) {
! 292: if (firstHalfDot < -1)
! 293: /* Closest dot to the beginning */
! 294: firstHalfDot = i;
! 295: } else {
! 296: /* Closest dot to the end of the pattern. */
! 297: lastHalfDot = i;
! 298: if (firstLastHalfDot < 0)
! 299: firstLastHalfDot = i;
! 300: }
! 301: } else {
! 302: /* Free memory and let others know this is empty. */
! 303: free(fg->pattern);
! 304: fg->pattern = NULL;
! 305: return (-1);
! 306: }
! 307: }
! 308:
! 309: /*
! 310: * Determine if a reverse search would be faster based on the placement
! 311: * of the dots.
! 312: */
! 313: if ((!(lflag || cflag)) && ((!(bol || eol)) &&
! 314: ((lastHalfDot) && ((firstHalfDot < 0) ||
! 315: ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
! 316: fg->reversedSearch = 1;
! 317: hasDot = fg->patternLen - (firstHalfDot < 0 ?
! 318: firstLastHalfDot : firstHalfDot) - 1;
! 319: grep_revstr(fg->pattern, fg->patternLen);
! 320: }
! 321:
! 322: /*
! 323: * Normal Quick Search would require a shift based on the position the
! 324: * next character after the comparison is within the pattern. With
! 325: * wildcards, the position of the last dot effects the maximum shift
! 326: * distance.
! 327: * The closer to the end the wild card is the slower the search. A
! 328: * reverse version of this algorithm would be useful for wildcards near
! 329: * the end of the string.
! 330: *
! 331: * Examples:
! 332: * Pattern Max shift
! 333: * ------- ---------
! 334: * this 5
! 335: * .his 4
! 336: * t.is 3
! 337: * th.s 2
! 338: * thi. 1
! 339: */
! 340:
! 341: /* Adjust the shift based on location of the last dot ('.'). */
! 342: shiftPatternLen = fg->patternLen - hasDot;
! 343:
! 344: /* Preprocess pattern. */
! 345: for (i = 0; i <= UCHAR_MAX; i++)
! 346: fg->qsBc[i] = shiftPatternLen;
! 347: for (i = hasDot + 1; i < fg->patternLen; i++) {
! 348: fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
! 349: /*
! 350: * If case is ignored, make the jump apply to both upper and
! 351: * lower cased characters. As the pattern is stored in upper
! 352: * case, apply the same to the lower case equivalents.
! 353: */
! 354: if (iflag)
! 355: fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
! 356: }
! 357:
! 358: /*
! 359: * Put pattern back to normal after pre-processing to allow for easy
! 360: * comparisons later.
! 361: */
! 362: if (fg->reversedSearch)
! 363: grep_revstr(fg->pattern, fg->patternLen);
! 364:
! 365: return (0);
! 366: }
! 367:
! 368: static int grep_search(fastgrep_t *fg, unsigned char *data, int dataLen)
! 369: {
! 370: int j;
! 371: int rtrnVal = REG_NOMATCH;
! 372:
! 373: /* No point in going farther if we do not have enough data. */
! 374: if (dataLen < fg->patternLen)
! 375: return (rtrnVal);
! 376:
! 377: /* Only try once at the beginning or ending of the line. */
! 378: if (fg->bol || fg->eol) {
! 379: /* Simple text comparison. */
! 380: /* Verify data is >= pattern length before searching on it. */
! 381: if (dataLen >= fg->patternLen) {
! 382: /* Determine where in data to start search at. */
! 383: if (fg->eol)
! 384: j = dataLen - fg->patternLen;
! 385: else
! 386: j = 0;
! 387: if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
! 388: if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1)
! 389: rtrnVal = 0;
! 390: }
! 391: } else if (fg->reversedSearch) {
! 392: /* Quick Search algorithm. */
! 393: j = dataLen;
! 394: do {
! 395: if (grep_cmp(fg->pattern, data + j - fg->patternLen,
! 396: fg->patternLen) == -1) {
! 397: rtrnVal = 0;
! 398: break;
! 399: }
! 400:
! 401: /* Shift if within bounds, otherwise, we are done. */
! 402: if (j == 0)
! 403: break;
! 404: else
! 405: j -= fg->qsBc[data[j - fg->patternLen - 1]];
! 406: } while (j >= 0);
! 407: } else {
! 408: /* Quick Search algorithm. */
! 409: j = 0;
! 410: do {
! 411: if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
! 412: rtrnVal = 0;
! 413: break;
! 414: }
! 415:
! 416: /* Shift if within bounds, otherwise, we are done. */
! 417: if (j + fg->patternLen == dataLen)
! 418: break;
! 419: else
! 420: j += fg->qsBc[data[j + fg->patternLen]];
! 421: } while (j <= (dataLen - fg->patternLen));
! 422: }
! 423:
! 424: return (rtrnVal);
! 425: }
! 426:
! 427:
1.1 deraadt 428: void *
429: grep_malloc(size_t size)
430: {
431: void *ptr;
432:
433: if ((ptr = malloc(size)) == NULL)
434: err(1, "malloc");
435: return ptr;
436: }
437:
438: void *
439: grep_realloc(void *ptr, size_t size)
440: {
441: if ((ptr = realloc(ptr, size)) == NULL)
442: err(1, "realloc");
443: return ptr;
1.6 ! tedu 444: }
! 445:
! 446: unsigned char *
! 447: grep_strdup(const char *str)
! 448: {
! 449: unsigned char *ptr;
! 450:
! 451: if ((ptr = (unsigned char *)strdup(str)) == NULL)
! 452: err(1, "strdup");
! 453: return ptr;
! 454: }
! 455:
! 456: /*
! 457: * Returns: i >= 0 on failure (position that it failed)
! 458: * -1 on success
! 459: */
! 460: int
! 461: grep_cmp(const unsigned char *pattern, const unsigned char *data,
! 462: size_t len)
! 463: {
! 464: int i;
! 465:
! 466: for (i = 0; i < len; i++) {
! 467: if (((pattern[i] == data[i]) || (pattern[i] == '.')) ||
! 468: (iflag && pattern[i] == toupper(data[i])))
! 469: continue;
! 470: return (i);
! 471: }
! 472:
! 473: return (-1);
! 474: }
! 475:
! 476: static void
! 477: grep_revstr(unsigned char *str, int len)
! 478: {
! 479: int i;
! 480: char c;
! 481:
! 482: for (i = 0; i < len / 2; i++) {
! 483: c = str[i];
! 484: str[i] = str[len - i - 1];
! 485: str[len - i - 1] = c;
! 486: }
1.1 deraadt 487: }
488:
489: void
490: printline(str_t *line, int sep)
491: {
492: int n;
1.2 deraadt 493:
1.1 deraadt 494: n = 0;
495: if (!hflag) {
496: fputs(line->file, stdout);
497: ++n;
498: }
499: if (nflag) {
500: if (n)
501: putchar(sep);
502: printf("%d", line->line_no);
503: ++n;
504: }
505: if (bflag) {
506: if (n)
507: putchar(sep);
508: printf("%lu", (unsigned long)line->off);
509: }
510: if (n)
511: putchar(sep);
512: fwrite(line->dat, line->len, 1, stdout);
513: putchar('\n');
514: }