src/usr.bin/grep/util.c - diff

Return to util.c CVS log

Up to [local] / src / usr.bin / grep

Diff for /src/usr.bin/grep/util.c between version 1.5 and 1.6

-version 1.5, 2003/06/23 07:52:18
+version 1.6, 2003/06/23 22:05:23
 Line 48
 Line 48
 Line 48
  static int      linesqueued;
  static int      procline(str_t *l, int);
+ static int      grep_search(fastgrep_t *, unsigned char *, int); /* 0 on match, REG_NOMATCH otherwise */
+ static int      grep_cmp(const unsigned char *, const unsigned char *, size_t); /* -1 on match, else index of first mismatch */
+ static void     grep_revstr(unsigned char *, int); /* reverses the first len bytes in place */
  int
  grep_tree(char **argv)
-Line 177
+Line 180
 Line 177
 Line 180
          pmatch.rm_so = 0;
          pmatch.rm_eo = l->len;
          for (c = i = 0; i < patterns; i++) {
-                 r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags);
+                 if (fg_pattern[i].pattern)
+                         r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
+                             l->len);
+                 else
+                         r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags);
                  if (r == REG_NOMATCH && t == 0)
                          continue;
                  if (r == 0) {
-Line 222
+Line 229
 Line 222
 Line 229
          return c;
  }
+ /*
+  * Try to set up `fg' so that `pattern' can be matched with the simple
+  * shift-table search in grep_search() instead of the full regex engine.
+  *
+  * A pattern qualifies when, apart from an optional leading '^' and
+  * trailing '$', it consists only of alphanumerics, whitespace, the
+  * characters "_,^$=-:/" and '.' wildcards.  On success fg->pattern holds
+  * a copy of the pattern starting after any leading '^' (upper-cased when
+  * iflag is set), fg->patternLen is its length without the anchors,
+  * fg->bol/fg->eol record the anchors, fg->reversedSearch selects the scan
+  * direction and fg->qsBc[] holds the per-character shift distances.
+  *
+  * Returns: -1 on failure (fg->pattern is freed and set to NULL)
+  *           0 on success
+  */
+ int
+ fastcomp(fastgrep_t *fg, const char *pattern)
+ {
+         int i;
+         int bol = 0;
+         int eol = 0;
+         int shiftPatternLen;
+         int hasDot = 0;
+         int firstHalfDot = -1;
+         int firstLastHalfDot = -1;
+         int lastHalfDot = 0;
+         /* Initialize. */
+         fg->patternLen = strlen(pattern);
+         fg->bol = 0;
+         fg->eol = 0;
+         fg->reversedSearch = 0;
+         /*
+          * NOTE(review): boleol is not declared in this function; it is
+          * presumably a file-scope flag -- confirm against the full file.
+          */
+         /*
+          * Remove end-of-line character ('$').  The length check keeps an
+          * empty pattern from reading pattern[-1].
+          */
+         if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
+                 eol++;
+                 fg->eol = 1;
+                 fg->patternLen--;
+                 boleol = 1;
+         }
+         /* Remove beginning-of-line character ('^'). */
+         if (pattern[0] == '^') {
+                 bol++;
+                 fg->bol = 1;
+                 fg->patternLen--;
+                 boleol = 1;
+         }
+         /*
+          * Copy the pattern minus a leading '^'.  A trailing '$' is still
+          * present in the copy but lies beyond fg->patternLen.
+          */
+         fg->pattern = grep_strdup(pattern + bol);
+         /* Look for ways to cheat...er...avoid the full regex engine. */
+         for (i = 0; i < fg->patternLen; i++) {
+                 /* Can still cheat? */
+                 if (isalnum(fg->pattern[i]) || isspace(fg->pattern[i]) ||
+                     fg->pattern[i] == '_' || fg->pattern[i] == ',' ||
+                     fg->pattern[i] == '^' || fg->pattern[i] == '$' ||
+                     fg->pattern[i] == '=' || fg->pattern[i] == '-' ||
+                     fg->pattern[i] == ':' || fg->pattern[i] == '/') {
+                         /* As long as it is good, upper case it for later. */
+                         if (iflag)
+                                 fg->pattern[i] = toupper(fg->pattern[i]);
+                 } else if (fg->pattern[i] == '.') {
+                         hasDot = i;
+                         if (i < fg->patternLen / 2) {
+                                 /*
+                                  * Closest dot to the beginning.  -1 means
+                                  * "none seen yet", so the test must be < 0;
+                                  * otherwise the dot is never recorded.
+                                  */
+                                 if (firstHalfDot < 0)
+                                         firstHalfDot = i;
+                         } else {
+                                 /* Closest dot to the end of the pattern. */
+                                 lastHalfDot = i;
+                                 if (firstLastHalfDot < 0)
+                                         firstLastHalfDot = i;
+                         }
+                 } else {
+                         /* Free memory and let others know this is empty. */
+                         free(fg->pattern);
+                         fg->pattern = NULL;
+                         return (-1);
+                 }
+         }
+         /*
+          * Determine if a reverse search would be faster based on the placement
+          * of the dots.
+          */
+         if ((!(lflag || cflag)) && ((!(bol || eol)) &&
+             ((lastHalfDot) && ((firstHalfDot < 0) ||
+             ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
+                 fg->reversedSearch = 1;
+                 /* Position of the last dot once the pattern is reversed. */
+                 hasDot = fg->patternLen - (firstHalfDot < 0 ?
+                     firstLastHalfDot : firstHalfDot) - 1;
+                 grep_revstr(fg->pattern, fg->patternLen);
+         }
+         /*
+          * Normal Quick Search would require a shift based on the position the
+          * next character after the comparison is within the pattern.  With
+          * wildcards, the position of the last dot affects the maximum shift
+          * distance.
+          * The closer to the end the wild card is the slower the search.  A
+          * reverse version of this algorithm would be useful for wildcards near
+          * the end of the string.
+          *
+          * Examples:
+          * Pattern      Max shift
+          * -------      ---------
+          * this         5
+          * .his         4
+          * t.is         3
+          * th.s         2
+          * thi.         1
+          */
+         /* Adjust the shift based on location of the last dot ('.'). */
+         shiftPatternLen = fg->patternLen - hasDot;
+         /* Preprocess pattern. */
+         for (i = 0; i <= UCHAR_MAX; i++)
+                 fg->qsBc[i] = shiftPatternLen;
+         for (i = hasDot + 1; i < fg->patternLen; i++) {
+                 fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
+                 /*
+                  * If case is ignored, make the jump apply to both upper and
+                  * lower cased characters.  As the pattern is stored in upper
+                  * case, apply the same to the lower case equivalents.
+                  */
+                 if (iflag)
+                         fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
+         }
+         /*
+          * Put pattern back to normal after pre-processing to allow for easy
+          * comparisons later.
+          */
+         if (fg->reversedSearch)
+                 grep_revstr(fg->pattern, fg->patternLen);
+         return (0);
+ }
+ /*
+  * Search the dataLen bytes at `data' for the pattern prepared in `fg'.
+  *
+  * Anchored patterns (fg->bol / fg->eol) are compared once, at the start or
+  * the end of the data.  Otherwise the data is scanned forwards, or
+  * backwards when fg->reversedSearch is set, shifting by the fg->qsBc[]
+  * entry of the byte just outside the current window after each failed
+  * comparison.
+  *
+  * Returns: 0 on match
+  *          REG_NOMATCH otherwise
+  */
+ static int
+ grep_search(fastgrep_t *fg, unsigned char *data, int dataLen)
+ {
+         int j;
+         int rtrnVal = REG_NOMATCH;
+         /* No point in going farther if we do not have enough data. */
+         if (dataLen < fg->patternLen)
+                 return (rtrnVal);
+         /* Only try once at the beginning or ending of the line. */
+         if (fg->bol || fg->eol) {
+                 /* Simple text comparison; pick where in data to compare. */
+                 if (fg->eol)
+                         j = dataLen - fg->patternLen;
+                 else
+                         j = 0;
+                 /* With both anchors the lengths must agree exactly. */
+                 if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
+                         if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1)
+                                 rtrnVal = 0;
+         } else if (fg->reversedSearch) {
+                 /* Quick Search algorithm, window end `j' moving backwards. */
+                 j = dataLen;
+                 do {
+                         if (grep_cmp(fg->pattern, data + j - fg->patternLen,
+                             fg->patternLen) == -1) {
+                                 rtrnVal = 0;
+                                 break;
+                         }
+                         /*
+                          * Shift if within bounds, otherwise, we are done.
+                          * The window already starts at data[0] when j equals
+                          * patternLen; going on would read data[-1].
+                          */
+                         if (j == fg->patternLen)
+                                 break;
+                         else
+                                 j -= fg->qsBc[data[j - fg->patternLen - 1]];
+                 } while (j >= fg->patternLen);
+         } else {
+                 /* Quick Search algorithm, window start `j' moving forwards. */
+                 j = 0;
+                 do {
+                         if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
+                                 rtrnVal = 0;
+                                 break;
+                         }
+                         /* Shift if within bounds, otherwise, we are done. */
+                         if (j + fg->patternLen == dataLen)
+                                 break;
+                         else
+                                 j += fg->qsBc[data[j + fg->patternLen]];
+                 } while (j <= (dataLen - fg->patternLen));
+         }
+         return (rtrnVal);
+ }
  void *
  grep_malloc(size_t size)
  {
-Line 238
+Line 441
 Line 238
 Line 441
          if ((ptr = realloc(ptr, size)) == NULL)
                  err(1, "realloc");
          return ptr;
+ }
+ /*
+  * Duplicate `str' with strdup(); exits through err(1, "strdup") when the
+  * copy cannot be made.  The copy is returned as unsigned char *.
+  */
+ unsigned char *
+ grep_strdup(const char *str)
+ {
+         char *copy = strdup(str);
+         if (copy == NULL)
+                 err(1, "strdup");
+         return (unsigned char *)copy;
+ }
+ /*
+  * Compare `len' bytes of `data' against `pattern'.  A '.' in the pattern
+  * matches any byte; when iflag is set a pattern byte also matches if it
+  * equals toupper() of the data byte.
+  *
+  * Returns:     i >= 0 on failure (position that it failed)
+  *              -1 on success
+  */
+ int
+ grep_cmp(const unsigned char *pattern, const unsigned char *data,
+     size_t len)
+ {
+         int pos = 0;
+         while (pos < len) {
+                 unsigned char want = pattern[pos];
+                 unsigned char have = data[pos];
+                 /* Mismatch only when none of the three rules accept it. */
+                 if (want != have && want != '.' &&
+                     !(iflag && want == toupper(have)))
+                         return (pos);
+                 pos++;
+         }
+         return (-1);
+ }
+ /*
+  * Reverse the first `len' bytes of `str' in place by swapping each byte in
+  * the front half with its mirror in the back half.
+  */
+ static void
+ grep_revstr(unsigned char *str, int len)
+ {
+         int k;
+         int half = len / 2;
+         char tmp;
+         for (k = 0; k < half; k++) {
+                 unsigned char *front = &str[k];
+                 unsigned char *back = &str[len - k - 1];
+                 tmp = *front;
+                 *front = *back;
+                 *back = tmp;
+         }
  }
  void