Commit ce1353a4 authored by monty@hundin.mysql.fi's avatar monty@hundin.mysql.fi
Browse files

Optimize LIKE with the Turbo Boyer-Moore algorithm

parent 0c5c517d
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -26728,6 +26728,12 @@ In the first statement, the @code{LIKE} value begins with a wildcard
character.  In the second statement, the @code{LIKE} value is not a
constant.
MySQL 4.0 does another optimization on @code{LIKE}.  If you are using
@code{... LIKE "%string%"} and @code{string} is longer than 3 characters,
then MySQL will use the Turbo Boyer-Moore algorithm: it initializes the
search pattern for the string once and then uses this pattern to search
quickly for the given string.
@findex IS NULL, and indexes
@cindex indexes, and @code{IS NULL}
Searching using @code{column_name IS NULL} will use indexes if column_name
@@ -49310,6 +49316,8 @@ Our TODO section contains what we plan to have in 4.0. @xref{TODO MySQL 4.0}.
@itemize @bullet
@item
Use the Turbo Boyer-Moore algorithm to speed up @code{LIKE "%keyword%"} searches.
@item
Fixed bug in @code{DROP DATABASE} with symlink.
@item
Fixed crash in @code{REPAIR ... USE_FRM}.
+11 −0
Original line number Diff line number Diff line
@@ -15,4 +15,15 @@ test
select * from t1 where a like "te_t";
a
test
select * from t1 where a like "%a%";
a
a
abc
abcd
select * from t1 where a like "%abcd%";
a
abcd
select * from t1 where a like "%abc\d%";
a
abcd
drop table t1;
+8 −0
Original line number Diff line number Diff line
@@ -9,4 +9,12 @@ select * from t1 where a like "abc%";
select * from t1 where a like "ABC%"; 
select * from t1 where a like "test%"; 
select * from t1 where a like "te_t"; 

#
# The following will test the Boyer-Moore code
#
select * from t1 where a like "%a%";
select * from t1 where a like "%abcd%";
select * from t1 where a like "%abc\d%";

drop table t1;
+261 −5
Original line number Diff line number Diff line
@@ -1228,23 +1228,23 @@ void Item_func_like::fix_length_and_dec()
  //  cmp_type=STRING_RESULT;			// For quick select
}


longlong Item_func_like::val_int()
{
  String *res,*res2;
  res=args[0]->val_str(&tmp_value1);
  String* res = args[0]->val_str(&tmp_value1);
  if (args[0]->null_value)
  {
    null_value=1;
    return 0;
  }
  res2=args[1]->val_str(&tmp_value2);
  String* res2 = args[1]->val_str(&tmp_value2);
  if (args[1]->null_value)
  {
    null_value=1;
    return 0;
  }
  null_value=0;
  if (canDoTurboBM)
    return turboBM_matches(res->ptr(), res->length()) ? 1 : 0;
  if (binary)
    return wild_compare(*res,*res2,escape) ? 0 : 1;
  else
@@ -1268,6 +1268,51 @@ Item_func::optimize_type Item_func_like::select_optimize() const
  return OPTIMIZE_NONE;
}

/*
  Resolve the arguments and, when the pattern argument is a constant of
  the form '%literal%' (no other wildcard or escape character inside),
  precompute the Turbo Boyer-Moore shift tables so that val_int() can
  call turboBM_matches() instead of the generic wildcard compare.

  Returns 1 if Item_bool_func2::fix_fields() fails, 0 otherwise.  When
  the tables cannot be set up, canDoTurboBM is left/made false and the
  function still returns 0, so evaluation falls back to the generic path.
*/
bool Item_func_like::fix_fields(THD *thd,struct st_table_list *tlist)
{
  if (Item_bool_func2::fix_fields(thd, tlist))
    return 1;

  /*
    TODO--we could do it for non-const, but we'd have to
    recompute the tables for each row--probably not worth it.
  */
  if (args[1]->const_item() && !(specialflag & SPECIAL_NO_NEW_FUNC))
  {
    String* res2 = args[1]->val_str(&tmp_value2);
    /*
      No string to inspect (presumably an SQL NULL pattern -- confirm
      against val_str()'s contract): nothing to precompute.
    */
    if (!res2)
      return 0;

    const size_t len   = res2->length();
    const char*  first = res2->ptr();
    /*
      The string must hold the two enclosing wildcards plus a literal
      longer than MIN_TURBOBM_PATTERN_LEN characters.  The last character
      is only addressed after the length test, so an empty string never
      produces a pointer in front of the buffer.
    */

    if (len > MIN_TURBOBM_PATTERN_LEN + 2 &&
        *first == wild_many &&
        first[len - 1] == wild_many)
    {
      const char* last = first + len - 1;
      const char* tmp  = first + 1;
      /* Stops at 'last' at the latest, because *last == wild_many. */
      for ( ; *tmp != wild_many && *tmp != wild_one && *tmp != escape; tmp++) ;
      canDoTurboBM = tmp == last;
    }

    if (canDoTurboBM)
    {
      pattern     = first + 1;
      pattern_len = len - 2;
      DBUG_PRINT("TurboBM", ("Initializing pattern: '%s'...", first));
      /* Plain sizeof arithmetic; sizeof(int[n]) needs the VLA extension. */
      int* suff = (int*)thd->alloc(sizeof(int) * (pattern_len + 1));
      bmGs      = (int*)thd->alloc(sizeof(int) * (pattern_len + 1));
      bmBc      = (int*)thd->alloc(sizeof(int) * alphabet_size);
      if (!suff || !bmGs || !bmBc)
      {
        /* Out of memory: do not touch the tables, use the generic compare. */
        canDoTurboBM = false;
        return 0;
      }
      turboBM_compute_good_suffix_shifts(suff);
      turboBM_compute_bad_character_shifts();
      DBUG_PRINT("turboBM",("done"));
    }
  }
  return 0;
}

#ifdef USE_REGEX

bool
@@ -1307,7 +1352,6 @@ Item_func_regex::fix_fields(THD *thd,TABLE_LIST *tables)
  return 0;
}


longlong Item_func_regex::val_int()
{
  char buff[MAX_FIELD_WIDTH];
@@ -1364,3 +1408,215 @@ Item_func_regex::~Item_func_regex()
}

#endif /* USE_REGEX */


/*
  likeconv(A): map one byte of the pattern or the text to the value that
  the non-binary LIKE code compares.  In both variants the argument is
  converted to unsigned char first: it is used as a table index in one
  case, and toupper() has undefined behaviour for negative char values
  in the other.
*/
#ifdef LIKE_CMP_TOUPPER
#define likeconv(A) (uchar) toupper((uchar) (A))
#else
#define likeconv(A) (uchar) my_sort_order[(uchar) (A)]
#endif


/**********************************************************************
  turboBM_compute_suffixes()
  Precomputation dependent on the pattern contents and pattern_len.

  Fills suff[0 .. pattern_len - 1].  suff[i] is the number of characters
  that match when the pattern is compared backwards from position i
  against the pattern compared backwards from its last position;
  suff[pattern_len - 1] is pattern_len.  Characters are compared
  directly when 'binary' is set and through likeconv() otherwise.
**********************************************************************/

void Item_func_like::turboBM_compute_suffixes(int* suff)
{
  const int   plm1 = pattern_len - 1;
  int            f = 0;
  int            g = plm1;
  int* const splm1 = suff + plm1;

  *splm1 = pattern_len;

  if (binary)
  {
    int i;
    for (i = pattern_len - 2; i >= 0; i--)
    {
      /*
        splm1[i - f] may only be read when g < i: f has then been set by
        an earlier iteration (f > i), so the entry lies inside the table
        and has already been computed.  Reading it unconditionally would
        access suff[] past its end on the first iteration (f == 0).
      */
      if (g < i && splm1[i - f] < i - g)
        suff[i] = splm1[i - f];
      else
      {
        if (i < g)
          g = i; // g = min(i, g)
        f = i;
        while (g >= 0 && pattern[g] == pattern[g + plm1 - f])
          g--;
        suff[i] = f - g;
      }
    }
  }
  else
  {
    int i;
    for (i = pattern_len - 2; 0 <= i; --i)
    {
      /* Same guarded table read as in the binary branch above. */
      if (g < i && splm1[i - f] < i - g)
        suff[i] = splm1[i - f];
      else
      {
        if (i < g)
          g = i; // g = min(i, g)
        f = i;
        while (g >= 0 && likeconv(pattern[g]) == likeconv(pattern[g + plm1 - f]))
          g--;
        suff[i] = f - g;
      }
    }
  }
}


/**********************************************************************
   turboBM_compute_good_suffix_shifts()
   Builds the good suffix shift table bmGs[0 .. pattern_len - 1].

   'suff' is scratch space that is filled by turboBM_compute_suffixes()
   first and then used to derive the shifts.
**********************************************************************/

void Item_func_like::turboBM_compute_good_suffix_shifts(int* suff)
{
  turboBM_compute_suffixes(suff);

  const int last = pattern_len - 1;
  int pos;

  /* Start with the largest shift, the pattern length, everywhere. */
  for (pos = 0; pos < pattern_len; pos++)
    bmGs[pos] = pattern_len;

  int fill = 0;                         // next bmGs entry to look at
  int shift_len;
  int idx;
  for (idx = last; idx >= 0; idx--)
  {
    if (suff[idx] != idx + 1)
      continue;
    /* Entries below last - idx that are still untouched get this shift. */
    shift_len = last - idx;
    while (fill < shift_len)
    {
      if (bmGs[fill] == pattern_len)
        bmGs[fill] = shift_len;
      fill++;
    }
  }

  /*
    idx is -1 here, so shift_len equals pattern_len: this pass only
    rewrites entries that still hold pattern_len with the same value.
  */
  shift_len = last - idx;
  while (fill < shift_len)
  {
    if (bmGs[fill] == pattern_len)
      bmGs[fill] = shift_len;
    fill++;
  }

  for (idx = 0; idx <= pattern_len - 2; idx++)
    bmGs[last - suff[idx]] = last - idx;
}


/**********************************************************************
   turboBM_compute_bad_character_shifts()
   Precomputation dependent on pattern_len.
**********************************************************************/

void Item_func_like::turboBM_compute_bad_character_shifts()
{
  int*   i;
  int* end = bmBc + alphabet_size;
  for (i = bmBc; i < end; i++)
    *i = pattern_len;

  int j;
  const int plm1 = pattern_len - 1;
  if (binary)
    for (j = 0; j < plm1; j++)
      bmBc[pattern[j]] = plm1 - j;
  else
    for (j = 0; j < plm1; j++)
      bmBc[likeconv(pattern[j])] = plm1 - j;
}


/**********************************************************************
  turboBM_matches()
  Search for pattern in text, returns true/false for match/no match
**********************************************************************/

bool Item_func_like::turboBM_matches(const char* text, int text_len) const
{
  register int bcShift;
  register int turboShift;
  int shift = pattern_len;
  int j     = 0;
  int u     = 0;

  const int plm1  = pattern_len - 1;
  const int tlmpl =    text_len - pattern_len;

  /* Searching */
  if (binary)
  {
    while (j <= tlmpl)
    {
      register int i = plm1;
      while (i >= 0 && pattern[i] == text[i + j])
      {
	i--;
	if (i == plm1 - shift)
	  i -= u;
      }
      if (i < 0)
	return true;

      register const int v = plm1 - i;
      turboShift = u - v;
      bcShift    = bmBc[text[i + j]] - plm1 + i;
      shift      = max(turboShift, bcShift);
      shift      = max(shift, bmGs[i]);
      if (shift == bmGs[i])
	u = min(pattern_len - shift, v);
      else
      {
	if (turboShift < bcShift)
	  shift = max(shift, u + 1);
	u = 0;
      }
      j += shift;
    }
    return false;
  }
  else
  {
    while (j <= tlmpl)
    {
      register int i = plm1;
      while (i >= 0 && likeconv(pattern[i]) == likeconv(text[i + j]))
      {
	i--;
	if (i == plm1 - shift)
	  i -= u;
      }
      if (i < 0)
	return true;

      register const int v = plm1 - i;
      turboShift = u - v;
      bcShift    = bmBc[likeconv(text[i + j])] - plm1 + i;
      shift      = max(turboShift, bcShift);
      shift      = max(shift, bmGs[i]);
      if (shift == bmGs[i])
	u = min(pattern_len - shift, v);
      else
      {
	if (turboShift < bcShift)
	  shift = max(shift, u + 1);
	u = 0;
      }
      j += shift;
    }
    return false;
  }
}
+27 −2
Original line number Diff line number Diff line
@@ -478,15 +478,40 @@ class Item_func_isnotnull :public Item_bool_func
/*
  Item for the LIKE predicate.  Besides the escape character it carries
  the state for the Turbo Boyer-Moore fast path: fix_fields() fills in
  pattern, pattern_len, bmGs and bmBc and sets canDoTurboBM, and
  val_int() then calls turboBM_matches() instead of the generic
  wildcard compare.
*/
class Item_func_like :public Item_bool_func2
{
  char escape;			// escape character, copied from *escape_arg
  // NOTE(review): the next two lines look like the pre-change declaration
  // kept by the diff view (no body, second 'public:' below) -- confirm
  // against the real header.
public:
  Item_func_like(Item *a,Item *b, char* escape_arg) :Item_bool_func2(a,b),escape(*escape_arg)

  // Turbo Boyer-Moore data
  bool        canDoTurboBM;	// pattern is '%abcd%' case
  const char* pattern;		// set by fix_fields() to the byte after the leading wildcard
  int         pattern_len;	// set by fix_fields() to the string length minus 2

  // TurboBM buffers, *this is owner
  // (both are obtained from thd->alloc() in fix_fields())
  int* bmGs; //   good suffix shift table, size is pattern_len + 1
  int* bmBc; // bad character shift table, size is alphabet_size

  // Table builders (the good suffix builder calls turboBM_compute_suffixes())
  // and the search routine used by val_int().
  void turboBM_compute_suffixes(int* suff);
  void turboBM_compute_good_suffix_shifts(int* suff);
  void turboBM_compute_bad_character_shifts();
  bool turboBM_matches(const char* text, int text_len) const;
  enum { alphabet_size = 256 };	// number of entries in bmBc

public:
  // Starts with the TurboBM fast path disabled and all table pointers null.
  // NOTE(review): the 'Item_func_like::' qualification inside the class body
  // is rejected by standard-conforming compilers -- confirm the toolchain.
  Item_func_like::Item_func_like(Item *a,Item *b, char* escape_arg) :
    Item_bool_func2(a,b),
    escape(*escape_arg),
    canDoTurboBM(false),
    pattern(0),
    pattern_len(0),
    bmGs(0),
    bmBc(0)
  {}

  longlong val_int();
  enum Functype functype() const { return LIKE_FUNC; }
  optimize_type select_optimize() const;
  cond_result eq_cmp_result() const { return COND_TRUE; }
  const char *func_name() const { return "like"; }
  void fix_length_and_dec();
  // Also prepares the TurboBM tables for constant '%literal%' patterns.
  bool fix_fields(THD *thd,struct st_table_list *tlist);
};

#ifdef USE_REGEX
Loading