Commit c706bf40 authored by sasha@mysql.sashanet.com
Browse files

use tree for count(distinct) when possible

parent a12117f0
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -57,7 +57,7 @@ typedef struct st_tree {
  void (*free)(void *);
} TREE;

	/* Functions on hole tree */
	/* Functions on whole tree */
/*
  Initialize a TREE structure before use.

  tree                tree object to set up
  default_alloc_size  allocation size hint; callers in this change pass 0
                      (NOTE(review): exact meaning of 0 is not visible
                      here - confirm against the implementation)
  element_size        size of one stored element/key
  compare             qsort_cmp2 callback used to order elements
  with_delete         flag; presumably enables deletion of single
                      elements - confirm against the implementation
  free_element        optional callback taking the element pointer;
                      presumably invoked when an element is released -
                      confirm against the implementation
*/
void init_tree(TREE *tree,uint default_alloc_size, int element_size,
	       qsort_cmp2 compare, my_bool with_delete,
	       void (*free_element)(void*));
+1 −1
Original line number Diff line number Diff line
@@ -84,7 +84,7 @@ void init_tree(TREE *tree, uint default_alloc_size, int size,
      ((uint) size <= sizeof(void*) || ((uint) size & (sizeof(void*)-1))))
  {
    tree->offset_to_key=sizeof(TREE_ELEMENT); /* Put key after element */
    /* Fix allocation size so that we don't loose any memory */
    /* Fix allocation size so that we don't lose any memory */
    default_alloc_size/=(sizeof(TREE_ELEMENT)+size);
    if (!default_alloc_size)
      default_alloc_size=1;
+102 −1
Original line number Diff line number Diff line
@@ -788,11 +788,56 @@ String *Item_std_field::val_str(String *str)

#include "sql_select.h"

/*
  Compare two single-field keys as raw bytes.

  arg         the key length in bytes, carried inside the tree's void*
              comparator argument (it is a value, not a real pointer)
  key1, key2  the two key images to compare

  Returns the memcmp() result: <0, 0 or >0.
*/
static int simple_raw_key_cmp(void* arg, byte* key1, byte* key2)
{
  /*
    Recover the length through size_t: casting a pointer straight to int
    loses bits (and is rejected by the compiler) on platforms where
    pointers are wider than int. size_t is also what memcmp() expects.
  */
  return memcmp(key1, key2, (size_t) arg);
}

/*
  Compare two single-field string keys with my_sortcmp(), which is used
  instead of memcmp() so that charsets and case sensitivity are honoured.

  arg         the key length in bytes, carried inside the tree's void*
              comparator argument (it is a value, not a real pointer)
  key1, key2  the two key images to compare

  Returns the my_sortcmp() result; 0 means the keys compare equal.
*/
static int simple_str_key_cmp(void* arg, byte* key1, byte* key2)
{
  /*
    Go through size_t first: casting a pointer straight to int loses bits
    (and is rejected by the compiler) on platforms where pointers are
    wider than int. The final int matches what was passed before.
  */
  return my_sortcmp(key1, key2, (int) (size_t) arg);
}

// did not make this one static - at least gcc gets confused when
// I try to declare a static function as a friend. If you can figure
// out the syntax to make a static function a friend, make this one
// static
int composite_key_cmp(void* arg, byte* key1, byte* key2)
{
  Item_sum_count_distinct* item = (Item_sum_count_distinct*)arg;
  Field** field = item->table->field, **field_end;
  field_end = field + item->table->fields;
  for(; field < field_end; ++field)
    {
      int res;
      int len = (*field)->field_length;
      switch((*field)->type())
	{
	case FIELD_TYPE_STRING:
	case FIELD_TYPE_VAR_STRING:
	  res = my_sortcmp(key1, key2, len);
	  break;
	default:
	  res = memcmp(key1, key2, len);
	  break;
	}
      if(res)
	return res;
      key1 += len;
      key2 += len;
    }
  return 0;
}



/*
  Release what the item holds: the temporary table (when one was created),
  the tmp_table_param object, and the tree when the tree is in use.
*/
Item_sum_count_distinct::~Item_sum_count_distinct()
{
  if (table != 0)
  {
    free_tmp_table(current_thd, table);
  }

  delete tmp_table_param;

  if (use_tree)
  {
    delete_tree(&tree);
  }
}


@@ -821,6 +866,53 @@ bool Item_sum_count_distinct::setup(THD *thd)
			       0, 0, current_lex->options | thd->options)))
    return 1;
  table->file->extra(HA_EXTRA_NO_ROWS);		// Don't update rows

  if(table->db_type == DB_TYPE_HEAP) // no blobs, otherwise it would be
    // MyISAM
    {
      qsort_cmp2 compare_key;
      void* cmp_arg;
      int key_len;
      
      if(table->fields == 1) // if we have only one field, which is
	// the most common use of count(distinct), it is much faster
	// to use a simpler key compare method that can take advantage
	// of not having to worry about other fields
	{
	  switch(table->field[0]->type())
	    {
	      // if we have a string, we must take care of charsets
	      // and case sensitivity
	    case FIELD_TYPE_STRING:
	    case FIELD_TYPE_VAR_STRING:
	      compare_key = (qsort_cmp2)simple_str_key_cmp;
	      break;
	    default: // since at this point we cannot have blobs
	      // anything else can be compared with memcmp
	      compare_key = (qsort_cmp2)simple_raw_key_cmp;
	      break;
	    }
	  cmp_arg = (void*)(key_len = table->field[0]->field_length);
	  rec_offset = 1;
	}
      else // too bad, cannot cheat - there is more than one field
	{
	  cmp_arg = (void*)this;
	  compare_key = (qsort_cmp2)composite_key_cmp;
	  Field** field, **field_end;
	  field_end = (field = table->field) + table->fields;
	  for(key_len = 0; field < field_end; ++field)
	    {
	      key_len += (*field)->field_length;
	    }
	  rec_offset = table->reclength - key_len;
	}

      init_tree(&tree, 0, key_len, compare_key, 0, 0);
      tree.cmp_arg = cmp_arg;
      use_tree = 1;
    }
  
  return 0;
}

@@ -830,6 +922,8 @@ void Item_sum_count_distinct::reset()
  table->file->extra(HA_EXTRA_NO_CACHE);
  table->file->delete_all_rows();
  table->file->extra(HA_EXTRA_WRITE_CACHE);
  if(use_tree)
    delete_tree(&tree);
  (void) add();
}

@@ -843,7 +937,12 @@ bool Item_sum_count_distinct::add()
    if ((*field)->is_real_null(0))
      return 0;					// Don't count NULL

  if ((error=table->file->write_row(table->record[0])))
  if(use_tree)
    {
      if(!tree_insert(&tree, table->record[0] + rec_offset, 0))
	return 1;
    }
  else if ((error=table->file->write_row(table->record[0])))
  {
    if (error != HA_ERR_FOUND_DUPP_KEY &&
	error != HA_ERR_FOUND_DUPP_UNIQUE)
@@ -859,6 +958,8 @@ longlong Item_sum_count_distinct::val_int()
{
  if (!table)					// Empty query
    return LL(0);
  if(use_tree)
    return tree.elements_in_tree;
  table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK);
  return table->file->records;
}
+13 −2
Original line number Diff line number Diff line
@@ -21,6 +21,8 @@
#pragma interface			/* gcc class implementation */
#endif

#include <my_tree.h>

class Item_sum :public Item_result_field
{
public:
@@ -145,11 +147,20 @@ class Item_sum_count_distinct :public Item_sum_int
  table_map used_table_cache;
  bool fix_fields(THD *thd,TABLE_LIST *tables);
  TMP_TABLE_PARAM *tmp_table_param;
  TREE tree;
  bool use_tree; // If there are no blobs, we can use a tree, which
  // is faster than heap table. In that case, we still use the table
  // to help get things set up, but we insert nothing in it
  int rec_offset; // the first few bytes of record ( at least one)
  // are just markers for deleted and NULLs. We want to skip them since
  // they will just bloat the tree without providing any valuable info

  friend int composite_key_cmp(void* arg, byte* key1, byte* key2);
  
  public:
  Item_sum_count_distinct(List<Item> &list)
    :Item_sum_int(list),table(0),used_table_cache(~(table_map) 0),
    tmp_table_param(0)
     tmp_table_param(0),use_tree(0)
  { quick_group=0; }
  ~Item_sum_count_distinct();
  table_map used_tables() const { return used_table_cache; }