Commit 3cc69d5a authored by serg@serg.mysql.com's avatar serg@serg.mysql.com
Browse files

phrase search

parent 08384a3a
Loading
Loading
Loading
Loading
+57 −22
Original line number Diff line number Diff line
@@ -59,6 +59,7 @@ static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
typedef struct st_ftb_expr FTB_EXPR;
struct st_ftb_expr {
  FTB_EXPR *up;
  byte     *quot, *qend;
  float     weight;
  uint      flags;
  my_off_t  docid[2];             /* for index search and for scan */
@@ -126,6 +127,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
    return;

  param.prev=' ';
  param.quot=up->quot;
  while ((res=ft_get_word(start,end,&w,&param)))
  {
    int   r=param.plusminus;
@@ -149,7 +151,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
        ftbw->word[0]=w.len;
        if (param.yesno > 0) up->ythresh++;
        queue_insert(& ftb->queue, (byte *)ftbw);
        ftb->with_scan|=param.trunc;
        ftb->with_scan|=(param.trunc & FTB_FLAG_TRUNC);
        break;
      case 2: /* left bracket */
        ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR));
@@ -160,10 +162,12 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
        ftbe->up=up;
        ftbe->ythresh=ftbe->yweaks=0;
        ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
        if ((ftbe->quot=param.quot)) ftb->with_scan|=2;
        if (param.yesno > 0) up->ythresh++;
        _ftb_parse_query(ftb, start, end, ftbe, depth+1);
        break;
      case 3: /* right bracket */
        if (up->quot) up->qend=param.quot;
        return;
    }
  }
@@ -260,7 +264,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
  ftbe->weight=1;
  ftbe->flags=FTB_FLAG_YES;
  ftbe->nos=1;
  ftbe->up=0;
  ftbe->quot=ftbe->up=0;
  ftbe->ythresh=ftbe->yweaks=0;
  ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
  ftb->root=ftbe;
@@ -270,16 +274,39 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
  memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
  qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
                              (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
  if (ftb->queue.elements<2) ftb->with_scan=0;
  if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
  ftb->state=READY;
  return ftb;
}

void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode)
/*
  Case-insensitive substring test on two byte ranges.

  s0, e0   start and one-past-the-end of the text that is searched
  s1, e1   start and one-past-the-end of the pattern looked for
  cs       character set; bytes are compared after folding each one
           through cs->to_upper

  Returns 1 if [s1, e1) occurs somewhere inside [s0, e0), 0 otherwise.
  An empty pattern (s1 >= e1) is treated as contained in any text,
  following the strstr() convention, and is never dereferenced.
*/
int _ftb_strstr(const byte *s0, const byte *e0,
                const byte *s1, const byte *e1,
                CHARSET_INFO *cs)
{
  const byte *h, *p;
  long pattern_len;

  if (s1 >= e1)
    return 1;
  pattern_len= (long) (e1 - s1);

  /*
    Try every start position that still leaves room for the whole pattern.
    Restarting from the next byte after a failed attempt (instead of from
    where the attempt stopped) is what keeps overlapping candidates such
    as "aab" inside "aaab" from being skipped.
  */
  for (; (long) (e0 - s0) >= pattern_len; s0++)
  {
    for (h= s0, p= s1; p < e1; h++, p++)
    {
      /*
        The cast keeps the table index non-negative in case byte is a
        signed type (its typedef is not visible from this file).
      */
      if (cs->to_upper[(unsigned char) *h] != cs->to_upper[(unsigned char) *p])
        break;
    }
    /* p reaches e1 only when every pattern byte compared equal */
    if (p == e1)
      return 1;
  }
  return 0;
}

void _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
{
  FT_SEG_ITERATOR ftsi;
  FTB_EXPR *ftbe;
  float weight=ftbw->weight;
  int  yn=ftbw->flags, ythresh;
  int  yn=ftbw->flags, ythresh, mode=(ftsi_orig != 0);
  my_off_t curdoc=ftbw->docid[mode];

  for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
@@ -300,6 +327,20 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode)
      {
        yn=ftbe->flags;
        weight=ftbe->cur_weight*ftbe->weight;
        if (mode && ftbe->quot)
        {
          int not_found=1;

          memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
          while (_mi_ft_segiterator(&ftsi) && not_found)
          {
            if (!ftsi.pos)
              continue;
            not_found = ! _ftb_strstr(ftsi.pos, ftsi.pos+ftsi.len,
                                      ftbe->quot, ftbe->qend, ftb->charset);
          }
          if (not_found) break;
        } /* ftbe->quot */
      }
      else
        break;
@@ -356,7 +397,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
  {
    while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
    {
      _ftb_climb_the_tree(ftbw,0);
      _ftb_climb_the_tree(ftb, ftbw, 0);

      /* update queue */
      r=_mi_search(info, keyinfo, (uchar*) ftbw->word, USE_WHOLE_KEY,
@@ -414,7 +455,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
  FT_WORD word;
  FTB_WORD *ftbw;
  FTB_EXPR *ftbe;
  FT_SEG_ITERATOR ftsi;
  FT_SEG_ITERATOR ftsi, ftsi2;
  const byte *end;
  my_off_t  docid=ftb->info->lastpos;

@@ -423,17 +464,11 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
  if (!ftb->queue.elements)
    return 0;

#if NOT_USED
  if (ftb->state == READY || ftb->state == INDEX_DONE)
    ftb->state=SCAN;
  else if (ftb->state != SCAN)
    return -3.0;
#endif

  if (ftb->keynr==NO_SUCH_KEY)
    _mi_ft_segiterator_dummy_init(record, length, &ftsi);
  else
    _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
  memcpy(&ftsi2, &ftsi, sizeof(ftsi));

  while (_mi_ft_segiterator(&ftsi))
  {
@@ -464,7 +499,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
        if (ftbw->docid[1] == docid)
          continue;
        ftbw->docid[1]=docid;
        _ftb_climb_the_tree(ftbw,1);
        _ftb_climb_the_tree(ftb, ftbw, &ftsi2);
      }
    }
  }
+11 −3
Original line number Diff line number Diff line
@@ -133,13 +133,20 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
    for (;doc<end;doc++)
    {
      if (true_word_char(*doc)) break;
      if (*doc == FTB_LBR || *doc == FTB_RBR)
      if (*doc == FTB_RQUOT && param->quot) {
        param->quot=doc-1;
        *start=doc+1;
        return 3; /* FTB_RBR */
      }
      if ((*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
          && !param->quot)
      {
        /* param->prev=' '; */
        *start=doc+1;
        if (*doc == FTB_LQUOT) param->quot=*start;
        return (*doc == FTB_RBR)+2;
      }
      if (param->prev == ' ')
      if (param->prev == ' ' && !param->quot)
      {
        if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
        if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
@@ -149,7 +156,8 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
        if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; }
      }
      param->prev=*doc;
      param->yesno=param->plusminus=param->pmsign=0;
      param->yesno=(param->quot != 0);
      param->plusminus=param->pmsign=0;
    }

    mwc=0;
+4 −1
Original line number Diff line number Diff line
@@ -95,6 +95,8 @@ extern ulong collstat;
#define FTB_RBR   (ft_boolean_syntax[6])
#define FTB_NEG   (ft_boolean_syntax[7])
#define FTB_TRUNC (ft_boolean_syntax[8])
/*
  Delimiters of a quoted phrase in a boolean query: FTB_LQUOT opens the
  phrase and FTB_RQUOT closes it. Both are read from ft_boolean_syntax
  (slots 10 and 11).
*/
#define FTB_LQUOT (ft_boolean_syntax[10])
#define FTB_RQUOT (ft_boolean_syntax[11])

typedef struct st_ft_word {
  byte * pos;
@@ -111,6 +113,7 @@ typedef struct st_ftb_param {
  int  plusminus;
  bool pmsign;
  bool trunc;
  byte *quot;
} FTB_PARAM;

int is_stopword(char *word, uint len);
@@ -132,7 +135,7 @@ uint _mi_ft_segiterator(FT_SEG_ITERATOR *);

void ft_parse_init(TREE *, CHARSET_INFO *);
int ft_parse(TREE *, byte *, int);
FT_WORD * ft_linearize(/*MI_INFO *, uint, byte *, */TREE *);
FT_WORD * ft_linearize(TREE *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *);
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record);

+3 −0
Original line number Diff line number Diff line
@@ -67,6 +67,9 @@ Full-text indexes are called collections 1
Only MyISAM tables	support collections	2
Function MATCH ... AGAINST()	is used to do a search	0
Full-text search in MySQL	implements vector space model	0
select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);
a	b
MySQL has now support	for full-text search
select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
a	b
Full-text search in MySQL	implements vector space model
+2 −1
Original line number Diff line number Diff line
@@ -20,7 +20,6 @@ select * from t1 where MATCH(a,b) AGAINST ("indexes collections");
# UNION of fulltext's
select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes");


# boolean search

select * from t1 where MATCH(a,b) AGAINST("support -collections" IN BOOLEAN MODE);
@@ -34,6 +33,8 @@ select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN
select *, MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE) as x from t1;
select *, MATCH(a,b) AGAINST("collections support" IN BOOLEAN MODE) as x from t1;

select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);

# boolean w/o index:

select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);