merged (a40774fb) · Commits · Software / OSDI20 Artifacts / mariadb

Docs/manual.texi

+80 −38

Original line number	Diff line number	Diff line
		@@ -29292,42 +29292,36 @@ index.
		Full-text search is performed with the @code{MATCH} function.

		@example
		mysql> CREATE TABLE t (a VARCHAR(200), b TEXT, FULLTEXT (a,b));
		mysql> CREATE TABLE articles (
		-> id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
		-> title VARCHAR(200),
		-> body TEXT,
		-> FULLTEXT (title,body)
		-> );
		Query OK, 0 rows affected (0.00 sec)

		mysql> INSERT INTO t VALUES
		-> ('MySQL has now support', 'for full-text search'),
		-> ('Full-text indexes', 'are called collections'),
		-> ('Only MyISAM tables','support collections'),
		-> ('Function MATCH ... AGAINST()','is used to do a search'),
		-> ('Full-text search in MySQL', 'implements vector space model');
		mysql> INSERT INTO articles VALUES
		-> (0,'MySQL Tutorial', 'DBMS stands for DataBase Management ...'),
		-> (0,'How To Use MySQL Efficiently', 'After you went through a ...'),
		-> (0,'Optimizing MySQL','In this tutorial we will show how to ...'),
		-> (0,'1001 MySQL Trick','1. Never run mysqld as root. 2. Normalize ...'),
		-> (0,'MySQL vs. YourSQL', 'In the following database comparison we ...'),
		-> (0,'MySQL Security', 'When configured properly, MySQL could be ...');
		Query OK, 6 rows affected (0.00 sec)
		Records: 6 Duplicates: 0 Warnings: 0

		mysql> SELECT * FROM t WHERE MATCH (a,b) AGAINST ('MySQL');
		+---------------------------+-------------------------------+
		\| a \| b \|
		+---------------------------+-------------------------------+
		\| MySQL has now support \| for full-text search \|
		\| Full-text search in MySQL \| implements vector-space-model \|
		+---------------------------+-------------------------------+
		mysql> SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('database');
		+----+-------------------+---------------------------------------------+
		\| id \| title \| body \|
		+----+-------------------+---------------------------------------------+
		\| 5 \| MySQL vs. YourSQL \| In the following database comparison we ... \|
		\| 1 \| MySQL Tutorial \| DBMS stands for DataBase Management ... \|
		+----+-------------------+---------------------------------------------+
		2 rows in set (0.00 sec)

		mysql> SELECT *,MATCH a,b AGAINST ('collections support') as x FROM t;
		+------------------------------+-------------------------------+--------+
		\| a \| b \| x \|
		+------------------------------+-------------------------------+--------+
		\| MySQL has now support \| for full-text search \| 0.3834 \|
		\| Full-text indexes \| are called collections \| 0.3834 \|
		\| Only MyISAM tables \| support collections \| 0.7668 \|
		\| Function MATCH ... AGAINST() \| is used to do a search \| 0 \|
		\| Full-text search in MySQL \| implements vector space model \| 0 \|
		+------------------------------+-------------------------------+--------+
		5 rows in set (0.00 sec)
		@end example

		The function @code{MATCH} matches a natural language query @code{AGAINST}
		a text collection (which is simply the columns that are covered by a
		a text collection (which is simply the set of columns covered by a
		@code{FULLTEXT} index). For every row in a table it returns relevance -
		a similarity measure between the text in that row (in the columns that are
		part of the collection) and the query. When it is used in a @code{WHERE}
		@@ -29338,10 +29332,51 @@ number of words in the row, the number of unique words in that row, the
		total number of words in the collection, and the number of documents (rows)
		that contain a particular word.

		MySQL uses a very simple parser to split text into words. A ``word'' is
		any sequence of letters, numbers, @samp{'}, and @samp{_}. Any ``word''
		that is present in the stopword list or just too short (3 characters
		or less) is ignored.
		The above is a basic example of using the @code{MATCH} function. Rows are
		returned in order of decreasing relevance.

		@example
		mysql> SELECT id,MATCH (title,body) AGAINST ('Tutorial') FROM articles;
		+----+-----------------------------------------+
		\| id \| MATCH (title,body) AGAINST ('Tutorial') \|
		+----+-----------------------------------------+
		\| 1 \| 0.64840710366884 \|
		\| 2 \| 0 \|
		\| 3 \| 0.66266459031789 \|
		\| 4 \| 0 \|
		\| 5 \| 0 \|
		\| 6 \| 0 \|
		+----+-----------------------------------------+
		6 rows in set (0.00 sec)
		@end example

		This example shows how to retrieve the relevance values. As neither a
		@code{WHERE} nor an @code{ORDER BY} clause is present, the returned rows
		are not ordered.

		@example
		mysql> SELECT id, body, MATCH (title,body) AGAINST (
		-> 'Security implications of running MySQL as root') AS score
		-> FROM articles WHERE MATCH (title,body) AGAINST
		-> ('Security implications of running MySQL as root');
		+----+-----------------------------------------------+-----------------+
		\| id \| body \| score \|
		+----+-----------------------------------------------+-----------------+
		\| 4 \| 1. Never run mysqld as root. 2. Normalize ... \| 1.5055546709332 \|
		\| 6 \| When configured properly, MySQL could be ... \| 1.31140957288 \|
		+----+-----------------------------------------------+-----------------+
		2 rows in set (0.00 sec)
		@end example

		This is a more complex example: the query returns the relevance values and
		still sorts the rows in order of decreasing relevance. To achieve this,
		@code{MATCH} must be specified twice. Note that this causes no additional
		overhead, as the @strong{MySQL} optimizer will notice that the two
		@code{MATCH} calls are identical and will invoke the full-text search code
		only once.

		@strong{MySQL} uses a very simple parser to split text into words. A
		``word'' is any sequence of letters, numbers, @samp{'}, and @samp{_}. Any
		``word'' that is present in the stopword list or just too short (3
		characters or less) is ignored.

		Every correct word in the collection and in the query is weighted,
		according to its significance in the query or collection. This way, a
		@@ -29356,17 +29391,22 @@ carefully tuned this way). For very small tables, word distribution
		does not reflect adequately their semantical value, and this model
		may sometimes produce bizarre results.

		For example, search for the word "search" will produce no results in the
		above example. Word "search" is present in more than half of rows, and
		as such, is effectively treated as a stopword (that is, with semantical value
		zero). It is, really, the desired behavior - a natural language query
		should not return every other row in 1GB table.
		@example
		mysql> SELECT * FROM articles WHERE MATCH (title,body) AGAINST ('MySQL');
		Empty set (0.00 sec)
		@end example

		A search for the word @code{MySQL} produces no results in the above example.
		The word @code{MySQL} is present in more than half of the rows and, as such,
		is effectively treated as a stopword (that is, a word with a semantical
		value of zero). This is, in fact, the desired behavior - a natural language
		query should not return every second row of a 1GB table.

		A word that matches half of the rows in a table is less likely to locate relevant
		documents. In fact, it will most likely find plenty of irrelevant documents.
		We all know this happens far too often when we are trying to find something on
		the Internet with a search engine. It is with this reasoning that such words
		have been assigned a low semantical value in @strong{a particular dataset}.
		have been assigned a low semantical value in @strong{this particular dataset}.

		@menu
		* Fulltext Fine-tuning::
		@@ -44222,6 +44262,9 @@ not yet 100% confident in this code.
		@appendixsubsec Changes in release 3.23.38
		@itemize @bullet
		@item
		Fixed a bug where a @code{SELECT} from a @code{MERGE} table
		sometimes returned incorrectly ordered rows.
		@item
		Fixed a bug in @code{REPLACE()} when using the ujis character set.
		@item
		Applied Sleepycat BDB patches 3.2.9.1 and 3.2.9.2.
		@@ -50083,7 +50126,6 @@ Start the @code{mysqld} server with a trace log in @file{/tmp/mysqld.trace}

		On Windows you should also use the @code{--standalone} flag to not start
		@code{mysqld} as a service.

		Note that the trace file will get very @emph{BIG}!

		If you want to have a smaller trace file, you can use something like:

myisammrg/mymrgdef.h

+0 −1

Original line number	Diff line number	Diff line
		@@ -29,4 +29,3 @@ extern pthread_mutex_t THR_LOCK_open;
		#endif

		int _myrg_init_queue(MYRG_INFO *info,int inx,enum ha_rkey_function search_flag);
		int _myrg_finish_scan(MYRG_INFO *info, int inx, enum ha_rkey_function type);

myisammrg/myrg_rkey.c

+2 −9

Original line number	Diff line number	Diff line
		@@ -44,7 +44,6 @@ int myrg_rkey(MYRG_INFO *info,byte *record,int inx, const byte *key,
		MYRG_TABLE *table;
		MI_INFO *mi;
		int err;
		byte *buf=((search_flag == HA_READ_KEY_EXACT) ? record: 0);
		LINT_INIT(key_buff);
		LINT_INIT(pack_key_length);

		@@ -57,14 +56,14 @@ int myrg_rkey(MYRG_INFO *info,byte *record,int inx, const byte *key,

		if (table == info->open_tables)
		{
		err=mi_rkey(mi,buf,inx,key,key_len,search_flag);
		err=mi_rkey(mi,0,inx,key,key_len,search_flag);
		key_buff=(byte*) mi->lastkey+mi->s->base.max_key_length;
		pack_key_length=mi->last_rkey_length;
		}
		else
		{
		mi->use_packed_key=1;
		err=mi_rkey(mi,buf,inx,key_buff,pack_key_length,search_flag);
		err=mi_rkey(mi,0,inx,key_buff,pack_key_length,search_flag);
		mi->use_packed_key=0;
		}
		info->last_used_table=table+1;
		@@ -78,12 +77,6 @@ int myrg_rkey(MYRG_INFO *info,byte *record,int inx, const byte *key,
		/* adding to queue */
		queue_insert(&(info->by_key),(byte *)table);

		/* if looking for KEY_EXACT, return first matched now */
		if (buf)
		{
		info->current_table=table;
		return 0;
		}
		}

		if (!info->by_key.elements)

myisammrg/myrg_rnext.c

+4 −41

Original line number	Diff line number	Diff line
		@@ -29,7 +29,11 @@ int myrg_rnext(MYRG_INFO *info, byte *buf, int inx)
		if ((err=mi_rnext(info->current_table->table,NULL,inx)))
		{
		if (err == HA_ERR_END_OF_FILE)
		{
		queue_remove(&(info->by_key),0);
		if (!info->by_key.elements)
		return HA_ERR_END_OF_FILE;
		}
		else
		return err;
		}
		@@ -40,48 +44,7 @@ int myrg_rnext(MYRG_INFO *info, byte *buf, int inx)
		queue_replaced(&(info->by_key));
		}

		/* next, let's finish myrg_rkey's initial scan */
		if ((err=_myrg_finish_scan(info, inx, HA_READ_KEY_OR_NEXT)))
		return err;

		if (!info->by_key.elements)
		return HA_ERR_END_OF_FILE;

		/* now, mymerge's read_next is as simple as one queue_top */
		mi=(info->current_table=(MYRG_TABLE *)queue_top(&(info->by_key)))->table;
		return mi_rrnd(mi,buf,mi->lastpos);
		}


		/*
		  Finish the initial scan started by myrg_rkey().

		  Starting at info->last_used_table, runs mi_rkey() with search mode
		  'type' on index 'inx' for every remaining table up to info->end_table,
		  re-using the packed key saved by the previously searched table, and
		  inserts each table that produced a match into the info->by_key queue.

		  Returns 0 on success (including when there was nothing left to scan),
		  or the first mi_rkey() error other than HA_ERR_KEY_NOT_FOUND. On such
		  an early error return info->last_used_table is NOT advanced.
		*/

		int _myrg_finish_scan(MYRG_INFO *info, int inx, enum ha_rkey_function type)
		{
		  int err;
		  MYRG_TABLE *table=info->last_used_table;
		  if (table < info->end_table)
		  {
		    /*
		      NOTE(review): table[-1] assumes last_used_table already points
		      past at least one searched table (myrg_rkey sets it to table+1);
		      confirm no caller reaches here before myrg_rkey() has run.
		    */
		    MI_INFO *mi= table[-1].table;
		    /*
		      The packed key is stored right after lastkey; this must be a
		      pointer into that buffer, not a single byte value.
		    */
		    byte *key_buff=(byte*) mi->lastkey+mi->s->base.max_key_length;
		    uint pack_key_length= mi->last_rkey_length;

		    for (; table < info->end_table ; table++)
		    {
		      mi=table->table;
		      /* key_buff is already in packed form, tell mi_rkey not to repack */
		      mi->use_packed_key=1;
		      err=mi_rkey(mi,NULL,inx,key_buff,pack_key_length,type);
		      mi->use_packed_key=0;
		      if (err)
		      {
		        if (err == HA_ERR_KEY_NOT_FOUND)	/* No match in this table */
		          continue;
		        return err;
		      }
		      /* Found here, adding to queue */
		      queue_insert(&(info->by_key),(byte *) table);
		    }
		    /* All tables are now used */
		    info->last_used_table=table;
		  }
		  return 0;
		}

myisammrg/myrg_rprev.c

+4 −9

Original line number	Diff line number	Diff line
		@@ -29,7 +29,11 @@ int myrg_rprev(MYRG_INFO *info, byte *buf, int inx)
		if ((err=mi_rprev(info->current_table->table,NULL,inx)))
		{
		if (err == HA_ERR_END_OF_FILE)
		{
		queue_remove(&(info->by_key),0);
		if (!info->by_key.elements)
		return HA_ERR_END_OF_FILE;
		}
		else
		return err;
		}
		@@ -40,16 +44,7 @@ int myrg_rprev(MYRG_INFO *info, byte *buf, int inx)
		queue_replaced(&(info->by_key));
		}

		/* next, let's finish myrg_rkey's initial scan */
		if ((err=_myrg_finish_scan(info, inx, HA_READ_KEY_OR_PREV)))
		return err;

		if (!info->by_key.elements)
		return HA_ERR_END_OF_FILE;

		/* now, mymerge's read_prev is as simple as one queue_top */
		mi=(info->current_table=(MYRG_TABLE *)queue_top(&(info->by_key)))->table;
		return mi_rrnd(mi,buf,mi->lastpos);
		}