Merge abarkov@bk-internal.mysql.com:/home/bk/mysql-5.0-rpl (905bc913) · Commits · Software / OSDI20 Artifacts / mariadb

mysql-test/r/ctype_ucs.result

+18 −0

Original line number	Diff line number	Diff line
		@@ -839,6 +839,24 @@ lily
		river
		drop table t1;
		deallocate prepare stmt;
		set names latin1;
		set character_set_connection=ucs2;
		select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
		soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb')
		H000 H4142 I51231
		select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
		hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb'))
		0048003000300030 00480034003100340032 004900350031003200330031
		select 'mood' sounds like 'mud';
		'mood' sounds like 'mud'
		1
		select hex(soundex(_ucs2 0x041004110412));
		hex(soundex(_ucs2 0x041004110412))
		0410003000300030
		select hex(soundex(_ucs2 0x00BF00C0));
		hex(soundex(_ucs2 0x00BF00C0))
		00C0003000300030
		set names latin1;
		create table t1(a blob, b text charset utf8, c text charset ucs2);
		select data_type, character_octet_length, character_maximum_length
		from information_schema.columns where table_name='t1';

mysql-test/r/ctype_utf8.result

+12 −0

Original line number	Diff line number	Diff line
		@@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test');
		id a
		1 Test
		drop table t1;
		select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
		soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)
		阅000
		select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
		hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB))
		E99885303030
		select soundex(_utf8 0xD091D092D093);
		soundex(_utf8 0xD091D092D093)
		Б000
		select hex(soundex(_utf8 0xD091D092D093));
		hex(soundex(_utf8 0xD091D092D093))
		D091303030
		SET collation_connection='utf8_general_ci';
		create table t1 select repeat('a',4000) a;
		delete from t1;

mysql-test/t/ctype_ucs.test

+14 −0

Original line number	Diff line number	Diff line
		@@ -572,6 +572,20 @@ select utext from t1 where utext like '%%';
		drop table t1;
		deallocate prepare stmt;

		#
		# Bug#22638 SOUNDEX broken for international characters
		#
		# The string literals below are converted to ucs2 through
		# character_set_connection, so SOUNDEX() receives two-byte characters
		# and must return a ucs2 result as well.
		set names latin1;
		set character_set_connection=ucs2;
		# Readable form first, then HEX() of the same values so that the
		# two-byte encoding of every output character is visible in the
		# recorded result.
		select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
		select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
		# SOUNDS LIKE compares the SOUNDEX values of both operands.
		select 'mood' sounds like 'mud';
		# Cyrillic A, BE, VE
		# (U+0410 U+0411 U+0412: the first letter is kept, the rest is '0' padding)
		select hex(soundex(_ucs2 0x041004110412));
		# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter
		# (the result has to start with U+00C0, the first real letter of the input)
		select hex(soundex(_ucs2 0x00BF00C0));
		# Restore the connection character set for the tests that follow.
		set names latin1;

		#
		# Bug #14290: character_maximum_length for text fields
		#

mysql-test/t/ctype_utf8.test

+8 −0

Original line number	Diff line number	Diff line
		@@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST');
		select * from t1 where soundex(a) = soundex('test');
		drop table t1;

		#
		# Bug#22638 SOUNDEX broken for international characters
		#
		# Input consisting only of three-byte UTF-8 characters: the first
		# character (bytes E9 98 85) must be copied whole, followed by '000'.
		select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
		# Same value through HEX(), to make the byte layout of the result visible.
		select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
		# Two-byte UTF-8 input: Cyrillic BE, VE, GHE (D0 91, D0 92, D0 93).
		select soundex(_utf8 0xD091D092D093);
		select hex(soundex(_utf8 0xD091D092D093));


		SET collation_connection='utf8_general_ci';
		-- source include/ctype_filesort.inc

sql/item_strfunc.cc

+106 −29

Original line number	Diff line number	Diff line
		@@ -1805,7 +1805,8 @@ void Item_func_soundex::fix_length_and_dec()
		{
		collation.set(args[0]->collation);
		max_length=args[0]->max_length;
		set_if_bigger(max_length,4);
		set_if_bigger(max_length, 4 * collation.collation->mbminlen);
		tmp_value.set_charset(collation.collation);
		}


		@@ -1815,14 +1816,15 @@ void Item_func_soundex::fix_length_and_dec()
		else return 0
		*/

		/*
		  Upper-case a Basic Latin letter given as a character code.

		  Only 'a'..'z' are mapped to 'A'..'Z'; every other code is returned
		  unchanged. Both parameter and result are int (not char) because the
		  callers pass wide character codes, which a char would truncate.
		*/
		static int soundex_toupper(int ch)
		{
		  if (ch >= 'a' && ch <= 'z')
		    return ch - 'a' + 'A';
		  return ch;
		}

		static char get_scode(char *ptr)

		static char get_scode(int wc)
		{
		uchar ch= soundex_toupper(*ptr);
		int ch= soundex_toupper(wc);
		if (ch < 'A' \|\| ch > 'Z')
		{
		// Thread extended alfa (country spec)
		@@ -1832,46 +1834,121 @@ static char get_scode(char *ptr)
		}


		/*
		  Coarse "is this a letter?" test for a Unicode code point.

		  Returns true for:
		  - all Basic Latin letters: a..z A..Z;
		  - every code point at or above U+00C0
		    ("U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter
		    after 'z'; the characters between 'z' and U+00C0 are controls and
		    punctuation).

		  Returns false for everything else.

		  NOTE: the upper range is an approximation: any code point >= U+00C0
		  is accepted, which also includes a few non-letters such as
		  "U+00D7 MULTIPLICATION SIGN".
		*/
		static bool my_uni_isalpha(int wc)
		{
		  return (wc >= 'a' && wc <= 'z') ||
		         (wc >= 'A' && wc <= 'Z') ||
		         (wc >= 0xC0);
		}


		/*
		  Compute SOUNDEX of the first argument, character by character.

		  The input is decoded with the collation's mb_wc() so that multi-byte
		  and wide character sets are handled per character rather than per byte.

		  Result layout: the first letter of the input (upper-cased if it is a
		  Basic Latin letter), followed by the sound code of every subsequent
		  letter that is neither '0' nor equal to the previous code, padded with
		  DIGIT ZERO up to four characters.

		  RETURN
		    0                 the argument is NULL (null_value is set)
		    &my_empty_string  the input contains no letter, or an invalid byte
		                      sequence was met before the first letter
		    str               the scratch buffer could not be allocated
		    &tmp_value        otherwise
		*/
		String *Item_func_soundex::val_str(String *str)
		{
		  DBUG_ASSERT(fixed == 1);
		  String *res= args[0]->val_str(str);
		  char last_ch, ch;
		  CHARSET_INFO *cs= collation.collation;
		  my_wc_t wc;
		  uint nchars;
		  int rc;

		  if ((null_value= args[0]->null_value))
		    return 0; /* purecov: inspected */

		  /*
		    Every code written after the first letter takes mbminlen bytes and
		    corresponds to at least that many input bytes; on top of that, up to
		    three padding characters may be appended. Hence the result never
		    exceeds length + 3 * mbminlen bytes. Reserve length + 4 * mbminlen so
		    that the request is never smaller than max(length, 4 * mbminlen).
		  */
		  if (tmp_value.alloc(res->length() + 4 * cs->mbminlen))
		    return str; /* purecov: inspected */
		  char *to= (char *) tmp_value.ptr();
		  char *to_end= to + tmp_value.alloced_length();
		  char *from= (char *) res->ptr(), *end= from + res->length();
		  tmp_value.set_charset(cs);

		  for ( ; ; ) /* Skip pre-space */
		  {
		    if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
		      return &my_empty_string; /* EOL or invalid byte sequence */

		    if (rc == 1 && cs->ctype)
		    {
		      /* Single byte character: classify it with the charset's ctype table */
		      if (my_isalpha(cs, *from))
		      {
		        /* Single byte letter found */
		        last_ch= get_scode(*from);       // Code of the first letter
		        *to++= soundex_toupper(*from++); // Copy first letter
		        break;
		      }
		      from++;
		    }
		    else
		    {
		      from+= rc;
		      if (my_uni_isalpha(wc))
		      {
		        /* Multibyte letter found */
		        wc= soundex_toupper(wc);
		        last_ch= get_scode(wc);          // Code of the first letter
		        if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0)
		        {
		          /* Extra safety - should not really happen */
		          DBUG_ASSERT(false);
		          return &my_empty_string;
		        }
		        to+= rc;
		        break;
		      }
		    }
		  }

		  /*
		    last_ch is now set to the first 'double-letter' check.
		    loop on input letters until end of input
		  */
		  for (nchars= 1 ; ; )
		  {
		    if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
		      break; /* EOL or invalid byte sequence */

		    if (rc == 1 && cs->ctype)
		    {
		      if (!my_isalpha(cs, *from++))
		        continue;
		    }
		    else
		    {
		      from+= rc;
		      if (!my_uni_isalpha(wc))
		        continue;
		    }

		    ch= get_scode(wc);
		    if ((ch != '0') && (ch != last_ch)) // if not skipped or double
		    {
		      // letter, copy to output
		      if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch,
		                               (uchar*) to, (uchar*) to_end)) <= 0)
		      {
		        // Extra safety - should not really happen
		        DBUG_ASSERT(false);
		        break;
		      }
		      to+= rc;
		      nchars++;
		      last_ch= ch;  // save code of last input letter
		    }               // for next double-letter check
		  }

		  /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */
		  if (nchars < 4)
		  {
		    uint nbytes= (4 - nchars) * cs->mbminlen;
		    cs->cset->fill(cs, to, nbytes, '0');
		    to+= nbytes;
		  }

		  tmp_value.length((uint) (to - tmp_value.ptr()));
		  return &tmp_value;
		}