Loading mysql-test/r/ctype_ucs.result +18 −0 Original line number Diff line number Diff line Loading @@ -839,6 +839,24 @@ lily river drop table t1; deallocate prepare stmt; set names latin1; set character_set_connection=ucs2; select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb') H000 H4142 I51231 select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb')) 0048003000300030 00480034003100340032 004900350031003200330031 select 'mood' sounds like 'mud'; 'mood' sounds like 'mud' 1 select hex(soundex(_ucs2 0x041004110412)); hex(soundex(_ucs2 0x041004110412)) 0410003000300030 select hex(soundex(_ucs2 0x00BF00C0)); hex(soundex(_ucs2 0x00BF00C0)) 00C0003000300030 set names latin1; create table t1(a blob, b text charset utf8, c text charset ucs2); select data_type, character_octet_length, character_maximum_length from information_schema.columns where table_name='t1'; Loading mysql-test/r/ctype_utf8.result +12 −0 Original line number Diff line number Diff line Loading @@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test'); id a 1 Test drop table t1; select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB) 阅000 select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)) E99885303030 select soundex(_utf8 0xD091D092D093); soundex(_utf8 0xD091D092D093) Б000 select hex(soundex(_utf8 0xD091D092D093)); hex(soundex(_utf8 0xD091D092D093)) D091303030 SET collation_connection='utf8_general_ci'; create table t1 select repeat('a',4000) a; delete from t1; Loading mysql-test/t/ctype_ucs.test +14 −0 Original line number Diff line number Diff line Loading @@ -572,6 +572,20 @@ select utext from t1 where utext like '%%'; drop table t1; deallocate prepare stmt; # # Bug#22638 SOUNDEX broken for international characters # set names latin1; set character_set_connection=ucs2; select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); select 'mood' sounds like 'mud'; # Cyrillic A, BE, VE select hex(soundex(_ucs2 0x041004110412)); # Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter select hex(soundex(_ucs2 0x00BF00C0)); set names latin1; # # Bug #14290: character_maximum_length for text fields # Loading mysql-test/t/ctype_utf8.test +8 −0 Original line number Diff line number Diff line Loading @@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST'); select * from t1 where soundex(a) = soundex('test'); drop table t1; # # Bug#22638 SOUNDEX broken for international characters # select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); select soundex(_utf8 0xD091D092D093); select hex(soundex(_utf8 0xD091D092D093)); SET collation_connection='utf8_general_ci'; -- source include/ctype_filesort.inc Loading sql/item_strfunc.cc +106 −29 Original line number Diff line number Diff line Loading @@ -1805,7 +1805,8 @@ void Item_func_soundex::fix_length_and_dec() { collation.set(args[0]->collation); max_length=args[0]->max_length; set_if_bigger(max_length,4); set_if_bigger(max_length, 4 * collation.collation->mbminlen); tmp_value.set_charset(collation.collation); } Loading @@ -1815,14 +1816,15 @@ void Item_func_soundex::fix_length_and_dec() else return 0 */ static char soundex_toupper(char ch) static int soundex_toupper(int ch) { return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch; } static char get_scode(char *ptr) static char get_scode(int wc) { uchar ch= soundex_toupper(*ptr); int ch= soundex_toupper(wc); if (ch < 'A' || ch > 'Z') { // Thread extended alfa (country spec) Loading @@ -1832,46 +1834,121 @@ static char get_scode(char *ptr) } static bool my_uni_isalpha(int wc) { /* Return true for all Basic Latin letters: a..z A..Z. Return true for all Unicode characters with code higher than U+00C0: - characters between 'z' and U+00C0 are controls and punctuations. - "U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter after 'z'. */ return (wc >= 'a' && wc <= 'z') || (wc >= 'A' && wc <= 'Z') || (wc >= 0xC0); } String *Item_func_soundex::val_str(String *str) { DBUG_ASSERT(fixed == 1); String *res =args[0]->val_str(str); char last_ch,ch; CHARSET_INFO *cs= collation.collation; my_wc_t wc; uint nchars; int rc; if ((null_value= args[0]->null_value)) return 0; /* purecov: inspected */ if (tmp_value.alloc(max(res->length(),4))) if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen))) return str; /* purecov: inspected */ char *to= (char *) tmp_value.ptr(); char *to_end= to + tmp_value.alloced_length(); char *from= (char *) res->ptr(), *end= from + res->length(); tmp_value.set_charset(cs); while (from != end && !my_isalpha(cs,*from)) // Skip pre-space from++; /* purecov: inspected */ if (from == end) return &my_empty_string; // No alpha characters. *to++ = soundex_toupper(*from); // Copy first letter last_ch = get_scode(from); // code of the first letter // for the first 'double-letter check. // Loop on input letters until // end of input (null) or output // letter code count = 3 for (from++ ; from < end ; from++) { if (!my_isalpha(cs,*from)) for ( ; ; ) /* Skip pre-space */ { if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) return &my_empty_string; /* EOL or invalid byte sequence */ if (rc == 1 && cs->ctype) { /* Single byte letter found */ if (my_isalpha(cs, *from)) { last_ch= get_scode(*from); // Code of the first letter *to++= soundex_toupper(*from++); // Copy first letter break; } from++; } else { from+= rc; if (my_uni_isalpha(wc)) { /* Multibyte letter found */ wc= soundex_toupper(wc); last_ch= get_scode(wc); // Code of the first letter if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0) { /* Extra safety - should not really happen */ DBUG_ASSERT(false); return &my_empty_string; } to+= rc; break; } } } /* last_ch is now set to the first 'double-letter' check. loop on input letters until end of input */ for (nchars= 1 ; ; ) { if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) break; /* EOL or invalid byte sequence */ if (rc == 1 && cs->ctype) { if (!my_isalpha(cs, *from++)) continue; } else { from+= rc; if (!my_uni_isalpha(wc)) continue; ch=get_scode(from); } ch= get_scode(wc); if ((ch != '0') && (ch != last_ch)) // if not skipped or double { *to++ = ch; // letter, copy to output // letter, copy to output if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch, (uchar*) to, (uchar*) to_end)) <= 0) { // Extra safety - should not really happen DBUG_ASSERT(false); break; } to+= rc; nchars++; last_ch= ch; // save code of last input letter } // for next double-letter check } for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++) *to = '0'; *to=0; // end string /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */ if (nchars < 4) { uint nbytes= (4 - nchars) * cs->mbminlen; cs->cset->fill(cs, to, nbytes, '0'); to+= nbytes; } tmp_value.length((uint) (to-tmp_value.ptr())); return &tmp_value; } Loading Loading
mysql-test/r/ctype_ucs.result +18 −0 Original line number Diff line number Diff line Loading @@ -839,6 +839,24 @@ lily river drop table t1; deallocate prepare stmt; set names latin1; set character_set_connection=ucs2; select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb') H000 H4142 I51231 select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb')) 0048003000300030 00480034003100340032 004900350031003200330031 select 'mood' sounds like 'mud'; 'mood' sounds like 'mud' 1 select hex(soundex(_ucs2 0x041004110412)); hex(soundex(_ucs2 0x041004110412)) 0410003000300030 select hex(soundex(_ucs2 0x00BF00C0)); hex(soundex(_ucs2 0x00BF00C0)) 00C0003000300030 set names latin1; create table t1(a blob, b text charset utf8, c text charset ucs2); select data_type, character_octet_length, character_maximum_length from information_schema.columns where table_name='t1'; Loading
mysql-test/r/ctype_utf8.result +12 −0 Original line number Diff line number Diff line Loading @@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test'); id a 1 Test drop table t1; select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB) 阅000 select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)) E99885303030 select soundex(_utf8 0xD091D092D093); soundex(_utf8 0xD091D092D093) Б000 select hex(soundex(_utf8 0xD091D092D093)); hex(soundex(_utf8 0xD091D092D093)) D091303030 SET collation_connection='utf8_general_ci'; create table t1 select repeat('a',4000) a; delete from t1; Loading
mysql-test/t/ctype_ucs.test +14 −0 Original line number Diff line number Diff line Loading @@ -572,6 +572,20 @@ select utext from t1 where utext like '%%'; drop table t1; deallocate prepare stmt; # # Bug#22638 SOUNDEX broken for international characters # set names latin1; set character_set_connection=ucs2; select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); select 'mood' sounds like 'mud'; # Cyrillic A, BE, VE select hex(soundex(_ucs2 0x041004110412)); # Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter select hex(soundex(_ucs2 0x00BF00C0)); set names latin1; # # Bug #14290: character_maximum_length for text fields # Loading
mysql-test/t/ctype_utf8.test +8 −0 Original line number Diff line number Diff line Loading @@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST'); select * from t1 where soundex(a) = soundex('test'); drop table t1; # # Bug#22638 SOUNDEX broken for international characters # select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); select soundex(_utf8 0xD091D092D093); select hex(soundex(_utf8 0xD091D092D093)); SET collation_connection='utf8_general_ci'; -- source include/ctype_filesort.inc Loading
sql/item_strfunc.cc +106 −29 Original line number Diff line number Diff line Loading @@ -1805,7 +1805,8 @@ void Item_func_soundex::fix_length_and_dec() { collation.set(args[0]->collation); max_length=args[0]->max_length; set_if_bigger(max_length,4); set_if_bigger(max_length, 4 * collation.collation->mbminlen); tmp_value.set_charset(collation.collation); } Loading @@ -1815,14 +1816,15 @@ void Item_func_soundex::fix_length_and_dec() else return 0 */ static char soundex_toupper(char ch) static int soundex_toupper(int ch) { return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch; } static char get_scode(char *ptr) static char get_scode(int wc) { uchar ch= soundex_toupper(*ptr); int ch= soundex_toupper(wc); if (ch < 'A' || ch > 'Z') { // Thread extended alfa (country spec) Loading @@ -1832,46 +1834,121 @@ static char get_scode(char *ptr) } static bool my_uni_isalpha(int wc) { /* Return true for all Basic Latin letters: a..z A..Z. Return true for all Unicode characters with code higher than U+00C0: - characters between 'z' and U+00C0 are controls and punctuations. - "U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter after 'z'. */ return (wc >= 'a' && wc <= 'z') || (wc >= 'A' && wc <= 'Z') || (wc >= 0xC0); } String *Item_func_soundex::val_str(String *str) { DBUG_ASSERT(fixed == 1); String *res =args[0]->val_str(str); char last_ch,ch; CHARSET_INFO *cs= collation.collation; my_wc_t wc; uint nchars; int rc; if ((null_value= args[0]->null_value)) return 0; /* purecov: inspected */ if (tmp_value.alloc(max(res->length(),4))) if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen))) return str; /* purecov: inspected */ char *to= (char *) tmp_value.ptr(); char *to_end= to + tmp_value.alloced_length(); char *from= (char *) res->ptr(), *end= from + res->length(); tmp_value.set_charset(cs); while (from != end && !my_isalpha(cs,*from)) // Skip pre-space from++; /* purecov: inspected */ if (from == end) return &my_empty_string; // No alpha characters. *to++ = soundex_toupper(*from); // Copy first letter last_ch = get_scode(from); // code of the first letter // for the first 'double-letter check. // Loop on input letters until // end of input (null) or output // letter code count = 3 for (from++ ; from < end ; from++) { if (!my_isalpha(cs,*from)) for ( ; ; ) /* Skip pre-space */ { if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) return &my_empty_string; /* EOL or invalid byte sequence */ if (rc == 1 && cs->ctype) { /* Single byte letter found */ if (my_isalpha(cs, *from)) { last_ch= get_scode(*from); // Code of the first letter *to++= soundex_toupper(*from++); // Copy first letter break; } from++; } else { from+= rc; if (my_uni_isalpha(wc)) { /* Multibyte letter found */ wc= soundex_toupper(wc); last_ch= get_scode(wc); // Code of the first letter if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0) { /* Extra safety - should not really happen */ DBUG_ASSERT(false); return &my_empty_string; } to+= rc; break; } } } /* last_ch is now set to the first 'double-letter' check. loop on input letters until end of input */ for (nchars= 1 ; ; ) { if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) break; /* EOL or invalid byte sequence */ if (rc == 1 && cs->ctype) { if (!my_isalpha(cs, *from++)) continue; } else { from+= rc; if (!my_uni_isalpha(wc)) continue; ch=get_scode(from); } ch= get_scode(wc); if ((ch != '0') && (ch != last_ch)) // if not skipped or double { *to++ = ch; // letter, copy to output // letter, copy to output if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch, (uchar*) to, (uchar*) to_end)) <= 0) { // Extra safety - should not really happen DBUG_ASSERT(false); break; } to+= rc; nchars++; last_ch= ch; // save code of last input letter } // for next double-letter check } for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++) *to = '0'; *to=0; // end string /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */ if (nchars < 4) { uint nbytes= (4 - nchars) * cs->mbminlen; cs->cset->fill(cs, to, nbytes, '0'); to+= nbytes; } tmp_value.length((uint) (to-tmp_value.ptr())); return &tmp_value; } Loading