Bug #6040 can't retrieve records with umlaut characters in case insensitive manner (5267ec8a) · Commits · Software / OSDI20 Artifacts / mariadb

include/m_ctype.h

+5 −0

Original line number	Diff line number	Diff line
		@@ -365,6 +365,11 @@ uint my_instr_mb(struct charset_info_st *,
		const char *s, uint s_length,
		my_match_t *match, uint nmatch);

		/*
		** Wildcard comparison used by the UTF8 and UCS2 character sets.
		** str/str_end and wildstr/wildend delimit the subject and the pattern;
		** weights may be NULL, in which case characters are compared unfolded.
		*/
		int my_wildcmp_unicode(CHARSET_INFO *cs,
		                       const char *str, const char *str_end,
		                       const char *wildstr, const char *wildend,
		                       int escape, int w_one, int w_many,
		                       MY_UNICASE_INFO **weights);

		/*
		** "add" is a callback that receives a CHARSET_INFO pointer.
		** NOTE(review): presumably invoked for each charset found in the
		** len-byte XML buffer - confirm against the definition.
		*/
		extern my_bool my_parse_charset_xml(const char *bug, uint len,
		                                    int (*add)(CHARSET_INFO *cs));

mysql-test/r/ctype_utf8.result

+9 −0

Original line number	Diff line number	Diff line
		@@ -63,6 +63,15 @@ select 'A' like 'a' collate utf8_bin;
		select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');
		_utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%')
		1
		select convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8);
		convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8)
		1
		select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
		CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8)
		1
		select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
		CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8)
		1
		SELECT 'a' = 'a ';
		'a' = 'a '
		1

mysql-test/t/ctype_utf8.test

+8 −0

Original line number	Diff line number	Diff line
		@@ -33,6 +33,14 @@ select 'A' like 'a';
		select 'A' like 'a' collate utf8_bin;
		select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');

		# Bug #6040: can't retrieve records with umlaut
		# characters in case insensitive manner.
		# Case insensitive search LIKE comparison
		# was broken for multibyte characters:
		select convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8);
		select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
		select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);

		#
		# Check the following:
		# "a" == "a "

strings/ctype-ucs2.c

+4 −162

Original line number	Diff line number	Diff line
		@@ -1231,171 +1231,13 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
		}


		/*
		** Compare string against string with wildcard
		**
		** Characters are decoded with my_ucs2_uni().  When "weights" is not
		** NULL, each literal character of both strings is replaced by the
		** "sort" value of its entry in weights[(wc >> 8) & 0xFF] (if that
		** plane is present) before the comparison.
		**
		** Returns:
		**   0   str matches wildstr
		**   1   no match, or a character could not be decoded
		**  -1   no match, and str ran out while searching for the literal
		**       that follows a w_many (the result is passed up unchanged
		**       through the recursion)
		**
		** NOTE(review): the character read after "escape" is still compared
		** against w_many and w_one below, so an escaped wildcard is not
		** treated as a literal character.
		*/

		static
		int my_wildcmp_ucs2(CHARSET_INFO *cs,
		                    const char *str, const char *str_end,
		                    const char *wildstr, const char *wildend,
		                    int escape, int w_one, int w_many,
		                    MY_UNICASE_INFO **weights)
		{
		  int result= -1;                       /* Not found, using wildcards */
		  my_wc_t s_wc, w_wc;
		  int scan, plane;

		  while (wildstr != wildend)
		  {
		    /* Match literals and w_one one character at a time until w_many */
		    while (1)
		    {
		      scan= my_ucs2_uni(cs, &w_wc, (const uchar*)wildstr,
		                        (const uchar*)wildend);
		      if (scan <= 0)
		        return 1;

		      if (w_wc == (my_wc_t)escape)
		      {
		        /* Step over the escape and load the character behind it */
		        wildstr+= scan;
		        scan= my_ucs2_uni(cs, &w_wc, (const uchar*)wildstr,
		                          (const uchar*)wildend);
		        if (scan <= 0)
		          return 1;
		      }

		      if (w_wc == (my_wc_t)w_many)
		      {
		        result= 1;                      /* Found an anchor char */
		        break;
		      }

		      wildstr+= scan;
		      scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
		                        (const uchar*)str_end);
		      if (scan <= 0)
		        return 1;
		      str+= scan;

		      if (w_wc == (my_wc_t)w_one)
		      {
		        result= 1;                      /* Found an anchor char */
		      }
		      else
		      {
		        if (weights)
		        {
		          plane= (s_wc >> 8) & 0xFF;
		          s_wc= weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
		          plane= (w_wc >> 8) & 0xFF;
		          w_wc= weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
		        }
		        if (s_wc != w_wc)
		          return 1;                     /* No match */
		      }
		      if (wildstr == wildend)
		        return (str != str_end);        /* Match if both are at end */
		    }


		    if (w_wc == (my_wc_t)w_many)
		    {                                   /* Found w_many */

		      /* Remove any '%' and '_' from the wild search string */
		      for ( ; wildstr != wildend ; )
		      {
		        scan= my_ucs2_uni(cs, &w_wc, (const uchar*)wildstr,
		                          (const uchar*)wildend);
		        if (scan <= 0)
		          return 1;

		        if (w_wc == (my_wc_t)w_many)
		        {
		          wildstr+= scan;
		          continue;
		        }

		        if (w_wc == (my_wc_t)w_one)
		        {
		          /* Each w_one next to the w_many consumes one character */
		          wildstr+= scan;
		          scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
		                            (const uchar*)str_end);
		          if (scan <= 0)
		            return 1;
		          str+= scan;
		          continue;
		        }
		        break;                          /* Not a wild character */
		      }

		      if (wildstr == wildend)
		        return 0;                       /* Ok if w_many is last */

		      if (str == str_end)
		        return -1;

		      /* w_wc becomes the literal that must follow the w_many */
		      scan= my_ucs2_uni(cs, &w_wc, (const uchar*)wildstr,
		                        (const uchar*)wildend);
		      if (scan <= 0)
		        return 1;

		      if (w_wc == (my_wc_t)escape)
		      {
		        wildstr+= scan;
		        scan= my_ucs2_uni(cs, &w_wc, (const uchar*)wildstr,
		                          (const uchar*)wildend);
		        if (scan <= 0)
		          return 1;
		      }

		      while (1)
		      {
		        /* Skip until the first character from wildstr is found */
		        while (str != str_end)
		        {
		          scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
		                            (const uchar*)str_end);
		          if (scan <= 0)
		            return 1;
		          if (weights)
		          {
		            plane= (s_wc >> 8) & 0xFF;
		            s_wc= weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
		            plane= (w_wc >> 8) & 0xFF;
		            w_wc= weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
		          }

		          if (s_wc == w_wc)
		            break;
		          str+= scan;
		        }
		        if (str == str_end)
		          return -1;

		        /* Try to match the rest of the pattern from this position */
		        result= my_wildcmp_ucs2(cs, str, str_end, wildstr, wildend, escape,
		                                w_one, w_many, weights);

		        if (result <= 0)
		          return result;

		        /* Tail did not match here: resume after this candidate */
		        str+= scan;
		      }
		    }
		  }
		  return (str != str_end ? 1 : 0);
		}


		/*
		** Wildcard comparison for UCS2 that folds characters through the
		** uni_plane weight table before comparing them.
		*/
		static
		int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
		                       const char *str, const char *str_end,
		                       const char *wildstr, const char *wildend,
		                       int escape, int w_one, int w_many)
		{
		  return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
		                            escape, w_one, w_many, uni_plane);
		}

		@@ -1406,7 +1248,7 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
		const char wildstr,const char wildend,
		int escape, int w_one, int w_many)
		{
		return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
		return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
		escape,w_one,w_many,NULL);
		}

strings/ctype-utf8.c

+167 −1

Original line number	Diff line number	Diff line
		@@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={

		};


		/*
		** Compare string against string with wildcard
		** This function is used in UTF8 and UCS2
		**
		** 0 if matched
		** -1 if not matched with wildcard
		** 1 if matched with wildcard
		*/

		int my_wildcmp_unicode(CHARSET_INFO *cs,
		const char str,const char str_end,
		const char wildstr,const char wildend,
		int escape, int w_one, int w_many,
		MY_UNICASE_INFO **weights)
		{
		int result= -1; /* Not found, using wildcards */
		my_wc_t s_wc, w_wc;
		int scan, plane;
		int (mb_wc)(struct charset_info_st cs, my_wc_t *wc,
		const unsigned char s,const unsigned char e);
		mb_wc= cs->cset->mb_wc;

		while (wildstr != wildend)
		{
		while (1)
		{
		if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
		(const uchar*)wildend)) <= 0)
		return 1;

		if (w_wc == (my_wc_t)escape)
		{
		wildstr+= scan;
		if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr,
		(const uchar*)wildend)) <= 0)
		return 1;
		}

		if (w_wc == (my_wc_t)w_many)
		{
		result= 1; /* Found an anchor char */
		break;
		}

		wildstr+= scan;
		if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
		(const uchar*)str_end)) <=0)
		return 1;
		str+= scan;

		if (w_wc == (my_wc_t)w_one)
		{
		result= 1; /* Found an anchor char */
		}
		else
		{
		if (weights)
		{
		plane=(s_wc>>8) & 0xFF;
		s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
		plane=(w_wc>>8) & 0xFF;
		w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
		}
		if (s_wc != w_wc)
		return 1; /* No match */
		}
		if (wildstr == wildend)
		return (str != str_end); /* Match if both are at end */
		}


		if (w_wc == (my_wc_t)w_many)
		{ /* Found w_many */

		/* Remove any '%' and '_' from the wild search string */
		for ( ; wildstr != wildend ; )
		{
		if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
		(const uchar*)wildend)) <= 0)
		return 1;

		if (w_wc == (my_wc_t)w_many)
		{
		wildstr+= scan;
		continue;
		}

		if (w_wc == (my_wc_t)w_one)
		{
		wildstr+= scan;
		if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
		(const uchar*)str_end)) <=0)
		return 1;
		str+= scan;
		continue;
		}
		break; /* Not a wild character */
		}

		if (wildstr == wildend)
		return 0; /* Ok if w_many is last */

		if (str == str_end)
		return -1;

		if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
		(const uchar*)wildend)) <=0)
		return 1;

		if (w_wc == (my_wc_t)escape)
		{
		wildstr+= scan;
		if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
		(const uchar*)wildend)) <=0)
		return 1;
		}

		while (1)
		{
		/* Skip until the first character from wildstr is found */
		while (str != str_end)
		{
		if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
		(const uchar*)str_end)) <=0)
		return 1;
		if (weights)
		{
		plane=(s_wc>>8) & 0xFF;
		s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
		plane=(w_wc>>8) & 0xFF;
		w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
		}

		if (s_wc == w_wc)
		break;
		str+= scan;
		}
		if (str == str_end)
		return -1;

		result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
		escape, w_one, w_many,
		weights);

		if (result <= 0)
		return result;

		str+= scan;
		}
		}
		}
		return (str != str_end ? 1 : 0);
		}

		#endif


		@@ -1992,6 +2147,17 @@ static int my_strcasecmp_utf8(CHARSET_INFO cs, const char s, const char *t)
		return my_strncasecmp_utf8(cs, s, t, len);
		}

		/*
		** Wildcard comparison for UTF8 that folds characters through the
		** uni_plane weight table before comparing them.
		*/
		static
		int my_wildcmp_utf8(CHARSET_INFO *cs,
		                    const char *str, const char *str_end,
		                    const char *wildstr, const char *wildend,
		                    int escape, int w_one, int w_many)
		{
		  return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
		                            escape, w_one, w_many, uni_plane);
		}


		static int my_strnxfrm_utf8(CHARSET_INFO *cs,
		uchar *dst, uint dstlen,
		const uchar *src, uint srclen)
		@@ -2060,7 +2226,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
		my_strnncollsp_utf8,
		my_strnxfrm_utf8,
		my_like_range_mb,
		my_wildcmp_mb,
		my_wildcmp_utf8,
		my_strcasecmp_utf8,
		my_instr_mb,
		my_hash_sort_utf8