Bug#27345 Incorrect data returned when range-read from utf8_danish_ci indexes (d3f43c87) · Commits · Software / OSDI20 Artifacts / mariadb

mysql-test/r/ctype_uca.result

+92 −0

Original line number	Diff line number	Diff line
		@@ -2663,3 +2663,95 @@ COUNT(*) c1
		1
		1 a
		DROP TABLE IF EXISTS t1;
		set names utf8;
		create table t1 (
		a varchar(255),
		key a(a)
		) character set utf8 collate utf8_danish_ci;
		insert into t1 values ('åaaaa'),('ååaaa'),('aaaaa');
		select a as like_a from t1 where a like 'a%';
		like_a
		aaaaa
		select a as like_aa from t1 where a like 'aa%';
		like_aa
		aaaaa
		select a as like_aaa from t1 where a like 'aaa%';
		like_aaa
		aaaaa
		select a as like_aaaa from t1 where a like 'aaaa%';
		like_aaaa
		aaaaa
		select a as like_aaaaa from t1 where a like 'aaaaa%';
		like_aaaaa
		aaaaa
		alter table t1 convert to character set ucs2 collate ucs2_danish_ci;
		select a as like_a from t1 where a like 'a%';
		like_a
		aaaaa
		select a as like_aa from t1 where a like 'aa%';
		like_aa
		aaaaa
		select a as like_aaa from t1 where a like 'aaa%';
		like_aaa
		aaaaa
		select a as like_aaaa from t1 where a like 'aaaa%';
		like_aaaa
		aaaaa
		select a as like_aaaaa from t1 where a like 'aaaaa%';
		like_aaaaa
		aaaaa
		drop table t1;
		create table t1 (
		a varchar(255),
		key(a)
		) character set utf8 collate utf8_spanish2_ci;
		insert into t1 values ('aaaaa'),('lllll'),('zzzzz');
		select a as like_l from t1 where a like 'l%';
		like_l
		lllll
		select a as like_ll from t1 where a like 'll%';
		like_ll
		lllll
		select a as like_lll from t1 where a like 'lll%';
		like_lll
		lllll
		select a as like_llll from t1 where a like 'llll%';
		like_llll
		lllll
		select a as like_lllll from t1 where a like 'lllll%';
		like_lllll
		lllll
		alter table t1 convert to character set ucs2 collate ucs2_spanish2_ci;
		select a as like_l from t1 where a like 'l%';
		like_l
		lllll
		select a as like_ll from t1 where a like 'll%';
		like_ll
		lllll
		select a as like_lll from t1 where a like 'lll%';
		like_lll
		lllll
		select a as like_llll from t1 where a like 'llll%';
		like_llll
		lllll
		select a as like_lllll from t1 where a like 'lllll%';
		like_lllll
		lllll
		drop table t1;
		create table t1 (
		a varchar(255),
		key a(a)
		) character set utf8 collate utf8_czech_ci;
		insert into t1 values
		('b'),('c'),('d'),('e'),('f'),('g'),('h'),('ch'),('i'),('j');
		select * from t1 where a like 'c%';
		a
		c
		ch
		alter table t1 convert to character set ucs2 collate ucs2_czech_ci;
		select * from t1 where a like 'c%';
		a
		c
		ch
		drop table t1;
		End for 5.0 tests

mysql-test/t/ctype_uca.test

+54 −0

Original line number	Diff line number	Diff line
		@@ -485,3 +485,57 @@ CREATE TABLE t1 (
		insert into t1 values (''),('a');
		SELECT COUNT(*), c1 FROM t1 GROUP BY c1;
		DROP TABLE IF EXISTS t1;

		#
		# Bug#27345 Incorrect data returned when range-read from utf8_danish_ci indexes
		#
		set names utf8;
		create table t1 (
		a varchar(255),
		key a(a)
		) character set utf8 collate utf8_danish_ci;
		insert into t1 values ('åaaaa'),('ååaaa'),('aaaaa');
		select a as like_a from t1 where a like 'a%';
		select a as like_aa from t1 where a like 'aa%';
		select a as like_aaa from t1 where a like 'aaa%';
		select a as like_aaaa from t1 where a like 'aaaa%';
		select a as like_aaaaa from t1 where a like 'aaaaa%';
		alter table t1 convert to character set ucs2 collate ucs2_danish_ci;
		select a as like_a from t1 where a like 'a%';
		select a as like_aa from t1 where a like 'aa%';
		select a as like_aaa from t1 where a like 'aaa%';
		select a as like_aaaa from t1 where a like 'aaaa%';
		select a as like_aaaaa from t1 where a like 'aaaaa%';
		drop table t1;

		create table t1 (
		a varchar(255),
		key(a)
		) character set utf8 collate utf8_spanish2_ci;
		insert into t1 values ('aaaaa'),('lllll'),('zzzzz');
		select a as like_l from t1 where a like 'l%';
		select a as like_ll from t1 where a like 'll%';
		select a as like_lll from t1 where a like 'lll%';
		select a as like_llll from t1 where a like 'llll%';
		select a as like_lllll from t1 where a like 'lllll%';
		alter table t1 convert to character set ucs2 collate ucs2_spanish2_ci;
		select a as like_l from t1 where a like 'l%';
		select a as like_ll from t1 where a like 'll%';
		select a as like_lll from t1 where a like 'lll%';
		select a as like_llll from t1 where a like 'llll%';
		select a as like_lllll from t1 where a like 'lllll%';
		drop table t1;

		create table t1 (
		a varchar(255),
		key a(a)
		) character set utf8 collate utf8_czech_ci;
		-- In Czech 'ch' is a single letter between 'h' and 'i'
		insert into t1 values
		('b'),('c'),('d'),('e'),('f'),('g'),('h'),('ch'),('i'),('j');
		select * from t1 where a like 'c%';
		alter table t1 convert to character set ucs2 collate ucs2_czech_ci;
		select * from t1 where a like 'c%';
		drop table t1;

		-- echo End for 5.0 tests

strings/ctype-mb.c

+70 −1

Original line number	Diff line number	Diff line
		@@ -563,6 +563,8 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
		char *min_end= min_str + res_length;
		char *max_end= max_str + res_length;
		uint maxcharlen= res_length / cs->mbmaxlen;
		const char *contraction_flags= cs->contractions ?
		((const char*) cs->contractions) + 0x40*0x40 : NULL;

		for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--)
		{
		@@ -571,6 +573,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
		ptr++; /* Skip escape */
		else if (*ptr == w_one || *ptr == w_many) /* '_' and '%' in SQL */
		{
		fill_max_and_min:
		/*
		Calculate length of keys:
		'a\0\0... is the smallest possible string when we have space expand
		@@ -602,8 +605,74 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
		*min_str++= *max_str++= *ptr++;
		}
		else
		*min_str++= *max_str++= *ptr++;
		{
		/*
		Special case for collations with contractions.
		For example, in Czech, 'ch' is a separate letter
		which is sorted between 'h' and 'i'.
		In the pattern 'abc%', 'c' at the end can mean:
		- letter 'c' itself,
		- beginning of the contraction 'ch'.

		If we simply return this LIKE range:

		'abc\min\min\min' and 'abc\max\max\max'

		then this query: SELECT * FROM t1 WHERE a LIKE 'abc%'
		will only find values starting from 'abc[^h]',
		but won't find values starting from 'abch'.

		We must ignore contraction heads followed by w_one or w_many.
		('Contraction head' means any letter which can be the first
		letter in a contraction)

		For example, for Czech 'abc%', we will return LIKE range,
		which is equal to LIKE range for 'ab%':

		'ab\min\min\min\min' and 'ab\max\max\max\max'.

		*/
		if (contraction_flags && ptr + 1 < end &&
		contraction_flags[(uchar) *ptr])
		{
		/* Ptr[0] is a contraction head. */

		if (ptr[1] == w_one || ptr[1] == w_many)
		{
		/* Contraction head followed by a wildcard, quit. */
		goto fill_max_and_min;
		}

		/*
		Some letters can be both contraction heads and contraction tails.
		For example, in Danish 'aa' is a separate single letter which
		is sorted after 'z'. So 'a' can be both head and tail.

		If ptr[0]+ptr[1] is a contraction,
		then put both letters together.

		If ptr[1] can be a contraction part, but ptr[0]+ptr[1]
		is not a contraction, then we put only ptr[0],
		and continue with ptr[1] on the next loop.
		*/
		if (contraction_flags[(uchar) ptr[1]] &&
		cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40])
		{
		/* Contraction found */
		if (maxcharlen == 1 || min_str + 1 >= min_end)
		{
		/* Both contraction parts don't fit, quit */
		goto fill_max_and_min;
		}

		/* Put contraction head */
		*min_str++= *max_str++= *ptr++;
		maxcharlen--;
		}
		}
		/* Put contraction tail, or a single character */
		*min_str++= *max_str++= *ptr++;
		}
		}

		min_length= max_length = (uint) (min_str - min_org);

strings/ctype-uca.c

+10 −1

Original line number	Diff line number	Diff line
		@@ -7937,10 +7937,16 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))
		/* Now process contractions */
		if (ncontractions)
		{
		uint size= 0x40*0x40*sizeof(uint16); /* 8K, for basic latin letter only */
		/*
		8K for weights for basic latin letter pairs,
		plus 256 bytes for "is contraction part" flags.
		*/
		uint size= 0x40*0x40*sizeof(uint16) + 256;
		char *contraction_flags;
		if (!(cs->contractions= (uint16*) (*alloc)(size)))
		return 1;
		bzero((void*)cs->contractions, size);
		contraction_flags= ((char*) cs->contractions) + 0x40*0x40;
		for (i=0; i < rc; i++)
		{
		if (rule[i].curr[1])
		@@ -7966,6 +7972,9 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))

		/* Copy base weight applying primary difference */
		cs->contractions[offsc]= offsb[0] + rule[i].diff[0];
		/* Mark both letters as "is contraction part" */
		contraction_flags[rule[i].curr[0]]= 1;
		contraction_flags[rule[i].curr[1]]= 1;
		}
		}
		}

strings/ctype-ucs2.c

+35 −0

Original line number	Diff line number	Diff line
		@@ -1524,6 +1524,8 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
		char *min_org=min_str;
		char *min_end=min_str+res_length;
		uint charlen= res_length / cs->mbmaxlen;
		const char *contraction_flags= cs->contractions ?
		((const char*) cs->contractions) + 0x40*0x40 : NULL;

		for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
		; ptr+=2, charlen--)
		@@ -1545,6 +1547,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
		}
		if (ptr[0] == '\0' && ptr[1] == w_many) /* '%' in SQL */
		{
		fill_max_and_min:
		/*
		Calculate length of keys:
		'a\0\0... is the smallest possible string when we have space expand
		@@ -1561,6 +1564,38 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
		} while (min_str + 1 < min_end);
		return 0;
		}

		if (contraction_flags && ptr + 3 < end &&
		ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]])
		{
		/* Contraction head found */
		if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many))
		{
		/* Contraction head followed by a wildcard, quit */
		goto fill_max_and_min;
		}

		/*
		Check if the second letter can be contraction part,
		and if two letters really produce a contraction.
		*/
		if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] &&
		cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40])
		{
		/* Contraction found */
		if (charlen == 1 || min_str + 2 >= min_end)
		{
		/* Full contraction doesn't fit, quit */
		goto fill_max_and_min;
		}

		/* Put contraction head */
		*min_str++= *max_str++= *ptr++;
		*min_str++= *max_str++= *ptr++;
		charlen--;
		}
		}
		/* Put contraction tail, or a single character */
		*min_str++= *max_str++ = ptr[0];
		*min_str++= *max_str++ = ptr[1];
		}