Commit d6bc2d60 authored by unknown's avatar unknown
Browse files

BUG#20697 slave fails to rollback replicated transaction hang over innodb_lock_wait_timeou

Transaction on the slave sql thread got blocked against a slave's mysqld local ta's
lock. Since the default, slave-transaction-retries=10, there was replaying of the 
replicated ta. That failed because of a new started from 5.0.13 policy not to rollback
a timed-out transaction. Effectively the first round of a timed-out ta becomes committed
by the replaying's first "BEGIN".

It was decided to backport already existed method working in 5.1 implemented in
bug #16228 for handling symmetrical deadlock problem. That patch introduced end_trans
execution whenever a replicated ta deadlocks or timed-out.

Note, that this solution can be practically suboptimal - in the light of the changed behavior
due to timeout we still could replay only the last statement -  only with a high rate of timeouting
replicated transactions.


mysql-test/r/rpl_deadlock.result:
  results changed
mysql-test/t/rpl_deadlock.test:
  Refining the timeout part of the test to display that the timed-out transaction
  is rolled back prior its replaying by slave sql.
  Non-zero select's count would mean the first round work became persistent - wrong.
sql/slave.cc:
  applying bug#16228 fix, approbated for deadlock use case in 5.1, almost verbatim. 
  Another alternative to replay only the offending statement requires significant
  efforts, incl design work.
parent c317c2d2
Loading
Loading
Loading
Loading
+110 −15
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@ drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb;
create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
Variable_name	Value
@@ -35,14 +35,14 @@ begin;
select * from t1 for update;
a
start slave;
insert into t2 values(22);
insert into t2 values(201);
commit;
select * from t1;
a
1
select * from t2;
a
22
201
show slave status;
Slave_IO_State	#
Master_Host	127.0.0.1
@@ -50,7 +50,7 @@ Master_User root
Master_Port	MASTER_MYPORT
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	18911
Read_Master_Log_Pos	18918
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
@@ -65,7 +65,7 @@ Replicate_Wild_Ignore_Table
Last_Errno	0
Last_Error	
Skip_Counter	0
Exec_Master_Log_Pos	18911
Exec_Master_Log_Pos	18918
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
@@ -78,12 +78,16 @@ Master_SSL_Cipher
Master_SSL_Key	
Seconds_Behind_Master	#
stop slave;
change master to master_log_pos=532;
delete from t3;
change master to master_log_pos=539;
begin;
select * from t2 for update;
a
22
201
start slave;
select count(*) from t3  /* must be zero */;
count(*)
0
commit;
select * from t1;
a
@@ -91,7 +95,7 @@ a
1
select * from t2;
a
22
201
show slave status;
Slave_IO_State	#
Master_Host	127.0.0.1
@@ -99,7 +103,7 @@ Master_User root
Master_Port	MASTER_MYPORT
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	18911
Read_Master_Log_Pos	18918
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
@@ -114,7 +118,7 @@ Replicate_Wild_Ignore_Table
Last_Errno	0
Last_Error	
Skip_Counter	0
Exec_Master_Log_Pos	18911
Exec_Master_Log_Pos	18918
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
@@ -128,12 +132,16 @@ Master_SSL_Key
Seconds_Behind_Master	#
set global max_relay_log_size=0;
stop slave;
change master to master_log_pos=532;
delete from t3;
change master to master_log_pos=539;
begin;
select * from t2 for update;
a
22
201
start slave;
select count(*) from t3  /* must be zero */;
count(*)
0
commit;
select * from t1;
a
@@ -142,7 +150,7 @@ a
1
select * from t2;
a
22
201
show slave status;
Slave_IO_State	#
Master_Host	127.0.0.1
@@ -150,7 +158,7 @@ Master_User root
Master_Port	MASTER_MYPORT
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	18911
Read_Master_Log_Pos	18918
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
@@ -165,7 +173,7 @@ Replicate_Wild_Ignore_Table
Last_Errno	0
Last_Error	
Skip_Counter	0
Exec_Master_Log_Pos	18911
Exec_Master_Log_Pos	18918
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
@@ -177,4 +185,91 @@ Master_SSL_Cert
Master_SSL_Cipher	
Master_SSL_Key	
Seconds_Behind_Master	#
drop table if exists t1;
create table t1 (f int unique) engine=innodb;
insert into t1  values (0);
begin;
select * from t1 where f = 0 for update;
f
0
begin;
insert into t1 values (1);
update t1 set f=-1 where f = 0;
commit;
insert into t1 values (2);
show slave status;
Slave_IO_State	#
Master_Host	127.0.0.1
Master_User	root
Master_Port	MASTER_MYPORT
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	19618
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
Slave_IO_Running	#
Slave_SQL_Running	No
Replicate_Do_DB	
Replicate_Ignore_DB	
Replicate_Do_Table	
Replicate_Ignore_Table	
Replicate_Wild_Do_Table	
Replicate_Wild_Ignore_Table	
Last_Errno	1205
Last_Error	Error 'Lock wait timeout exceeded; try restarting transaction' on query. Default database: 'test'. Query: 'update t1 set f=-1 where f = 0'
Skip_Counter	0
Exec_Master_Log_Pos	19227
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
Until_Log_Pos	0
Master_SSL_Allowed	No
Master_SSL_CA_File	
Master_SSL_CA_Path	
Master_SSL_Cert	
Master_SSL_Cipher	
Master_SSL_Key	
Seconds_Behind_Master	#
set @@global.sql_slave_skip_counter = 4;
start slave;
show slave status;
Slave_IO_State	#
Master_Host	127.0.0.1
Master_User	root
Master_Port	MASTER_MYPORT
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	19618
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
Slave_IO_Running	#
Slave_SQL_Running	Yes
Replicate_Do_DB	
Replicate_Ignore_DB	
Replicate_Do_Table	
Replicate_Ignore_Table	
Replicate_Wild_Do_Table	
Replicate_Wild_Ignore_Table	
Last_Errno	0
Last_Error	
Skip_Counter	0
Exec_Master_Log_Pos	19618
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
Until_Log_Pos	0
Master_SSL_Allowed	No
Master_SSL_CA_File	
Master_SSL_CA_Path	
Master_SSL_Cert	
Master_SSL_Cipher	
Master_SSL_Key	
Seconds_Behind_Master	#
select * from t1;
f
0
2
commit;
drop table t1,t2,t3,t4;
+10 −6
Original line number Diff line number Diff line
@@ -16,7 +16,8 @@ source include/master-slave.inc;
connection master;
create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb;
# requiring 'unique' for the timeout part of the test
create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
sync_slave_with_master;
@@ -31,8 +32,7 @@ stop slave;
connection master;
begin;
# Let's keep BEGIN and the locked statement in two different relay logs.
let $1=200;
disable_query_log;
let $1=200;disable_query_log;
while ($1)
{
 eval insert into t3 values( $1 );
@@ -59,7 +59,7 @@ enable_query_log;
select * from t1 for update;
start slave;
--real_sleep 3 # hope that slave is blocked now
insert into t2 values(22); # provoke deadlock, slave should be victim
insert into t2 values(201); # provoke deadlock, slave should be victim
commit;
sync_with_master;
select * from t1; # check that slave succeeded finally
@@ -74,11 +74,13 @@ show slave status;
# 2) Test lock wait timeout

stop slave;
change master to master_log_pos=532; # the BEGIN log event
delete from t3;
change master to master_log_pos=539; # the BEGIN log event
begin;
select * from t2 for update; # hold lock
start slave;
--real_sleep 10 # slave should have blocked, and be retrying
select count(*) from t3  /* must be zero */; # replaying begins after rollback
commit;
sync_with_master;
select * from t1; # check that slave succeeded finally
@@ -97,11 +99,13 @@ set global max_relay_log_size=0;

# This is really copy-paste of 2) of above
stop slave;
change master to master_log_pos=532;
delete from t3;
change master to master_log_pos=539;
begin;
select * from t2 for update;
start slave;
--real_sleep 10
select count(*) from t3  /* must be zero */; # replaying begins after rollback
commit;
sync_with_master;
select * from t1;
+16 −7
Original line number Diff line number Diff line
@@ -3345,9 +3345,9 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
        const char *errmsg;
        /*
          We were in a transaction which has been rolled back because of a
          deadlock (currently, InnoDB deadlock detected by InnoDB) or lock
          wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to
          BEGIN log event and retry it all again.
        Sonera  deadlock. if lock wait timeout (innodb_lock_wait_timeout exceeded)
	  there is no rollback since 5.0.13 (ref: manual).
	  let's seek back to BEGIN log event and retry it all again.
          We have to not only seek but also
          a) init_master_info(), to seek back to hot relay log's start for later
          (for when we will come back to this hot log after re-processing the
@@ -3369,6 +3369,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
          else
          {
            exec_res= 0;
	    end_trans(thd, ROLLBACK);
	    /* chance for concurrent connection to get more locks */
            safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
		       (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
@@ -3386,9 +3387,17 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
                          "the slave_transaction_retries variable.",
                          slave_trans_retries);
      }
      if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
      else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
      {
        /*
          Only reset the retry counter if the event succeeded or
          failed with a non-transient error.  On a successful event,
          the execution will proceed as usual; in the case of a
          non-transient error, the slave will stop with an error.
	*/
        rli->trans_retries= 0; // restart from fresh
      }
    }
    return exec_res;
  }
  else
@@ -4613,7 +4622,7 @@ static int connect_to_master(THD* thd, MYSQL* mysql, MASTER_INFO* mi,
      suppress_warnings= 0;
      sql_print_error("Slave I/O thread: error %s to master \
'%s@%s:%d': \
Error: '%s'  errno: %d  retry-time: %d  retries: %d",
Error: '%s'  errno: %d  retry-time: %d  retries: %lu",
		      (reconnect ? "reconnecting" : "connecting"),
		      mi->user,mi->host,mi->port,
		      mysql_error(mysql), last_errno,