Commit 501cce2b authored by aelkin/elkin@koti.dsl.inet.fi's avatar aelkin/elkin@koti.dsl.inet.fi
Browse files

Bug #29309 Incorrect "Seconds_Behind_Master" value in SHOW SLAVE STATUS after FLUSH LOGS

Report claims that Seconds_behind_master behaves unexpectedly. 

Code analysis shows that there is an evident flaw in that treating of FormatDescription event is wrong
so  that after FLUSH LOGS on slave the Seconds_behind_master's calculation slips and incorrect
value can be reported to SHOW SLAVE STATUS. 
Even worse is that the gap between the correct and incorrect deltas grows with time.

Fixed with prohibiting changes to rpl->last_master_timestamp by artifical events (any kind of).
suggestion as comments is added how to fight with lack of info on the slave side by means of
new heartbeat feature coming.

The test can not be done ealily fully determistic.
parent 341e484f
Loading
Loading
Loading
Loading
+121 −0
Original line number Diff line number Diff line
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
show slave status /* Second_behind reports 0 */;;
Slave_IO_State	#
Master_Host	127.0.0.1
Master_User	root
Master_Port	9306
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	98
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
Slave_IO_Running	Yes
Slave_SQL_Running	Yes
Replicate_Do_DB	
Replicate_Ignore_DB	
Replicate_Do_Table	
Replicate_Ignore_Table	
Replicate_Wild_Do_Table	
Replicate_Wild_Ignore_Table	
Last_Errno	0
Last_Error	
Skip_Counter	0
Exec_Master_Log_Pos	98
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
Until_Log_Pos	0
Master_SSL_Allowed	No
Master_SSL_CA_File	
Master_SSL_CA_Path	
Master_SSL_Cert	
Master_SSL_Cipher	
Master_SSL_Key	
Seconds_Behind_Master	0
drop table if exists t1;
Warnings:
Note	1051	Unknown table 't1'
create table t1 (f1 int);
flush logs /* contaminate rli->last_master_timestamp */;
lock table t1 write;
insert into t1 values (1);
show slave status /* bug emulated: reports slave threads starting time about 3*3 not 3 secs */;;
Slave_IO_State	#
Master_Host	127.0.0.1
Master_User	root
Master_Port	9306
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	359
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
Slave_IO_Running	Yes
Slave_SQL_Running	Yes
Replicate_Do_DB	
Replicate_Ignore_DB	
Replicate_Do_Table	
Replicate_Ignore_Table	
Replicate_Wild_Do_Table	
Replicate_Wild_Ignore_Table	
Last_Errno	0
Last_Error	
Skip_Counter	0
Exec_Master_Log_Pos	271
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
Until_Log_Pos	0
Master_SSL_Allowed	No
Master_SSL_CA_File	
Master_SSL_CA_Path	
Master_SSL_Cert	
Master_SSL_Cipher	
Master_SSL_Key	
Seconds_Behind_Master	10
unlock tables;
flush logs /* this time rli->last_master_timestamp is not affected */;
lock table t1 write;
insert into t1 values (2);
show slave status /* reports the correct diff with master query time about 3+3 secs */;;
Slave_IO_State	#
Master_Host	127.0.0.1
Master_User	root
Master_Port	9306
Connect_Retry	1
Master_Log_File	master-bin.000001
Read_Master_Log_Pos	447
Relay_Log_File	#
Relay_Log_Pos	#
Relay_Master_Log_File	master-bin.000001
Slave_IO_Running	Yes
Slave_SQL_Running	Yes
Replicate_Do_DB	
Replicate_Ignore_DB	
Replicate_Do_Table	
Replicate_Ignore_Table	
Replicate_Wild_Do_Table	
Replicate_Wild_Ignore_Table	
Last_Errno	0
Last_Error	
Skip_Counter	0
Exec_Master_Log_Pos	359
Relay_Log_Space	#
Until_Condition	None
Until_Log_File	
Until_Log_Pos	0
Master_SSL_Allowed	No
Master_SSL_CA_File	
Master_SSL_CA_Path	
Master_SSL_Cert	
Master_SSL_Cipher	
Master_SSL_Key	
Seconds_Behind_Master	6
unlock tables;
drop table t1;
+1 −0
Original line number Diff line number Diff line
--loose-debug=d,let_first_flush_log_change_timestamp
+71 −0
Original line number Diff line number Diff line
#
# Testing replication delay reporting (bug#29309)
# there is an unavoidable non-determinism in the test
# please compare the results with the comments 
#


source include/master-slave.inc;

connection master;
#connection slave;
sync_slave_with_master;
--replace_result $DEFAULT_MASTER_PORT DEFAULT_MASTER_PORT
--replace_column 1 # 8 # 9 # 23 #
--query_vertical show slave status /* Second_behind reports 0 */;
sleep 3;

### bug emulation

connection master;
drop table if exists t1;
create table t1 (f1 int);
sleep 3;

#connection slave;
sync_slave_with_master;
flush logs /* contaminate rli->last_master_timestamp */;

connection slave;
lock table t1 write;

connection master;
insert into t1 values (1);

sleep 3;

connection slave;
--replace_result $DEFAULT_MASTER_PORT DEFAULT_MASTER_PORT
--replace_column 1 # 8 # 9 # 23 #
--query_vertical show slave status /* bug emulated: reports slave threads starting time about 3*3 not 3 secs */;
unlock tables;

connection master;
sync_slave_with_master;

### bugfix


connection slave;
flush logs /* this time rli->last_master_timestamp is not affected */;
lock table t1 write;

connection master;
insert into t1 values (2);
sleep 3;

connection slave;
--replace_result $DEFAULT_MASTER_PORT DEFAULT_MASTER_PORT
--replace_column 1 # 8 # 9 # 23 #
--query_vertical show slave status /* reports the correct diff with master query time about 3+3 secs */;
unlock tables;

connection master;
drop table t1;

#connection slave;
sync_slave_with_master;


# End of tests
+31 −1
Original line number Diff line number Diff line
@@ -27,6 +27,10 @@

#define log_cs	&my_charset_latin1

#ifndef DBUG_OFF
uint debug_not_change_ts_if_art_event= 1; // bug#29309 simulation
#endif

/*
  pretty_print_str()
*/
@@ -481,6 +485,18 @@ int Log_event::exec_event(struct st_relay_log_info* rli)
      rli->inc_event_relay_log_pos();
    else
    {
      /*
        bug#29309 simulation: resetting the flag to force
        wrong behaviour of artificial event to update
        rli->last_master_timestamp for only one time -
        the first FLUSH LOGS in the test.
      */
      DBUG_EXECUTE_IF("let_first_flush_log_change_timestamp",
                      if (debug_not_change_ts_if_art_event == 1
                          && is_artificial_event())
                      {
                        debug_not_change_ts_if_art_event= 0;
                      });
      rli->inc_group_relay_log_pos(log_pos);
      flush_relay_log_info(rli);
      /* 
@@ -491,7 +507,21 @@ int Log_event::exec_event(struct st_relay_log_info* rli)
         rare cases, only consequence is that value may take some time to
         display in Seconds_Behind_Master - not critical).
      */
#ifndef DBUG_OFF
      if (!(is_artificial_event() && debug_not_change_ts_if_art_event > 0))
#else
      if (!is_artificial_event())
#endif
        rli->last_master_timestamp= when;
      /*
        The flag is set back to be positive so that 
        any further FLUSH LOGS will be handled as prescribed.
      */
      DBUG_EXECUTE_IF("let_first_flush_log_change_timestamp",
                      if (debug_not_change_ts_if_art_event == 0)
                      {
                        debug_not_change_ts_if_art_event= 2;
                      });
    }
  }
  DBUG_RETURN(0);
+10 −1
Original line number Diff line number Diff line
@@ -4931,7 +4931,16 @@ Log_event* next_event(RELAY_LOG_INFO* rli)
          a new event and is queuing it; the false "0" will exist until SQL
          finishes executing the new event; it will be look abnormal only if
          the events have old timestamps (then you get "many", 0, "many").
          Transient phases like this can't really be fixed.

          Transient phases like this can be fixed with implemeting
          Heartbeat event which provides the slave the status of the
          master at time the master does not have any new update to send.
          Seconds_Behind_Master would be zero only when master has no
          more updates in binlog for slave. The heartbeat can be sent
          in a (small) fraction of slave_net_timeout. Until it's done
          rli->last_master_timestamp is temporarely (for time of
          waiting for the following event) reset whenever EOF is
          reached.
        */
        time_t save_timestamp= rli->last_master_timestamp;
        rli->last_master_timestamp= 0;