Commit 778b4aad authored by unknown's avatar unknown
Browse files

ndb - bug#26457

  master failure during master take over


ndb/src/kernel/blocks/ERROR_codes.txt:
  Add new error code 7180 (crash master during master take-over in execMASTER_LCPCONF)
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Make sure to clear the NF_LCP_TAKE_OVER node-failure step of the previously failed node if the master fails during master take-over
ndb/test/include/NdbRestarter.hpp:
  Add support for querying next master and node group
    (for multi node failure testing)
ndb/test/ndbapi/testNodeRestart.cpp:
  Add testcase Bug26457 (runBug26457)
ndb/test/run-test/daily-basic-tests.txt:
  Add testcase Bug26457 to the daily basic test run
ndb/test/src/NdbRestarter.cpp:
  Add support for querying next master and node group
    (for multi node failure testing)
parent 0e39133a
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4014
Next DBLQH 5043
Next DBDICT 6007
Next DBDIH 7178
Next DBDIH 7181
Next DBTC 8039
Next CMVMI 9000
Next BACKUP 10022
@@ -71,6 +71,8 @@ Delay GCP_SAVEREQ by 10 secs

7177: Delay copying of sysfileData in execCOPY_GCIREQ

7180: Crash master during master-take-over in execMASTER_LCPCONF

ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------

+26 −1
Original line number Diff line number Diff line
@@ -4612,6 +4612,8 @@ void
Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
  jam();

  Uint32 oldNode = c_lcpMasterTakeOverState.failedNodeId;

  c_lcpMasterTakeOverState.minTableId = ~0;
  c_lcpMasterTakeOverState.minFragId = ~0;
  c_lcpMasterTakeOverState.failedNodeId = nodeId;
@@ -4630,7 +4632,20 @@ Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
    /**
     * Node failure during master take over...
     */
    ndbout_c("Nodefail during master take over");
    ndbout_c("Nodefail during master take over (old: %d)", oldNode);
  }
  
  NodeRecordPtr nodePtr;
  nodePtr.i = oldNode;
  if (oldNode > 0 && oldNode < MAX_NDB_NODES)
  {
    jam();
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.p->m_nodefailSteps.get(NF_LCP_TAKE_OVER))
    {
      jam();
      checkLocalNodefailComplete(signal, oldNode, NF_LCP_TAKE_OVER);
    }
  }
  
  setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER);
@@ -5646,6 +5661,14 @@ void Dbdih::execMASTER_LCPREQ(Signal* signal)
  jamEntry();
  const BlockReference newMasterBlockref = req->masterRef;

  if (newMasterBlockref != cmasterdihref)
  {
    jam();
    ndbout_c("resending GSN_MASTER_LCPREQ");
    sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
			signal->getLength(), 50);
    return;
  }
  Uint32 failedNodeId = req->failedNodeId;

  /**
@@ -5946,6 +5969,8 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal)
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  nodePtr.p->lcpStateAtTakeOver = lcpState;

  CRASH_INSERTION(7180);
  
#ifdef VM_TRACE
  ndbout_c("MASTER_LCPCONF");
  printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0);
+2 −0
Original line number Diff line number Diff line
@@ -62,6 +62,8 @@ public:
  int dumpStateAllNodes(int * _args, int _num_args);

  int getMasterNodeId();
  int getNextMasterNodeId(int nodeId);
  int getNodeGroup(int nodeId);
  int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
  int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
  int getRandomNotMasterNodeId(int randomNumber);
+42 −0
Original line number Diff line number Diff line
@@ -1045,6 +1045,45 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
  return NDBT_OK;
}

/**
 * Regression test for bug#26457: master failure during master take over.
 *
 * For each loop iteration:
 *  1. Find the current master and the node that is next in line to
 *     become master.
 *  2. If both are in the same node group, restart the "next" node and
 *     wait for the cluster to come back, then pick a new pair.
 *  3. Tell the "next" node to restart on error insert
 *     (CmvmiSetRestartOnErrorInsert) and insert error 7180 in it.
 *  4. Restart the current master and wait for the cluster to start again.
 *
 * The test is skipped (reported as NDBT_OK) when fewer than 4 data nodes
 * are available.
 *
 * @param ctx   test context; supplies the number of loops to run
 * @param step  test step (unused)
 * @return NDBT_OK on success or skip, NDBT_FAILED if any restarter
 *         operation reports failure
 */
int
runBug26457(NDBT_Context* ctx, NDBT_Step* step)
{
  NdbRestarter res;
  if (res.getNumDbNodes() < 4)
    return NDBT_OK;

  int loops = ctx->getNumLoops();
  while (loops --)
  {
    int master;
    int next;

    /**
     * Select a master / next-master pair located in different node groups.
     * Reshuffling does not consume a loop iteration.
     */
    for (;;)
    {
      master = res.getMasterNodeId();
      next = res.getNextMasterNodeId(master);

      ndbout_c("master: %d next: %d", master, next);

      /**
       * Node ids are positive; anything else means the restarter could not
       * determine the nodes.  Without this check two failed node-group
       * lookups could compare equal and keep the selection loop spinning.
       * NOTE(review): assumes the restarter reports failure with a
       * non-positive id - confirm against NdbRestarter.
       */
      if (master <= 0 || next <= 0)
        return NDBT_FAILED;

      if (res.getNodeGroup(master) != res.getNodeGroup(next))
        break;

      /**
       * Same node group: restart the next-master candidate so that another
       * node becomes next in line, then try again.
       */
      if (res.restartOneDbNode(next, false, false, true))
        return NDBT_FAILED;
      if (res.waitClusterStarted())
        return NDBT_FAILED;
    }

    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 2 };

    if (res.dumpStateOneNode(next, val2, 2))
      return NDBT_FAILED;

    if (res.insertErrorInNode(next, 7180))
      return NDBT_FAILED;

    /**
     * Restart the master; the armed node then hits error 7180 while it
     * handles the master take-over.
     */
    if (res.restartOneDbNode(master, false, false, true))
      return NDBT_FAILED;
    if (res.waitClusterStarted())
      return NDBT_FAILED;
  }

  return NDBT_OK;
}

NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", 
@@ -1367,6 +1406,9 @@ TESTCASE("Bug25364", ""){
TESTCASE("Bug25554", ""){
  INITIALIZER(runBug25554);
}
// Registers runBug26457 as the initializer of test case "Bug26457".
TESTCASE("Bug26457", ""){
  INITIALIZER(runBug26457);
}
NDBT_TESTSUITE_END(testNodeRestart);

int main(int argc, const char** argv){
+4 −0
Original line number Diff line number Diff line
@@ -477,6 +477,10 @@ max-time: 1000
cmd: testNodeRestart
args: -n Bug25554 T1

max-time: 1000
cmd: testNodeRestart
args: -n Bug26457 T1

# OLD FLEX
max-time: 500
cmd: flexBench
Loading