Commit 5b10b7bc authored by unknown's avatar unknown
Browse files

bug#15632 - ndb

  Fix race between INCL_NODEREQ(prio b) and GCP_PREPARE(prio a) by also waiting for starting nodes


ndb/include/ndb_version.h.in:
  Handle upgrade of bug fix
ndb/src/kernel/blocks/ERROR_codes.txt:
  New error code for delaying INCL_NODE_REQ
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Fix race between INCL_NODEREQ(prio b) and GCP_PREPARE(prio a)
    by also waiting for starting nodes
ndb/test/ndbapi/testNodeRestart.cpp:
  Add testcase for bug#15632
parent fc4c198e
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -57,5 +57,8 @@ char ndb_version_string_buf[NDB_VERSION_STRING_BUF_SZ];
 */
/*#define NDB_VERSION_ID 0*/

#define NDBD_INCL_NODECONF_VERSION_4 MAKE_VERSION(4,1,17)
#define NDBD_INCL_NODECONF_VERSION_5 MAKE_VERSION(5,0,18)

#endif
 
+2 −0
Original line number Diff line number Diff line
@@ -61,6 +61,8 @@ Insert system error in GCP participant when receiving GCP_SAVEREQ.
5007:
Delay GCP_SAVEREQ by 10 secs

7165: Delay INCL_NODE_REQ in starting node yeilding error in GCP_PREPARE

ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------

+35 −3
Original line number Diff line number Diff line
@@ -215,7 +215,7 @@ void Dbdih::sendINCL_NODEREQ(Signal* signal, Uint32 nodeId)
  signal->theData[2] = c_nodeStartMaster.failNr;
  signal->theData[3] = 0;
  signal->theData[4] = currentgcp;  
  sendSignal(nodeDihRef, GSN_INCL_NODEREQ, signal, 5, JBB);
  sendSignal(nodeDihRef, GSN_INCL_NODEREQ, signal, 5, JBA);
}//Dbdih::sendINCL_NODEREQ()

void Dbdih::sendMASTER_GCPREQ(Signal* signal, Uint32 nodeId)
@@ -1857,6 +1857,14 @@ void Dbdih::gcpBlockedLab(Signal* signal)
  // global checkpoint id and the correct state. We do not wait for any reply
  // since the starting node will not send any.
  /*-------------------------------------------------------------------------*/
  Uint32 startVersion = getNodeInfo(c_nodeStartMaster.startNode).m_version;
  
  if ((getMajor(startVersion) == 4 && startVersion >= NDBD_INCL_NODECONF_VERSION_4) ||
      (getMajor(startVersion) == 5 && startVersion >= NDBD_INCL_NODECONF_VERSION_5))
  {
    c_INCL_NODEREQ_Counter.setWaitingFor(c_nodeStartMaster.startNode);
  }
  
  sendINCL_NODEREQ(signal, c_nodeStartMaster.startNode);
}//Dbdih::gcpBlockedLab()

@@ -2059,6 +2067,13 @@ void Dbdih::execINCL_NODEREQ(Signal* signal)
  jamEntry();
  Uint32 retRef = signal->theData[0];
  Uint32 nodeId = signal->theData[1];
  if (nodeId == getOwnNodeId() && ERROR_INSERTED(7165))
  {
    CLEAR_ERROR_INSERT_VALUE;
    sendSignalWithDelay(reference(), GSN_INCL_NODEREQ, signal, 5000, signal->getLength());
    return;
  }
  
  Uint32 tnodeStartFailNr = signal->theData[2];
  currentgcp = signal->theData[4];
  CRASH_INSERTION(7127);
@@ -2086,6 +2101,15 @@ void Dbdih::execINCL_NODEREQ(Signal* signal)
    // id's and the lcp status.
    /*-----------------------------------------------------------------------*/
    CRASH_INSERTION(7171);
    Uint32 masterVersion = getNodeInfo(refToNode(cmasterdihref)).m_version;
    
    if ((NDB_VERSION_MAJOR == 4 && masterVersion >= NDBD_INCL_NODECONF_VERSION_4) ||
	(NDB_VERSION_MAJOR == 5 && masterVersion >= NDBD_INCL_NODECONF_VERSION_5))
    {
      signal->theData[0] = getOwnNodeId();
      signal->theData[1] = getOwnNodeId();
      sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
    }
    return;
  }//if
  if (getNodeStatus(nodeId) != NodeRecord::STARTING) {
@@ -3737,8 +3761,16 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
  /*------------------------------------------------------------------------*/
  // Verify that a starting node has also crashed. Reset the node start record.
  /*-------------------------------------------------------------------------*/
  if (c_nodeStartMaster.startNode != RNIL) {
    ndbrequire(getNodeStatus(c_nodeStartMaster.startNode)!= NodeRecord::ALIVE);
  if (false && c_nodeStartMaster.startNode != RNIL && getNodeStatus(c_nodeStartMaster.startNode) == NodeRecord::ALIVE)
  {
    BlockReference cntrRef = calcNdbCntrBlockRef(c_nodeStartMaster.startNode);
    SystemError * const sysErr = (SystemError*)&signal->theData[0];
    sysErr->errorCode = SystemError::StartInProgressError;
    sysErr->errorRef = reference();
    sysErr->data1= 0;
    sysErr->data2= __LINE__;
    sendSignal(cntrRef, GSN_SYSTEM_ERROR, signal,  SystemError::SignalLength, JBA);
    nodeResetStart();  
  }//if

  /*--------------------------------------------------*/
+60 −0
Original line number Diff line number Diff line
@@ -446,6 +446,56 @@ int runBug15587(NDBT_Context* ctx, NDBT_Step* step){
  return NDBT_OK;
}

int runBug15632(NDBT_Context* ctx, NDBT_Step* step){
  int result = NDBT_OK;
  int loops = ctx->getNumLoops();
  int records = ctx->getNumRecords();
  NdbRestarter restarter;
  
  int nodeId = restarter.getDbNodeId(1);

  ndbout << "Restart node " << nodeId << endl; 
  
  if (restarter.restartOneDbNode(nodeId,
				 /** initial */ false, 
				 /** nostart */ true,
				 /** abort   */ true))
    return NDBT_FAILED;
  
  if (restarter.waitNodesNoStart(&nodeId, 1))
    return NDBT_FAILED; 
   
  if (restarter.insertErrorInNode(nodeId, 7165))
    return NDBT_FAILED;
  
  if (restarter.startNodes(&nodeId, 1))
    return NDBT_FAILED;

  if (restarter.waitNodesStarted(&nodeId, 1))
    return NDBT_FAILED;

  if (restarter.restartOneDbNode(nodeId,
				 /** initial */ false, 
				 /** nostart */ true,
				 /** abort   */ true))
    return NDBT_FAILED;
  
  if (restarter.waitNodesNoStart(&nodeId, 1))
    return NDBT_FAILED; 
   
  if (restarter.insertErrorInNode(nodeId, 7171))
    return NDBT_FAILED;
  
  if (restarter.startNodes(&nodeId, 1))
    return NDBT_FAILED;
  
  if (restarter.waitNodesStarted(&nodeId, 1))
    return NDBT_FAILED;
  
  ctx->stopTest();
  return NDBT_OK;
}


NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", 
@@ -596,6 +646,8 @@ TESTCASE("RestartNFDuringNR",
  INITIALIZER(runCheckAllNodesStarted);
  INITIALIZER(runLoadTable);
  STEP(runRestarts);
  STEP(runPkUpdateUntilStopped);
  STEP(runScanUpdateUntilStopped);
  FINALIZER(runScanReadVerify);
  FINALIZER(runClearTable);
}
@@ -685,6 +737,8 @@ TESTCASE("RestartNodeDuringLCP",
  INITIALIZER(runCheckAllNodesStarted);
  INITIALIZER(runLoadTable);
  STEP(runRestarts);
  STEP(runPkUpdateUntilStopped);
  STEP(runScanUpdateUntilStopped);
  FINALIZER(runScanReadVerify);
  FINALIZER(runClearTable);
}
@@ -716,6 +770,12 @@ TESTCASE("Bug15587",
  STEP(runBug15587);
  FINALIZER(runClearTable);
}
TESTCASE("Bug15632",
	 "Test bug with NF during NR"){
  INITIALIZER(runLoadTable);
  STEP(runBug15632);
  FINALIZER(runClearTable);
}
NDBT_TESTSUITE_END(testNodeRestart);

int main(int argc, const char** argv){