Commit 3bfaf333 authored by unknown's avatar unknown
Browse files

ndb - bug#16772

  Don't allow a node to join the cluster until all nodes have completed failure handling


ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  When receiving CM_ADD for a node whose failure handling I have not yet completed, do _not_ just override its state.
  Instead, record the pending state and send CM_ACK_ADD later (much later), from execCONNECT_REP.
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase for bug#16772
ndb/test/run-test/daily-basic-tests.txt:
  Run test in basic suite
parent 6ac6b08c
Loading
Loading
Loading
Loading
+87 −14
Original line number Diff line number Diff line
@@ -257,6 +257,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout)

void Qmgr::execCONNECT_REP(Signal* signal)
{
  jamEntry();
  const Uint32 nodeId = signal->theData[0];
  c_connectedNodes.set(nodeId);
  NodeRecPtr nodePtr;
@@ -264,9 +265,13 @@ void Qmgr::execCONNECT_REP(Signal* signal)
  ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
  switch(nodePtr.p->phase){
  case ZSTARTING:
  case ZRUNNING:
    jam();
    if(!c_start.m_nodes.isWaitingFor(nodeId)){
      jam();
      return;
    }
    break;
  case ZRUNNING:
  case ZPREPARE_FAIL:
  case ZFAIL_CLOSING:
    jam();
@@ -278,20 +283,27 @@ void Qmgr::execCONNECT_REP(Signal* signal)
    return;
  }
  
  if(!c_start.m_nodes.isWaitingFor(nodeId)){
    jam();
    return;
  }

  switch(c_start.m_gsn){
  case GSN_CM_REGREQ:
    jam();
    sendCmRegReq(signal, nodeId);
    return;
  case GSN_CM_NODEINFOREQ:{
  case GSN_CM_NODEINFOREQ:
    jam();
    sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
    return;
  case GSN_CM_ADD:{
    jam();

    ndbrequire(getOwnNodeId() != cpresident);
    c_start.m_nodes.clearWaitingFor(nodeId);
    c_start.m_gsn = RNIL;
    
    NodeRecPtr addNodePtr;
    addNodePtr.i = nodeId;
    ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
    cmAddPrepare(signal, addNodePtr, nodePtr.p);
    return;
  }
  default:
    return;
@@ -924,15 +936,27 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){
    return;
  case ZFAIL_CLOSING:
    jam();
#ifdef VM_TRACE
    ndbout_c("Enabling communication to CM_ADD node state=%d", 
    
#if 1
    warningEvent("Recieved request to incorperate node %u, "
		 "while error handling has not yet completed",
		 nodePtr.i);
    
    ndbrequire(getOwnNodeId() != cpresident);
    ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
    c_start.m_nodes.clearWaitingFor();
    c_start.m_nodes.setWaitingFor(nodePtr.i);
    c_start.m_gsn = GSN_CM_ADD;
#else
    warningEvent("Enabling communication to CM_ADD node %u state=%d", 
		 nodePtr.i,
		 nodePtr.p->phase);
#endif
    nodePtr.p->phase = ZSTARTING;
    nodePtr.p->failState = NORMAL;
    signal->theData[0] = 0;
    signal->theData[1] = nodePtr.i;
    sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
#endif
    return;
  case ZSTARTING:
    break;
@@ -1766,11 +1790,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal)

  jamEntry();
  failedNodePtr.i = signal->theData[0];  

  if (ERROR_INSERTED(930))
  {
    CLEAR_ERROR_INSERT_VALUE;
    infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i);
    return;
  }
  
  ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
  if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){
    failedNodePtr.p->failState = NORMAL;
  } else {
    jam();

    char buf[100];
    BaseString::snprintf(buf, 100, 
			 "Received NDB_FAILCONF for node %u with state: %d %d",
			 failedNodePtr.i,
			 failedNodePtr.p->phase,
			 failedNodePtr.p->failState);
    progError(__LINE__, 0, buf);
    systemErrorLab(signal, __LINE__);
  }//if
  if (cpresident == getOwnNodeId()) {
@@ -2077,7 +2117,39 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
  ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
  if (failedNodePtr.i == getOwnNodeId()) {
    jam();
    systemErrorLab(signal, __LINE__);

    const char * msg = 0;
    switch(aFailCause){
    case FailRep::ZOWN_FAILURE: 
      msg = "Own failure"; 
      break;
    case FailRep::ZOTHER_NODE_WHEN_WE_START: 
    case FailRep::ZOTHERNODE_FAILED_DURING_START:
      msg = "Other node died during start"; 
      break;
    case FailRep::ZIN_PREP_FAIL_REQ:
      msg = "Prep fail";
      break;
    case FailRep::ZSTART_IN_REGREQ:
      msg = "Start timeout";
      break;
    case FailRep::ZHEARTBEAT_FAILURE:
      msg = "Hearbeat failure";
      break;
    case FailRep::ZLINK_FAILURE:
      msg = "Connection failure";
      break;
    }
    
    char buf[100];
    BaseString::snprintf(buf, 100, 
			 "We(%u) have been declared dead by %u reason: %s(%u)",
			 getOwnNodeId(),
			 refToNode(signal->getSendersBlockRef()),
			 aFailCause,
			 msg ? msg : "<Unknown>");

    progError(__LINE__, 0, buf);
    return;
  }//if
  
@@ -2791,6 +2863,7 @@ void Qmgr::failReport(Signal* signal,
        cfailureNr = cprepareFailureNr;
        ctoFailureNr = 0;
        ctoStatus = Q_ACTIVE;
	c_start.reset(); // Don't take over nodes being started
        if (cnoCommitFailedNodes > 0) {
          jam();
	  /**-----------------------------------------------------------------
+50 −0
Original line number Diff line number Diff line
@@ -535,6 +535,52 @@ int runBug15685(NDBT_Context* ctx, NDBT_Step* step){
  return NDBT_FAILED;
}

/**
 * Regression test for bug#16772: a node must not be able to rejoin the
 * cluster before the surviving nodes have finished handling its failure.
 *
 * Outline:
 *  1. Pick a surviving non-master node and a different node to restart.
 *  2. Arm error insert 930 in the survivor (see the original comment below:
 *     NDB_FAILCONF is thrown away).
 *  3. Restart the victim with "nostart" + "abort", wait for it to reach
 *     the no-start state, then start it.
 *  4. Expect the wait for start phase 3 to fail.
 *  5. Send a dump command to the survivor, which should let the restart
 *     complete.
 *
 * @return NDBT_OK when the start-phase wait failed as expected and the node
 *         afterwards started; NDBT_FAILED otherwise. Also NDBT_OK (after
 *         stopping the test) when fewer than two DB nodes are reported.
 */
int
runBug16772(NDBT_Context* ctx, NDBT_Step* step){

  NdbRestarter restarter;
  if (restarter.getNumDbNodes() < 2)
  {
    // Two distinct nodes are required below; nothing to test otherwise.
    ctx->stopTest();
    return NDBT_OK;
  }

  int survivor = restarter.getRandomNotMasterNodeId(rand());

  // Draw random DB nodes until we get one that differs from the survivor.
  int victim;
  do
  {
    victim = restarter.getDbNodeId(rand() % restarter.getNumDbNodes());
  } while (victim == survivor);

  // Arm the error insert in the survivor before the victim goes down.
  if (restarter.insertErrorInNode(survivor, 930) != 0)
  {
    return NDBT_FAILED;
  }

  // Take the victim down, wait for the no-start state, then start it again.
  // Short-circuit evaluation keeps the calls in their original order and
  // stops at the first one reporting failure.
  if (restarter.restartOneDbNode(victim,
                                 /** initial */ false,
                                 /** nostart */ true,
                                 /** abort   */ true) ||
      restarter.waitNodesNoStart(&victim, 1) ||
      restarter.startNodes(&victim, 1))
  {
    return NDBT_FAILED;
  }

  // The restart should now hang since NDB_FAILCONF is thrown away, so this
  // wait is expected to fail, i.e. start phase 3 must not be reached.
  // NOTE(review): the trailing 10 is presumably a timeout - confirm.
  const int phaseWaitResult = restarter.waitNodesStartPhase(&victim, 1, 3, 10);

  // Now send a NDB_FAILCONF for the restarted node via a dump command.
  int dumpArgs[] = { 7020, 323, 252, victim };
  if (restarter.dumpStateOneNode(survivor, dumpArgs, 4) != 0)
  {
    return NDBT_FAILED;
  }

  if (restarter.waitNodesStarted(&victim, 1) != 0)
  {
    return NDBT_FAILED;
  }

  // A zero result means start phase 3 was reached although it should not
  // have been, so the test fails.
  if (phaseWaitResult == 0)
  {
    return NDBT_FAILED;
  }
  return NDBT_OK;
}


NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", 
@@ -820,6 +866,10 @@ TESTCASE("Bug15685",
  STEP(runBug15685);
  FINALIZER(runClearTable);
}
// Registers the bug#16772 regression case; its only step is runBug16772.
TESTCASE("Bug16772",
	 "Test bug with restarting before NF handling is complete"){
  STEP(runBug16772);
}
NDBT_TESTSUITE_END(testNodeRestart);

int main(int argc, const char** argv){
+4 −0
Original line number Diff line number Diff line
@@ -446,6 +446,10 @@ max-time: 500
cmd: testNodeRestart
args: -n Bug15685 T1

max-time: 500
cmd: testNodeRestart
args: -n Bug16772 T1

# OLD FLEX
max-time: 500
cmd: flexBench