Commit 6acbe4eb authored by unknown's avatar unknown
Browse files

ndb - bug#25984 - more than 7 failed node restart can cause cluster failure

new behaviour is as follows:
1) the node is refused permission to start, and should fail with a message in the error log saying that it must be restarted with --initial
2) if the cluster fails in this situation, the node must also be restarted with --initial;
   if it is not, the SR will fail with this message


storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Prevent node from starting _at all_ if it has performed more than 6 failed
    node restarts.
storage/ndb/test/ndbapi/testNodeRestart.cpp:
  test program for bug#25984
storage/ndb/test/run-test/daily-basic-tests.txt:
  testcase
parent ba2e81e0
Loading
Loading
Loading
Loading
+27 −2
Original line number Diff line number Diff line
@@ -1525,10 +1525,26 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref)
       */
      SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
      ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
      warningEvent("Making filesystem for node %d unusable",
      warningEvent("Making filesystem for node %d unusable (need --initial)",
		   nodePtr.i);
    }
    else if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
	     SYSFILE->lastCompletedGCI[nodePtr.i] == 0)
    {
      jam();
      CRASH_INSERTION(7170);
      char buf[255];
      BaseString::snprintf(buf, sizeof(buf), 
			   "Cluster requires this node to be started "
			   " with --initial as partial start has been performed"
			   " and this filesystem is unusable");
      progError(__LINE__, 
		NDBD_EXIT_SR_RESTARTCONFLICT,
		buf);
      ndbrequire(false);
    }
  }
  /**
   * This set which GCI we will try to restart to
   */
@@ -12515,14 +12531,23 @@ void Dbdih::newCrashedReplica(Uint32 nodeId, ReplicaRecordPtr ncrReplicaPtr)
  /*       THAT THE NEW REPLICA IS NOT STARTED YET AND REPLICA_LAST_GCI IS*/
  /*       SET TO -1 TO INDICATE THAT IT IS NOT DEAD YET.                 */
  /*----------------------------------------------------------------------*/
  Uint32 lastGCI = SYSFILE->lastCompletedGCI[nodeId];
  arrGuardErr(ncrReplicaPtr.p->noCrashedReplicas + 1, 8,
              NDBD_EXIT_MAX_CRASHED_REPLICAS);
  ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] = 
    SYSFILE->lastCompletedGCI[nodeId];
    lastGCI;
  ncrReplicaPtr.p->noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas + 1;
  ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] = 0;
  ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] = 
    (Uint32)-1;
  if (ncrReplicaPtr.p->noCrashedReplicas == 7 && lastGCI)
  {
    jam();
    SYSFILE->lastCompletedGCI[nodeId] = 0;
    warningEvent("Making filesystem for node %d unusable (need --initial)",
		 nodeId);
  }
}//Dbdih::newCrashedReplica()
/*************************************************************************/
+98 −0
Original line number Diff line number Diff line
@@ -1178,6 +1178,101 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
  return NDBT_OK;
}

/**
 * Regression test for bug#25984 (repeated failed node restarts).
 *
 * Outline of what the test drives through NdbRestarter:
 *   1. Restart the whole cluster and wait until it is started.
 *   2. Pick a victim data node other than the master and restart it into
 *      the "no start" state.
 *   3. Six times in a row: arm error insert 7016 on the victim, start it,
 *      and wait for it to reach start phase 2; the next iteration then
 *      expects the victim to be back in "no start".
 *   4. Arm error insert 7170 on the victim, start it once more and expect
 *      it to end up in "no start" again.
 *   5. Restart all nodes into "no start", arm error inserts on all nodes
 *      (932) and on the master (7170), start the nodes, and expect the
 *      whole cluster to fall back to "no start".
 *   6. Restart the victim with a different first flag (presumably
 *      "initial" - TODO confirm against NdbRestarter), start everything
 *      and require the cluster to come up.
 *
 * NOTE(review): the exact meaning of error inserts 7016 and 932 is not
 * visible from this function; confirm against the kernel blocks.
 *
 * @param ctx   test context (not used by this test)
 * @param step  test step (not used by this test)
 * @return NDBT_OK on success or when the cluster has fewer than two data
 *         nodes, NDBT_FAILED as soon as any checked restarter call fails.
 */
int runBug25984(NDBT_Context* ctx, NDBT_Step* step){

  NdbRestarter restarter;

  // The scenario needs a victim node in addition to the master.
  if (restarter.getNumDbNodes() < 2)
    return NDBT_OK;

  // Begin from a freshly restarted, fully started cluster.
  if (restarter.restartAll(true, true, true))
    return NDBT_FAILED;

  if (restarter.waitClusterNoStart())
    return NDBT_FAILED;

  if (restarter.startAll())
    return NDBT_FAILED;

  if (restarter.waitClusterStarted())
    return NDBT_FAILED;

  // Dump-state request sent before each error insert below
  // (CmvmiSetRestartOnErrorInsert with argument 1).
  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  int master = restarter.getMasterNodeId();

  // Prefer a victim from another node group; fall back to the master's own.
  int victim = restarter.getRandomNodeOtherNodeGroup(master, rand());
  if (victim == -1)
    victim = restarter.getRandomNodeSameNodeGroup(master, rand());

  // Fail right here if the restart request is rejected, instead of only
  // noticing it later through a timeout in waitNodesNoStart().
  if (restarter.restartOneDbNode(victim, false, true, true))
    return NDBT_FAILED;

  for (Uint32 i = 0; i<6; i++)
  {
    // i is unsigned, so print it with %u.
    ndbout_c("Loop: %u", i);
    if (restarter.waitNodesNoStart(&victim, 1))
      return NDBT_FAILED;

    if (restarter.dumpStateOneNode(victim, val2, 2))
      return NDBT_FAILED;

    if (restarter.insertErrorInNode(victim, 7016))
      return NDBT_FAILED;

    if (restarter.startNodes(&victim, 1))
      return NDBT_FAILED;

    if (restarter.waitNodesStartPhase(&victim, 1, 2))
      return NDBT_FAILED;
  }

  // One more start attempt, this time with error insert 7170 armed.
  if (restarter.waitNodesNoStart(&victim, 1))
    return NDBT_FAILED;

  if (restarter.dumpStateOneNode(victim, val2, 2))
    return NDBT_FAILED;

  if (restarter.insertErrorInNode(victim, 7170))
    return NDBT_FAILED;

  if (restarter.startNodes(&victim, 1))
    return NDBT_FAILED;

  if (restarter.waitNodesNoStart(&victim, 1))
    return NDBT_FAILED;

  // Now take the whole cluster down and attempt a full start.
  if (restarter.restartAll(false, true, true))
    return NDBT_FAILED;

  if (restarter.insertErrorInAllNodes(932))
    return NDBT_FAILED;

  if (restarter.insertErrorInNode(master, 7170))
    return NDBT_FAILED;

  if (restarter.dumpStateAllNodes(val2, 2))
    return NDBT_FAILED;

  // Return values intentionally ignored, as in the original test: the
  // outcome is judged by waitClusterNoStart() below.
  // NOTE(review): presumably the start requests may report failure because
  // the nodes are expected to go down again - confirm.
  (void)restarter.startNodes(&master, 1);
  NdbSleep_MilliSleep(3000);
  (void)restarter.startAll();

  if (restarter.waitClusterNoStart())
    return NDBT_FAILED;

  // Restart the victim with the first flag set, then require a full start.
  if (restarter.restartOneDbNode(victim, true, true, true))
    return NDBT_FAILED;

  if (restarter.startAll())
    return NDBT_FAILED;

  if (restarter.waitClusterStarted())
    return NDBT_FAILED;

  return NDBT_OK;
}


NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", 
@@ -1514,6 +1609,9 @@ TESTCASE("Bug25468", ""){
TESTCASE("Bug25554", ""){
  INITIALIZER(runBug25554);
}
// Registers test case "Bug25984" (empty description string); its only
// step is runBug25984, attached through INITIALIZER.
TESTCASE("Bug25984", ""){
  INITIALIZER(runBug25984);
}
NDBT_TESTSUITE_END(testNodeRestart);

int main(int argc, const char** argv){
+4 −0
Original line number Diff line number Diff line
@@ -525,6 +525,10 @@ max-time: 1000
cmd: testNodeRestart
args: -n Bug25554 T1

max-time: 1000
cmd: testNodeRestart
args: -n Bug25984

#
# DICT TESTS
max-time: 1500