Commit 8ed36cb6 authored by unknown's avatar unknown
Browse files

ndb - bug#18385

  Partial system restart: a node cannot try to start with a higher GCI than
  its own, even if it knows about a higher number


ndb/include/kernel/signaldata/DumpStateOrd.hpp:
  Add new dump for setting time between gcp
ndb/include/kernel/signaldata/StartPerm.hpp:
  Move error codes into StartPerm + Add new error code
ndb/src/kernel/blocks/ERROR_codes.txt:
  Add new error insert
ndb/src/kernel/blocks/dbdih/Dbdih.hpp:
  Move error codes into StartPerm + Add new error code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Fix so that we don't try to restart to a too new GCI when doing a partial start
  Add new error code when this node later tries to join
ndb/test/include/NdbRestarter.hpp:
  Add new method for selecting random node
ndb/test/ndbapi/testSystemRestart.cpp:
  Add new testcase for bug#18385
ndb/test/run-test/daily-basic-tests.txt:
  Run test in daily-basic
ndb/test/src/NdbRestarter.cpp:
  Add new method for selecting random node
parent 591aedaa
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -127,6 +127,7 @@ public:
    DihMinTimeBetweenLCP = 7017,
    DihMaxTimeBetweenLCP = 7018,
    EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP
    DihSetTimeBetweenGcp = 7090,
    DihStartLcpImmediately = 7099,
    // 8000 Suma
    // 12000 Tux
+6 −0
Original line number Diff line number Diff line
@@ -64,5 +64,11 @@ private:
  
  Uint32 startingNodeId;
  Uint32 errorCode;  

  /**
   * Error codes sent in START_PERMREF (second signal word) when the master
   * refuses a START_PERMREQ.
   */
  enum ErrorCode
  {
    // Master is busy starting another node; the requesting node
    // waits and retries instead of crashing.
    ZNODE_ALREADY_STARTING_ERROR = 305,
    // The master's SYSFILE has lastCompletedGCI == 0 for the requesting node
    // and the request is not an initial node restart; the node must be
    // restarted with --initial.
    InitialStartRequired = 320
  };
};
#endif
+2 −0
Original line number Diff line number Diff line
@@ -303,6 +303,8 @@ Test Crashes in handling node restarts
7131: Crash when receiving START_COPYREQ in master node
7132: Crash when receiving START_COPYCONF in starting node

7170: Crash when receiving START_PERMREF (InitialStartRequired)

DICT:
6000  Crash during NR when receiving DICTSTARTREQ
6001  Crash during NR when receiving SCHEMA_INFO
+0 −1
Original line number Diff line number Diff line
@@ -81,7 +81,6 @@
#define ZWRONG_FAILURE_NUMBER_ERROR 302
#define ZWRONG_START_NODE_ERROR 303
#define ZNO_REPLICA_FOUND_ERROR 304
#define ZNODE_ALREADY_STARTING_ERROR 305
#define ZNODE_START_DISALLOWED_ERROR 309

// --------------------------------------
+77 −22
Original line number Diff line number Diff line
@@ -1420,6 +1420,33 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref)
    return;
  }
  
  NodeRecordPtr nodePtr;
  Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) 
  {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) 
    {
      jam();
      /**
       * Since we're starting (as master) and there are
       *   other nodes with a higher GCI...
       *   their GCIs must be invalidated...
       *   and they _must_ do an initial start;
       *   indicate this by setting lastCompletedGCI = 0
       */
      SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
      ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
      warningEvent("Making filesystem for node %d unusable",
		   nodePtr.i);
    }
  }
  /**
   * This sets which GCI we will try to restart to
   */
  SYSFILE->newestRestorableGCI = gci;
  
  ndbrequire(isMaster());
  copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file!
}//Dbdih::ndbStartReqLab()
@@ -1557,7 +1584,7 @@ void Dbdih::execSTART_PERMREF(Signal* signal)
{
  jamEntry();
  Uint32 errorCode = signal->theData[1];
  if (errorCode == ZNODE_ALREADY_STARTING_ERROR) {
  if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) {
    jam();
    /*-----------------------------------------------------------------------*/
    // The master was busy adding another node. We will wait for a second and
@@ -1567,6 +1594,20 @@ void Dbdih::execSTART_PERMREF(Signal* signal)
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);
    return;
  }//if

  if (errorCode == StartPermRef::InitialStartRequired)
  {
    CRASH_INSERTION(7170);
    char buf[255];
    BaseString::snprintf(buf, sizeof(buf), 
			 "Cluster requires this node to be started "
			 " with --initial as partial start has been performed"
			 " and this filesystem is unusable");
    progError(__LINE__, 
	      ERR_SR_RESTARTCONFLICT,
	      buf);
    ndbrequire(false);
  }
  /*------------------------------------------------------------------------*/
  // Some node process in another node involving our node was still active. We
  // will recover from this by crashing here. 
@@ -1657,7 +1698,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal)
      (c_nodeStartMaster.wait != ZFALSE)) {
    jam();
    signal->theData[0] = nodeId;
    signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR;
    signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }//if
@@ -1667,6 +1708,16 @@ void Dbdih::execSTART_PERMREQ(Signal* signal)
    ndbrequire(false);
  }//if

  if (SYSFILE->lastCompletedGCI[nodeId] == 0 &&
      typeStart != NodeState::ST_INITIAL_NODE_RESTART)
  {
    jam();
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::InitialStartRequired;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }

  /*----------------------------------------------------------------------
   * WE START THE INCLUSION PROCEDURE 
   * ---------------------------------------------------------------------*/
@@ -3515,24 +3566,12 @@ void Dbdih::closingGcpLab(Signal* signal, FileRecordPtr filePtr)
/* ------------------------------------------------------------------------- */
void Dbdih::selectMasterCandidateAndSend(Signal* signal)
{
  Uint32 gci = 0;
  Uint32 masterCandidateId = 0;
  NodeRecordPtr nodePtr;
  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) {
      jam();
      masterCandidateId = nodePtr.i;
      gci = SYSFILE->lastCompletedGCI[nodePtr.i];
    }//if
  }//for
  ndbrequire(masterCandidateId != 0);
  setNodeGroups();
  signal->theData[0] = masterCandidateId;
  signal->theData[1] = gci;
  signal->theData[0] = getOwnNodeId();
  signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()];
  sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB);
  
  NodeRecordPtr nodePtr;
  Uint32 node_groups[MAX_NDB_NODES];
  memset(node_groups, 0, sizeof(node_groups));
  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
@@ -13359,6 +13398,22 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
    c_lcpState.ctimer += (1 << c_lcpState.clcpDelay);
    return;
  }

  if (dumpState->args[0] == DumpStateOrd::DihSetTimeBetweenGcp)
  {
    if (signal->getLength() == 1)
    {
      const ndb_mgm_configuration_iterator * p = 
	theConfiguration.getOwnConfigIterator();
      ndbrequire(p != 0);
      ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay);
    }
    else
    {
      cgcpDelay = signal->theData[1];
    }
    ndbout_c("Setting time between gcp : %d", cgcpDelay);
  }
}//Dbdih::execDUMP_STATE_ORD()

void
Loading