Commit c961fefd authored by unknown's avatar unknown
Browse files

ndb - bug#18612 (detection of partitioned cluster)

  This also implements GCP-safe multi-node shutdown:
  1) block GCP
  2) wait for the ongoing GCP to complete
  3) inform all stopping QMGRs (so that they don't start with the error handler)
  4) wait for all QMGRs to reply
  5) broadcast FailRep for the stopping nodes
  6) (if the master did not die) unblock GCP

  


ndb/include/kernel/signaldata/DumpStateOrd.hpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/include/kernel/signaldata/FailRep.hpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/include/kernel/signaldata/StopReq.hpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/src/kernel/blocks/qmgr/QmgrInit.cpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  Impl. GCP safe multi node shutdown in order to test bug#18612
ndb/test/ndbapi/testNodeRestart.cpp:
  test program for bug#18612
parent 7efbf0af
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -64,6 +64,7 @@ public:
    // 19 NDBFS Fipple with O_SYNC, O_CREATE etc.
    // 20-24 BACKUP
    NdbcntrTestStopOnError = 25,
    NdbcntrStopNodes = 70,
    // 100-105 TUP and ACC  
    // 200-240 UTIL
    // 300-305 TRIX
+4 −2
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ class FailRep {
   * Sender(s) & Receiver(s)
   */
  friend class Qmgr;
  friend class Ndbcntr;
  
  /**
   * For printing
@@ -43,7 +44,8 @@ public:
    ZSTART_IN_REGREQ=3,
    ZHEARTBEAT_FAILURE=4,
    ZLINK_FAILURE=5,
    ZOTHERNODE_FAILED_DURING_START=6
    ZOTHERNODE_FAILED_DURING_START=6,
    ZMULTI_NODE_SHUTDOWN = 7
  };
  
private:
+22 −16
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@ class StopReq
  friend class MgmtSrvr;

public:
  STATIC_CONST( SignalLength = 9 );
  STATIC_CONST( SignalLength = 9 + NdbNodeBitmask::Size);
  
public:
  Uint32 senderRef;
@@ -49,29 +49,34 @@ public:
  Int32 readOperationTimeout; // Timeout before read operations are aborted
  Int32 operationTimeout;     // Timeout before all operations are aborted

  Uint32 nodes[NdbNodeBitmask::Size];

  static void setSystemStop(Uint32 & requestInfo, bool value);
  static void setPerformRestart(Uint32 & requestInfo, bool value);
  static void setNoStart(Uint32 & requestInfo, bool value);
  static void setInitialStart(Uint32 & requestInfo, bool value);
  static void setEscalateOnNodeFail(Uint32 & requestInfo, bool value);
  /**
   * Don't perform "graceful" shutdown/restart...
   */
  static void setStopAbort(Uint32 & requestInfo, bool value);
  static void setStopNodes(Uint32 & requestInfo, bool value);

  static bool getSystemStop(const Uint32 & requestInfo);
  static bool getPerformRestart(const Uint32 & requestInfo);
  static bool getNoStart(const Uint32 & requestInfo);
  static bool getInitialStart(const Uint32 & requestInfo);
  static bool getEscalateOnNodeFail(const Uint32 & requestInfo);
  static bool getStopAbort(const Uint32 & requestInfo);
  static bool getStopNodes(const Uint32 & requestInfo);
};

/**
 * Confirmation payload for a stop request.
 *
 * The payload is two 32-bit words long (SignalLength = 2): senderData
 * followed by one word that is read either as nodeState or as nodeId.
 *
 * NOTE(review): presumably carried by the STOP_CONF signal; which of the
 * two union members is meaningful depends on the sender and is not visible
 * from this declaration -- confirm against the code that sends/receives it.
 */
struct StopConf
{
  STATIC_CONST( SignalLength = 2 );
  Uint32 senderData;  // first payload word
  // Anonymous union: both members share the second payload word.
  union {
    Uint32 nodeState;
    Uint32 nodeId;
  };
};

class StopRef 
@@ -94,7 +99,9 @@ public:
    NodeShutdownInProgress = 1,
    SystemShutdownInProgress = 2,
    NodeShutdownWouldCauseSystemCrash = 3,
    TransactionAbortFailed = 4
    TransactionAbortFailed = 4,
    UnsupportedNodeShutdown = 5,
    MultiNodeShutdownNotMaster = 6
  };
  
public:
@@ -132,16 +139,16 @@ StopReq::getInitialStart(const Uint32 & requestInfo)

inline
bool
StopReq::getEscalateOnNodeFail(const Uint32 & requestInfo)
StopReq::getStopAbort(const Uint32 & requestInfo)
{
  return requestInfo & 16;
  return requestInfo & 32;
}

inline
bool
StopReq::getStopAbort(const Uint32 & requestInfo)
StopReq::getStopNodes(const Uint32 & requestInfo)
{
  return requestInfo & 32;
  return requestInfo & 64;
}


@@ -187,24 +194,23 @@ StopReq::setInitialStart(Uint32 & requestInfo, bool value)

inline
void
StopReq::setEscalateOnNodeFail(Uint32 & requestInfo, bool value)
StopReq::setStopAbort(Uint32 & requestInfo, bool value)
{
  if(value)
    requestInfo |= 16;
    requestInfo |= 32;
  else
    requestInfo &= ~16;
    requestInfo &= ~32;
}

inline
void
StopReq::setStopAbort(Uint32 & requestInfo, bool value)
StopReq::setStopNodes(Uint32 & requestInfo, bool value)
{
  if(value)
    requestInfo |= 32;
    requestInfo |= 64;
  else
    requestInfo &= ~32;
    requestInfo &= ~64;
}


#endif
+11 −0
Original line number Diff line number Diff line
@@ -202,6 +202,7 @@ private:
  void execWAIT_GCP_CONF(Signal* signal);

  void execSTOP_REQ(Signal* signal);
  void execSTOP_CONF(Signal* signal);
  void execRESUME_REQ(Signal* signal);

  void execCHANGE_NODE_STATE_CONF(Signal* signal);
@@ -337,6 +338,16 @@ public:
    void progError(int line, int cause, const char * extra) { 
      cntr.progError(line, cause, extra); 
    }

    /**
     * Step identifiers for a stop request; the current step is kept in the
     * m_state member declared together with the enum.
     *
     * The numeric values are assigned explicitly; note the gap between
     * SR_WAIT_NODE_FAILURES (4) and SR_CLUSTER_SHUTDOWN (12).
     *
     * NOTE(review): the enumerator names appear to follow the multi-node
     * shutdown sequence (block GCP, wait for the GCP to complete, unblock
     * GCP, send stop request to QMGR, wait for node failures) -- confirm
     * the exact ordering and transitions against the implementation.
     */
    enum StopNodesStep {
      SR_BLOCK_GCP_START_GCP = 0,
      SR_WAIT_COMPLETE_GCP = 1,
      SR_UNBLOCK_GCP_START_GCP = 2,
      SR_QMGR_STOP_REQ = 3,
      SR_WAIT_NODE_FAILURES = 4,
      SR_CLUSTER_SHUTDOWN = 12
    } m_state;
    SignalCounter m_stop_req_counter;
  };
private:
  StopRecord c_stopRec;
+1 −0
Original line number Diff line number Diff line
@@ -86,6 +86,7 @@ Ndbcntr::Ndbcntr(const class Configuration & conf):
  addRecSignal(GSN_STOP_ME_CONF, &Ndbcntr::execSTOP_ME_CONF);

  addRecSignal(GSN_STOP_REQ, &Ndbcntr::execSTOP_REQ);
  addRecSignal(GSN_STOP_CONF, &Ndbcntr::execSTOP_CONF);
  addRecSignal(GSN_RESUME_REQ, &Ndbcntr::execRESUME_REQ);

  addRecSignal(GSN_WAIT_GCP_REF, &Ndbcntr::execWAIT_GCP_REF);
Loading