Commit 9ed1b843 authored by unknown's avatar unknown
Browse files

Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart

- signals were sometimes sent too early when setting up subscriptions


ndb/include/kernel/signaldata/DumpStateOrd.hpp:
  added dump for active subscriptions in cmvmi
ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp:
  added dump for active subscriptions in cmvmi
ndb/src/mgmsrv/MgmtSrvr.cpp:
  bug: signals were sent before api reg conf had arrived, causing signals to be thrown away and subsequent hangs in the mgmt server
  also added a retry if a node is connected but its api reg conf has not yet been received
ndb/src/ndbapi/ClusterMgr.cpp:
  added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/ClusterMgr.hpp:
  added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/SignalSender.cpp:
  assert to see that node is sendable when signal is sent
ndb/src/ndbapi/SignalSender.hpp:
  make method const
parent 2ed7eaf5
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -107,6 +107,10 @@ public:
    CmvmiDumpLongSignalMemory = 2601,
    CmvmiSetRestartOnErrorInsert = 2602,
    CmvmiTestLongSigWithDelay = 2603,
    CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
                                      to be able to debug if events
                                      for some reason do not end up
                                      in clusterlog */
    // 7000 DIH
    // 7001 DIH
    // 7002 DIH
+19 −1
Original line number Diff line number Diff line
@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal)
  case TimeToWaitAlive:

    // QMGR
  case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt ocks
  case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
  case HeartbeatIntervalDbApi:
  case ArbitTimeout:
    sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
    }
  }
  
  if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
  {
    SubscriberPtr ptr;
    subscribers.first(ptr);  
    g_eventLogger.info("List subscriptions:");
    while(ptr.i != RNIL)
    {
      g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
                         ptr.i,  refToNode(ptr.p->blockRef), ptr.p->blockRef);
      for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
      {
        Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
        g_eventLogger.info("Category %u Level %u", i, level);
      }
      subscribers.next(ptr);
    }
  }

  if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
    infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
	      g_sectionSegmentPool.getSize(),
+74 −20
Original line number Diff line number Diff line
@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond)
    return WRONG_PROCESS_TYPE;
  // Check if we have contact with it
  if(unCond){
    if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected)
    if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
      return 0;
  }
  else if (theFacade->get_node_alive(nodeId) == true)
@@ -1562,12 +1562,17 @@ MgmtSrvr::status(int nodeId,
}

int 
MgmtSrvr::setEventReportingLevelImpl(int nodeId, 
MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg, 
				     const EventSubscribeReq& ll)
{
  SignalSender ss(theFacade);
  NdbNodeBitmask nodes;
  int retries = 30;
  nodes.clear();
  while (1)
  {
    Uint32 nodeId, max;
    ss.lock();

    SimpleSignal ssig;
    EventSubscribeReq * dst = 
      CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
@@ -1575,19 +1580,67 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
             EventSubscribeReq::SignalLength);
    *dst = ll;

  NodeBitmask nodes;
  nodes.clear();
  Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId;
  for(; (Uint32) nodeId <= max; nodeId++)
    if (nodeId_arg == 0)
    {
      // all nodes
      nodeId = 1;
      max = MAX_NDB_NODES;
    }
    else
    {
      // only one node
      max = nodeId = nodeId_arg;
    }
    // first make sure nodes are sendable
    for(; nodeId <= max; nodeId++)
    {
      if (nodeTypes[nodeId] != NODE_TYPE_DB)
        continue;
      if (okToSendTo(nodeId, true))
      {
        if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected  == false)
        {
          // node not connected we can safely skip this one
          continue;
    if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
        }
        // api_reg_conf not received yet, need to retry
        break;
      }
    }
    if (nodeId <= max)
    {
      if (--retries)
      {
        ss.unlock();
        NdbSleep_MilliSleep(100);  
        continue;
      }
      return SEND_OR_RECEIVE_FAILED;
    }

    if (nodeId_arg == 0)
    {
      // all nodes
      nodeId = 1;
      max = MAX_NDB_NODES;
    }
    else
    {
      // only one node
      max = nodeId = nodeId_arg;
    }
    // now send to all sendable nodes
    // note, lock is held, so states have not changed
    for(; (Uint32) nodeId <= max; nodeId++)
    {
      if (nodeTypes[nodeId] != NODE_TYPE_DB)
        continue;
      if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected  == false)
        continue; // node is not connected, skip
      if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
        nodes.set(nodeId);
    }
    break;
  }

  if (nodes.isclear())
@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
  int error = 0;
  while (!nodes.isclear())
  {
    Uint32 nodeId;
    SimpleSignal *signal = ss.waitFor();
    int gsn = signal->readSignalNumber();
    nodeId = refToNode(signal->header.theSendersBlockRef);
+4 −1
Original line number Diff line number Diff line
@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
// Default constructor: m_state is initialised from NodeState::SL_NOTHING,
// compatible and nfCompleteRep start out true, and the connection/liveness
// flags start out false.
ClusterMgr::Node::Node()
  : m_state(NodeState::SL_NOTHING) { 
  compatible = nfCompleteRep = true;
  // NOTE(review): the next two statements look like the before/after lines of
  // a diff hunk rendered together; the second one additionally clears
  // m_api_reg_conf. Confirm against the real file which one is present.
  connected = defined = m_alive = false; 
  connected = defined = m_alive = m_api_reg_conf = false; 
  // Start with an empty set of connected nodes recorded in the node state.
  m_state.m_connected_nodes.clear();
}

@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
					      node.m_info.m_version);
  }

  node.m_api_reg_conf = true;

  node.m_state = apiRegConf->nodeState;
  if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED  ||
			  node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){

  noOfConnectedNodes--;
  theNodes[nodeId].connected = false;
  theNodes[nodeId].m_api_reg_conf = false;
  theNodes[nodeId].m_state.m_connected_nodes.clear();

  reportNodeFailed(nodeId, true);
+1 −0
Original line number Diff line number Diff line
@@ -65,6 +65,7 @@ public:
    bool compatible;    // Version is compatible
    bool nfCompleteRep; // NF Complete Rep has arrived
    bool m_alive;       // Node is alive
    bool m_api_reg_conf;// API_REGCONF has arrived
    
    NodeInfo  m_info;
    NodeState m_state;
Loading