Commit 2f955d19 authored by unknown's avatar unknown
Browse files

Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart

- added more retries to wait for node failure to complete
Bug #19039 multi node failure causes node failure handling not to complete
- patch to avoid this scenario when the management server is used to perform the stop
- wait for NF_COMPLETE_REP in management server before returning
ndb: allocate nodeid
- only retry on retryable error

parent 51ddd6ff
Loading
Loading
Loading
Loading
+7 −1
Original line number Diff line number Diff line
@@ -232,6 +232,12 @@ extern "C" {
    /** Could not connect to socket */
    NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET = 1011,

    /* Alloc node id failures */
    /** Generic error, retry may succeed */
    NDB_MGM_ALLOCID_ERROR = 1101,
    /** Non-retryable error */
    NDB_MGM_ALLOCID_CONFIG_MISMATCH = 1102,

    /* Service errors - Start/Stop Node or System */
    /** Start failed */
    NDB_MGM_START_FAILED = 2001,
@@ -999,7 +1005,7 @@ extern "C" {
  void ndb_mgm_destroy_configuration(struct ndb_mgm_configuration *);

  int ndb_mgm_alloc_nodeid(NdbMgmHandle handle,
			   unsigned version, int nodetype);
			   unsigned version, int nodetype, int log_event);

  /**
   * End Session
+4 −2
Original line number Diff line number Diff line
@@ -349,12 +349,14 @@ ConfigRetriever::allocNodeId(int no_retries, int retry_delay_in_seconds)
	if(!ndb_mgm_connect(m_handle, 0, 0, 0))
	  goto next;

      res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type);
      res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type,
                                no_retries == 0 /* only log last retry */);
      if(res >= 0)
	return _ownNodeId= (Uint32)res;

  next:
      if (no_retries == 0)
      int error = ndb_mgm_get_latest_error(m_handle);
      if (no_retries == 0 || error == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
	break;
      no_retries--;
      NdbSleep_SecSleep(retry_delay_in_seconds);
+2 −1
Original line number Diff line number Diff line
@@ -286,7 +286,8 @@ Configuration::fetch_configuration(){
  if (globalData.ownId)
    cr.setNodeId(globalData.ownId);

  globalData.ownId = cr.allocNodeId(2 /*retry*/,3 /*delay*/);
  globalData.ownId = cr.allocNodeId(globalData.ownId ? 10 : 2 /*retry*/,
                                    3 /*delay*/);
  
  if(globalData.ownId == 0){
    ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, 
+9 −4
Original line number Diff line number Diff line
@@ -1868,7 +1868,8 @@ const char *ndb_mgm_get_connectstring(NdbMgmHandle handle, char *buf, int buf_sz

extern "C"
int
ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype,
                     int log_event)
{
  CHECK_HANDLE(handle, 0);
  CHECK_CONNECTED(handle, 0);
@@ -1888,9 +1889,11 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
  args.put("endian", (endian_check.c[sizeof(long)-1])?"big":"little");
  if (handle->m_name)
    args.put("name", handle->m_name);
  args.put("log_event", log_event);

  const ParserRow<ParserDummy> reply[]= {
    MGM_CMD("get nodeid reply", NULL, ""),
      MGM_ARG("error_code", Int, Optional, "Error code"),
      MGM_ARG("nodeid", Int, Optional, "Error message"),
      MGM_ARG("result", String, Mandatory, "Error message"),
    MGM_END()
@@ -1903,14 +1906,16 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
  nodeid= -1;
  do {
    const char * buf;
    if(!prop->get("result", &buf) || strcmp(buf, "Ok") != 0){
    if (!prop->get("result", &buf) || strcmp(buf, "Ok") != 0)
    {
      const char *hostname= ndb_mgm_get_connected_host(handle);
      unsigned port=  ndb_mgm_get_connected_port(handle);
      BaseString err;
      Uint32 error_code= NDB_MGM_ALLOCID_ERROR;
      err.assfmt("Could not alloc node id at %s port %d: %s",
		 hostname, port, buf);
      setError(handle, NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET, __LINE__,
	       err.c_str());
      prop->get("error_code", &error_code);
      setError(handle, error_code, __LINE__, err.c_str());
      break;
    }
    Uint32 _nodeid;
+107 −39
Original line number Diff line number Diff line
@@ -507,9 +507,10 @@ MgmtSrvr::MgmtSrvr(SocketServer *socket_server,
  if (_ownNodeId == 0) // we did not get node id from other server
  {
    NodeId tmp= m_config_retriever->get_configuration_nodeid();
    int error_code;

    if (!alloc_node_id(&tmp, NDB_MGM_NODE_TYPE_MGM,
		       0, 0, error_string)){
		       0, 0, error_code, error_string)){
      ndbout << "Unable to obtain requested nodeid: "
	     << error_string.c_str() << endl;
      require(false);
@@ -1118,31 +1119,16 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
      const NFCompleteRep * const rep =
	CAST_CONSTPTR(NFCompleteRep, signal->getDataPtr());
#ifdef VM_TRACE
      ndbout_c("Node %d fail completed", rep->failedNodeId);
      ndbout_c("sendSTOP_REQ Node %d fail completed", rep->failedNodeId);
#endif
      nodes.clear(rep->failedNodeId); // clear the failed node
      if (singleUserNodeId == 0)
        stoppedNodes.set(rep->failedNodeId);
      break;
    }
    case GSN_NODE_FAILREP:{
      const NodeFailRep * const rep =
	CAST_CONSTPTR(NodeFailRep, signal->getDataPtr());
      NodeBitmask failedNodes;
      failedNodes.assign(NodeBitmask::Size, rep->theNodes);
#ifdef VM_TRACE
      {
	ndbout << "Failed nodes:";
	for (unsigned i = 0; i < 32*NodeBitmask::Size; i++)
	  if(failedNodes.get(i))
	    ndbout << " " << i;
	ndbout << endl;
      }
#endif
      failedNodes.bitAND(nodes);
      if (!failedNodes.isclear())
      {
	nodes.bitANDC(failedNodes); // clear the failed nodes
	if (singleUserNodeId == 0)
	  stoppedNodes.bitOR(failedNodes);
      }
      break;
    }
    default:
@@ -1263,11 +1249,47 @@ int MgmtSrvr::restartNodes(const Vector<NodeId> &node_ids,
                        abort,
                        false,
                        true,
                        nostart,
                        true,
                        initialStart);

  if (ret)
    return ret;

  if (stopCount)
    *stopCount = nodes.count();
  return ret;
  
  // start up the nodes again
  int waitTime = 12000;
  NDB_TICKS maxTime = NdbTick_CurrentMillisecond() + waitTime;
  for (unsigned i = 0; i < node_ids.size(); i++)
  {
    NodeId nodeId= node_ids[i];
    enum ndb_mgm_node_status s;
    s = NDB_MGM_NODE_STATUS_NO_CONTACT;
#ifdef VM_TRACE
    ndbout_c("Waiting for %d not started", nodeId);
#endif
    while (s != NDB_MGM_NODE_STATUS_NOT_STARTED && waitTime > 0)
    {
      Uint32 startPhase = 0, version = 0, dynamicId = 0, nodeGroup = 0;
      Uint32 connectCount = 0;
      bool system;
      const char *address;
      status(nodeId, &s, &version, &startPhase, 
             &system, &dynamicId, &nodeGroup, &connectCount, &address);
      NdbSleep_MilliSleep(100);  
      waitTime = (maxTime - NdbTick_CurrentMillisecond());
    }
  }

  if (nostart)
    return 0;

  for (unsigned i = 0; i < node_ids.size(); i++)
  {
    int result = start(node_ids[i]);
  }
  return 0;
}

/*
@@ -1918,7 +1940,8 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
			enum ndb_mgm_node_type type,
			struct sockaddr *client_addr, 
			SOCKET_SIZE_TYPE *client_addr_len,
			BaseString &error_string)
			int &error_code, BaseString &error_string,
                        int log_event)
{
  DBUG_ENTER("MgmtSrvr::alloc_node_id");
  DBUG_PRINT("enter", ("nodeid=%d, type=%d, client_addr=%d",
@@ -1927,6 +1950,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
    if (*nodeId == 0) {
      error_string.appfmt("no-nodeid-checks set in management server.\n"
			  "node id must be set explicitly in connectstring");
      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
      DBUG_RETURN(false);
    }
    DBUG_RETURN(true);
@@ -1951,8 +1975,10 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,

  if(NdbMutex_Lock(m_configMutex))
  {
    // should not happen
    error_string.appfmt("unable to lock configuration mutex");
    return false;
    error_code = NDB_MGM_ALLOCID_ERROR;
    DBUG_RETURN(false);
  }
  ndb_mgm_configuration_iterator
    iter(* _config->m_configValues, CFG_SECTION_NODE);
@@ -2023,6 +2049,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
			  "or specifying unique host names in config file.",
			  id_found, tmp);
      NdbMutex_Unlock(m_configMutex);
      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
      DBUG_RETURN(false);
    }
    if (config_hostname == 0) {
@@ -2031,6 +2058,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
			  "or specifying unique host names in config file,\n"
			  "or specifying just one mgmt server in config file.",
			  tmp);
      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
      DBUG_RETURN(false);
    }
    id_found= tmp; // mgmt server matched, check for more matches
@@ -2072,7 +2100,8 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
    
    char tmp_str[128];
    m_reserved_nodes.getText(tmp_str);
    g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, m_reserved_nodes %s.",
    g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, "
                       "m_reserved_nodes %s.",
                       id_found, get_connect_address(id_found), tmp_str);
    DBUG_RETURN(true);
  }
@@ -2093,26 +2122,48 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
    type_c_string.assfmt("%s(%s)", alias, str);
  }

  if (*nodeId == 0) {
  if (*nodeId == 0)
  {
    if (found_matching_id)
    {
      if (found_matching_type)
      {
	if (found_free_node)
        {
	  error_string.appfmt("Connection done from wrong host ip %s.",
			      (client_addr)?
                              inet_ntoa(((struct sockaddr_in *)
					 (client_addr))->sin_addr):"");
          error_code = NDB_MGM_ALLOCID_ERROR;
        }
	else
        {
	  error_string.appfmt("No free node id found for %s.",
			      type_string.c_str());
          error_code = NDB_MGM_ALLOCID_ERROR;
        }
      }
      else
      {
	error_string.appfmt("No %s node defined in config file.",
			    type_string.c_str());
        error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
      }
    }
    else
    {
      error_string.append("No nodes defined in config file.");
  } else {
      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
    }
  }
  else
  {
    if (found_matching_id)
    {
      if (found_matching_type)
	if (found_free_node) {
      {
	if (found_free_node)
        {
	  // have to split these into two since inet_ntoa overwrites itself
	  error_string.appfmt("Connection with id %d done from wrong host ip %s,",
			      *nodeId, inet_ntoa(((struct sockaddr_in *)
@@ -2120,27 +2171,44 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
	  error_string.appfmt(" expected %s(%s).", config_hostname,
			      r_config_addr ?
			      "lookup failed" : inet_ntoa(config_addr));
	} else
          error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
	}
        else
        {
	  error_string.appfmt("Id %d already allocated by another node.",
			      *nodeId);
          error_code = NDB_MGM_ALLOCID_ERROR;
        }
      }
      else
      {
	error_string.appfmt("Id %d configured as %s, connect attempted as %s.",
			    *nodeId, type_c_string.c_str(),
			    type_string.c_str());
        error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
      }
    }
    else
    {
      error_string.appfmt("No node defined with id=%d in config file.",
			  *nodeId);
      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
    }
  }

  if (log_event || error_code == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
  {
    g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s."
                          " Returned error string \"%s\"",
                          *nodeId,
			client_addr != 0 ? inet_ntoa(((struct sockaddr_in *)(client_addr))->sin_addr) : "<none>",
                          client_addr != 0
                          ? inet_ntoa(((struct sockaddr_in *)
                                       (client_addr))->sin_addr)
                          : "<none>",
                          error_string.c_str());

    NodeBitmask connected_nodes2;
    get_connected_nodes(connected_nodes2);
  {
    BaseString tmp_connected, tmp_not_connected;
    for(Uint32 i = 0; i < MAX_NODES; i++)
    {
Loading