Commit 0a27a90a authored by unknown's avatar unknown
Browse files

ndb - bug#25801

  - improve error message if starting without enough REDO
  - decrease likelihood of trying to start too early


storage/ndb/include/mgmapi/ndbd_exit_codes.h:
  Add new error code (that maybe should have been there a looong time)
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Add new check (during SR) that sufficient REDO is present
    before continuing SR
storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  Add list of per-node GCIs so that we can check for sufficient REDO during a SR
storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  Add check for REDO during SR
    so that
  1) cluster is not trying to start too soon
  2) a better error message (than internal error) is provided if not enough REDO is present
storage/ndb/src/kernel/error/ndbd_exit_codes.c:
  Add new error code (that maybe should have been there a looong time)
parent 6acbe4eb
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -146,6 +146,7 @@ typedef ndbd_exit_classification_enum ndbd_exit_classification;
#define NDBD_EXIT_AFS_READ_UNDERFLOW        2816

#define NDBD_EXIT_INVALID_LCP_FILE          2352
#define NDBD_EXIT_INSUFFICENT_NODES         2353

const char *
ndbd_exit_message(int faultId, ndbd_exit_classification *cl);
+53 −6
Original line number Diff line number Diff line
@@ -1194,12 +1194,59 @@ void Dbdih::execTAB_COMMITREQ(Signal* signal)
/**
 * Handler for DIH_RESTARTREQ. The request has two forms, selected by
 * signal->theData[0]:
 *
 *  - Non-zero: the value is stored in cntrlblockref. On an initial start
 *    GSN_DIH_RESTARTREF is sent back to that reference, otherwise
 *    readGciFileLab() is invoked.
 *
 *  - Zero: a REDO sufficiency check. theData[1..] holds a node bitmask
 *    (NdbNodeBitmask::Size words) immediately followed by one GCI word per
 *    node, indexed by node id. For every node group the highest GCI among
 *    the nodes in the mask is computed. If two node groups with a non-zero
 *    GCI disagree, theData[0] is set to the index of the first such
 *    disagreeing group (relative to the lowest-numbered group that has a
 *    GCI); otherwise theData[0] is set to MAX_NDB_NODES.
 */
void Dbdih::execDIH_RESTARTREQ(Signal* signal)
{
  jamEntry();

  const Uint32 requesterRef = signal->theData[0];
  if (requesterRef != 0)
  {
    jam();
    cntrlblockref = requesterRef;
    if (!m_ctx.m_config.getInitialStart())
    {
      readGciFileLab(signal);
      return;
    }
    sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal, 1, JBB);
    return;
  }

  /**
   * Precondition (not checked):
   *   at least 1 node in each node group
   */
  NdbNodeBitmask candidates;
  candidates.assign(NdbNodeBitmask::Size, signal->theData + 1);
  const Uint32 *gciOfNode = signal->theData + 1 + NdbNodeBitmask::Size;

  // One extra slot: the scan below may stop at index MAX_NDB_NODES and
  // still read the array there.
  Uint32 bestGciOfGroup[MAX_NDB_NODES + 1];
  bzero(bestGciOfGroup, sizeof(bestGciOfGroup));

  // Record, per node group, the highest GCI reported by any candidate node.
  Uint32 nodeId;
  for (nodeId = 0; nodeId < MAX_NDB_NODES; nodeId++)
  {
    if (!candidates.get(nodeId))
      continue;

    jam();
    const Uint32 group = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups);
    ndbrequire(group < MAX_NDB_NODES);
    const Uint32 nodeGci = gciOfNode[nodeId];
    if (nodeGci > bestGciOfGroup[group])
    {
      jam();
      bestGciOfGroup[group] = nodeGci;
    }
  }

  // Locate the first node group that has a GCI at all.
  Uint32 groupIdx = 0;
  while (groupIdx < MAX_NDB_NODES && bestGciOfGroup[groupIdx] == 0)
    groupIdx++;

  // Every remaining group that has a GCI must match the reference one.
  const Uint32 referenceGci = bestGciOfGroup[groupIdx];
  for (groupIdx++; groupIdx < MAX_NDB_NODES; groupIdx++)
  {
    jam();
    const Uint32 groupGci = bestGciOfGroup[groupIdx];
    if (groupGci != 0 && groupGci != referenceGci)
    {
      jam();
      signal->theData[0] = groupIdx;
      return;
    }
  }

  // No disagreement found.
  signal->theData[0] = MAX_NDB_NODES;
  return;
}//Dbdih::execDIH_RESTARTREQ()
@@ -12391,7 +12438,7 @@ void Dbdih::makeNodeGroups(Uint32 nodeArray[])
	(buf, sizeof(buf), 
	 "Illegal initial start, no alive node in nodegroup %u", i);
      progError(__LINE__, 
		NDBD_EXIT_SR_RESTARTCONFLICT,
		NDBD_EXIT_INSUFFICENT_NODES,
		buf);
      
    }
+1 −0
Original line number Diff line number Diff line
@@ -128,6 +128,7 @@ public:
    Uint32 m_president_candidate_gci;
    Uint16 m_regReqReqSent;
    Uint16 m_regReqReqRecv;
    Uint32 m_node_gci[MAX_NDB_NODES];
  } c_start;
  
  NdbNodeBitmask c_definedNodes; // DB nodes in config
+83 −22
Original line number Diff line number Diff line
@@ -1093,6 +1093,7 @@ void Qmgr::execCM_REGREF(Signal* signal)
    jam();
    c_start.m_starting_nodes_w_log.set(TaddNodeno);
  }
  c_start.m_node_gci[TaddNodeno] = node_gci;

  skip_nodes.bitAND(c_definedNodes);
  c_start.m_skip_nodes.bitOR(skip_nodes);
@@ -1242,6 +1243,7 @@ Qmgr::check_startup(Signal* signal)
  wait.bitANDC(tmp);

  Uint32 retVal = 0;
  Uint32 incompleteng = MAX_NDB_NODES; // Illegal value
  NdbNodeBitmask report_mask;

  if ((c_start.m_latest_gci == 0) || 
@@ -1327,7 +1329,7 @@ Qmgr::check_startup(Signal* signal)
        report_mask.assign(c_definedNodes);
        report_mask.bitANDC(c_start.m_starting_nodes);
        retVal = 1;
        goto start_report;
        goto check_log;
      case CheckNodeGroups::Partitioning:
        ndbrequire(result != CheckNodeGroups::Lose);
        signal->theData[1] = 
@@ -1335,7 +1337,7 @@ Qmgr::check_startup(Signal* signal)
        report_mask.assign(c_definedNodes);
        report_mask.bitANDC(c_start.m_starting_nodes);
        retVal = 1;
        goto start_report;
        goto check_log;
      }
    }

@@ -1359,12 +1361,7 @@ Qmgr::check_startup(Signal* signal)
    case CheckNodeGroups::Partitioning:
      if (now < partitioned_timeout && result != CheckNodeGroups::Win)
      {
        signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5;
        signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
        report_mask.assign(c_definedNodes);
        report_mask.bitANDC(c_start.m_starting_nodes);
        retVal = 0;
        goto start_report;
        goto missinglog;
      }
      // Fall through...
    case CheckNodeGroups::Win:
@@ -1372,12 +1369,61 @@ Qmgr::check_startup(Signal* signal)
        all ? 0x8001 : (result == CheckNodeGroups::Win ? 0x8002 : 0x8003);
      report_mask.assign(c_definedNodes);
      report_mask.bitANDC(c_start.m_starting_nodes);
      retVal = 1;
      goto start_report;
      retVal = 2;
      goto check_log;
    }
  }
  ndbrequire(false);

check_log:
  jam();
  {
    Uint32 save[4+4*NdbNodeBitmask::Size];
    memcpy(save, signal->theData, sizeof(save));
    
    signal->theData[0] = 0;
    c_start.m_starting_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);
    memcpy(signal->theData+1+NdbNodeBitmask::Size, c_start.m_node_gci,
	   4*MAX_NDB_NODES);
    EXECUTE_DIRECT(DBDIH, GSN_DIH_RESTARTREQ, signal, 
		   1+NdbNodeBitmask::Size+MAX_NDB_NODES);
    
    incompleteng = signal->theData[0];
    memcpy(signal->theData, save, sizeof(save));

    if (incompleteng != MAX_NDB_NODES)
    {
      jam();
      if (retVal == 1)
      {
	jam();
	goto incomplete_log;
      }
      else if (retVal == 2)
      {
	if (now <= partitioned_timeout)
	{
	  jam();
	  goto missinglog;
	}
	else
	{
	  goto incomplete_log;
	}
      }
      ndbrequire(false);
    }
  }
  goto start_report;

missinglog:
  signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5;
  signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
  report_mask.assign(c_definedNodes);
  report_mask.bitANDC(c_start.m_starting_nodes);
  retVal = 0;
  goto start_report;
  
start_report:
  jam();
  {
@@ -1396,6 +1442,7 @@ Qmgr::check_startup(Signal* signal)
  
missing_nodegroup:
  jam();
  {
    char buf[100], mask1[100], mask2[100];
    c_start.m_starting_nodes.getText(mask1);
    tmp.assign(c_start.m_starting_nodes);
@@ -1405,9 +1452,23 @@ Qmgr::check_startup(Signal* signal)
			 "Unable to start missing node group! "
			 " starting: %s (missing fs for: %s)",
			 mask1, mask2);
  progError(__LINE__, NDBD_EXIT_SR_RESTARTCONFLICT, buf);
    progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
    return 0;                                     // Deadcode
  }

incomplete_log:
  jam();
  {
    char buf[100], mask1[100];
    c_start.m_starting_nodes.getText(mask1);
    BaseString::snprintf(buf, sizeof(buf),
			 "Incomplete log for node group: %d! "
			 " starting nodes: %s",
			 incompleteng, mask1);
    progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
    return 0;                                     // Deadcode
  }
}

void
Qmgr::electionWon(Signal* signal){
+1 −0
Original line number Diff line number Diff line
@@ -160,6 +160,7 @@ static const ErrStruct errArray[] =
   {NDBD_EXIT_AFS_READ_UNDERFLOW        , XFI, "Read underflow"},
   
   {NDBD_EXIT_INVALID_LCP_FILE, XFI, "Invalid LCP" },
   {NDBD_EXIT_INSUFFICENT_NODES, XRE, "Insufficent nodes for system restart" },
   
   /* Sentinel */
   {0, XUE,