Commit 3465c5bc authored by jonas@perch.ndb.mysql.com's avatar jonas@perch.ndb.mysql.com
Browse files

ndb - bug#31525

  Fix bug regarding node that missed 2 LCP's (that was not included in next LCP after SR)
parent c9f7d224
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4029
Next DBLQH 5045
Next DBDICT 6007
Next DBDIH 7186
Next DBDIH 7193
Next DBTC 8054
Next CMVMI 9000
Next BACKUP 10038
@@ -155,6 +155,9 @@ And crash when all have "not" been sent
7027: Crash in  master when changing state to LCP_TAB_SAVED
7018: Crash in  master when changing state to LCP_TAB_SAVED

7191: Crash when receiving LCP_COMPLETE_REP
7192: Crash in setLcpActiveStatusStart - when dead node missed two LCP's

ERROR CODES FOR TESTING NODE FAILURE, FAILURE IN COPY FRAGMENT PROCESS:
-----------------------------------------------------------------------

+4 −0
Original line number Diff line number Diff line
@@ -10853,6 +10853,8 @@ void Dbdih::execLCP_COMPLETE_REP(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7191);
#if 0
  g_eventLogger.info("LCP_COMPLETE_REP"); 
  printLCP_COMPLETE_REP(stdout, 
@@ -13603,6 +13605,7 @@ void Dbdih::setLcpActiveStatusStart(Signal* signal)
	// It must be taken over with the copy fragment process after a system
	// crash. We indicate this by setting the active status to TAKE_OVER.
	/*-------------------------------------------------------------------*/
	c_lcpState.m_participatingLQH.set(nodePtr.i);
        nodePtr.p->activeStatus = Sysfile::NS_TakeOver;
        //break; // Fall through
      case Sysfile::NS_TakeOver:{
@@ -13645,6 +13648,7 @@ void Dbdih::setLcpActiveStatusStart(Signal* signal)
        break;
      case Sysfile::NS_ActiveMissed_2:
        jam();
        CRASH_INSERTION(7192);
        if ((nodePtr.p->nodeStatus == NodeRecord::ALIVE) &&
            (!nodePtr.p->copyCompleted)) {
          jam();
+77 −0
Original line number Diff line number Diff line
@@ -1668,6 +1668,80 @@ runBug28717(NDBT_Context* ctx, NDBT_Step* step)
  return NDBT_OK;
}

/**
 * Regression test for bug#31525.
 *
 * Picks the master node and its successor, makes sure both live in the
 * same node group (restarting the successor until that holds), arms a set
 * of error inserts that bring the whole cluster down, then performs a
 * system restart followed by a restart of the second node.
 *
 * @param ctx   test context (not used by this test)
 * @param step  test step (not used by this test)
 * @return NDBT_OK when the cluster has fewer than two data nodes (nothing
 *         to test) or when every restarter call succeeds; NDBT_FAILED as
 *         soon as any restarter call reports a non-zero result.
 */
int
runBug31525(NDBT_Context* ctx, NDBT_Step* step)
{
  (void)ctx;
  (void)step;

  NdbRestarter res;

  // The scenario needs a master plus at least one other data node.
  if (res.getNumDbNodes() < 2)
  {
    return NDBT_OK;
  }

  int nodes[2];
  nodes[0] = res.getMasterNodeId();
  nodes[1] = res.getNextMasterNodeId(nodes[0]);
  
  // Keep restarting the successor until the node that follows the master
  // belongs to the master's node group.
  while (res.getNodeGroup(nodes[0]) != res.getNodeGroup(nodes[1]))
  {
    // nodes[] holds ints, so print them with %d.
    ndbout_c("Restarting %d as it not in same node group as %d",
             nodes[1], nodes[0]);
    if (res.restartOneDbNode(nodes[1], false, true, true))
      return NDBT_FAILED;
    
    if (res.waitNodesNoStart(nodes+1, 1))
      return NDBT_FAILED;
    
    if (res.startNodes(nodes+1, 1))
      return NDBT_FAILED;
    
    if (res.waitClusterStarted())
      return NDBT_FAILED;

    // Re-read the successor after the restart.
    nodes[1] = res.getNextMasterNodeId(nodes[0]);
  }
  
  ndbout_c("nodes[0]: %d nodes[1]: %d", nodes[0], nodes[1]);
  
  // NOTE(review): presumably shortens the interval between local
  // checkpoints so LCPs run back to back - confirm against DumpStateOrd.
  int val = DumpStateOrd::DihMinTimeBetweenLCP;
  if (res.dumpStateAllNodes(&val, 1))
    return NDBT_FAILED;

  // NOTE(review): presumably makes a node that hits an error insert come
  // back in no-start state instead of exiting - confirm.
  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };  
  if (res.dumpStateAllNodes(val2, 2))
    return NDBT_FAILED;
  
  // NOTE(review): the meaning of error insert 932 is not visible from
  // here - confirm in the error-code list.
  if (res.insertErrorInAllNodes(932))
    return NDBT_FAILED;

  // 7192: crash in setLcpActiveStatusStart (NS_ActiveMissed_2 case).
  if (res.insertErrorInNode(nodes[1], 7192))
    return NDBT_FAILED;
  
  // 7191: crash when receiving LCP_COMPLETE_REP.
  if (res.insertErrorInNode(nodes[0], 7191))
    return NDBT_FAILED;
  
  // Wait for all nodes to reach no-start, then bring the cluster back up.
  if (res.waitClusterNoStart())
    return NDBT_FAILED;

  if (res.startAll())
    return NDBT_FAILED;
  
  if (res.waitClusterStarted())
    return NDBT_FAILED;

  // Finally restart the second node once more and wait for it to rejoin.
  if (res.restartOneDbNode(nodes[1], false, false, true))
    return NDBT_FAILED;

  if (res.waitClusterStarted())
    return NDBT_FAILED;
  
  return NDBT_OK;
}

NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", 
	 "Test that one node at a time can be stopped and then restarted "\
@@ -1991,6 +2065,9 @@ TESTCASE("Bug21271",
  STEP(runPkUpdateUntilStopped);
  FINALIZER(runClearTable);
}
// Test case for bug#31525: its only step is the initializer runBug31525.
TESTCASE("Bug31525", ""){
  INITIALIZER(runBug31525);
}
// Test case "Bug24717": its only step is the initializer runBug24717.
TESTCASE("Bug24717", ""){
  INITIALIZER(runBug24717);
}
+4 −0
Original line number Diff line number Diff line
@@ -934,3 +934,7 @@ max-time: 1500
cmd: testSystemRestart
args: -n SR_DD_2b_LCP D2

max-time: 600
cmd: testNodeRestart
args: -n Bug31525 T1