Commit 7a787690 authored by unknown
Browse files

ndb - bug#25364

  On master node failure during the QMGR COMMIT_FAILREQ phase,
    make sure to remove all committed failed nodes from the failed/prepfailed arrays


ndb/include/kernel/signaldata/DumpStateOrd.hpp:
  new error code
ndb/src/kernel/blocks/ERROR_codes.txt:
  new error code
ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  extra error insert variable
ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  make sure to remove all committed failed nodes from failed/prepfailed arrays
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
parent 9a79c0dc
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -68,6 +68,7 @@ public:
    // 100-105 TUP and ACC  
    // 200-240 UTIL
    // 300-305 TRIX
    QmgrErr935 = 935,
    NdbfsDumpFileStat = 400,
    NdbfsDumpAllFiles = 401,
    NdbfsDumpOpenFiles = 402,
+3 −0
Original line number Diff line number Diff line
@@ -21,6 +21,9 @@ Crash president when he starts to run in ArbitState 1-9.

910: Crash new president after node crash

935 : Crash master on node failure (delayed) 
      and skip sending GSN_COMMIT_FAILREQ to specified node

ERROR CODES FOR TESTING NODE FAILURE, GLOBAL CHECKPOINT HANDLING:
-----------------------------------------------------------------

+4 −0
Original line number Diff line number Diff line
@@ -426,6 +426,10 @@ private:
  
  StopReq c_stopReq;
  bool check_multi_node_shutdown(Signal* signal);

#ifdef ERROR_INSERT
  Uint32 c_error_insert_extra;
#endif
};

#endif
+59 −13
Original line number Diff line number Diff line
@@ -3110,6 +3110,18 @@ Qmgr::sendCommitFailReq(Signal* signal)
  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    jam();
    ptrAss(nodePtr, nodeRec);

#ifdef ERROR_INSERT    
    if (ERROR_INSERTED(935) && nodePtr.i == c_error_insert_extra)
    {
      ndbout_c("skipping node %d", c_error_insert_extra);
      CLEAR_ERROR_INSERT_VALUE;
      signal->theData[0] = 9999;
      sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
      continue;
    }
#endif

    if (nodePtr.p->phase == ZRUNNING) {
      jam();
      nodePtr.p->sendCommitFailReqStatus = Q_ACTIVE;
@@ -3180,6 +3192,33 @@ void Qmgr::execPREP_FAILREF(Signal* signal)
  return;
}//Qmgr::execPREP_FAILREF()

/**
 * Remove from dst[0..dstcnt) every node id that also occurs in
 * src[0..srccnt). The surviving ids are compacted to the front of dst and
 * keep their original relative order; entries at or beyond the returned
 * count are left as they were.
 *
 * @param dstcnt  number of valid entries in dst
 * @param dst     array of node ids, filtered in place
 * @param srccnt  number of valid entries in src
 * @param src     node ids that must be removed from dst
 * @return        number of valid entries remaining in dst
 */
static
Uint32
clear_nodes(Uint32 dstcnt, Uint16 dst[], Uint32 srccnt, const Uint16 src[])
{
  if (srccnt == 0)
    return dstcnt;  // nothing to remove, dst is unchanged

  Uint32 pos = 0;   // next write position for a kept node id
  for (Uint32 i = 0; i < dstcnt; i++)
  {
    const Uint16 node = dst[i];
    bool found = false;
    for (Uint32 j = 0; j < srccnt; j++)
    {
      // Compare against the list of nodes to remove (src), not against
      // dst itself, which is being rewritten while we iterate.
      if (node == src[j])
      {
        found = true;
        break;
      }
    }
    if (!found)
    {
      // pos <= i always holds, so this never overwrites an unread entry
      dst[pos++] = node;
    }
  }
  return pos;
}

/*---------------------------------------------------------------------------*/
/*    THE PRESIDENT IS NOW COMMITTING THE PREVIOUSLY PREPARED NODE FAILURE.  */
/*---------------------------------------------------------------------------*/
@@ -3267,19 +3306,18 @@ void Qmgr::execCOMMIT_FAILREQ(Signal* signal)
		   NodeFailRep::SignalLength, JBB);
      }//if
    }//for
    if (cpresident != getOwnNodeId()) {
      jam();
      cnoFailedNodes = cnoCommitFailedNodes - cnoFailedNodes;
      if (cnoFailedNodes > 0) {
        jam();
        guard0 = cnoFailedNodes - 1;
        arrGuard(guard0 + cnoCommitFailedNodes, MAX_NDB_NODES);
        for (Tj = 0; Tj <= guard0; Tj++) {
          jam();
          cfailedNodes[Tj] = cfailedNodes[Tj + cnoCommitFailedNodes];
        }//for
      }//if
    }//if

    /**
     * Remove committed nodes from failed/prepared
     */
    cnoFailedNodes = clear_nodes(cnoFailedNodes, 
				 cfailedNodes, 
				 cnoCommitFailedNodes, 
				 ccommitFailedNodes);
    cnoPrepFailedNodes = clear_nodes(cnoPrepFailedNodes, 
				     cprepFailedNodes,
				     cnoCommitFailedNodes,
				     ccommitFailedNodes);
    cnoCommitFailedNodes = 0;
  }//if
  /**-----------------------------------------------------------------------
@@ -4658,6 +4696,14 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal)
  default:
    ;
  }//switch

#ifdef ERROR_INSERT
  if (signal->theData[0] == 935 && signal->getLength() == 2)
  {
    SET_ERROR_INSERT_VALUE(935);
    c_error_insert_extra = signal->theData[1];
  }
#endif
}//Qmgr::execDUMP_STATE_ORD()

void Qmgr::execSET_VAR_REQ(Signal* signal) 
+43 −0
Original line number Diff line number Diff line
@@ -955,6 +955,46 @@ int runBug24717(NDBT_Context* ctx, NDBT_Step* step){
  return NDBT_OK;
}

/**
 * Regression test for bug#25364.
 *
 * Each loop arms error insert 935 on the current master, passing a victim
 * node id as the extra argument (per ERROR_codes.txt: crash the master,
 * delayed, on node failure and skip sending GSN_COMMIT_FAILREQ to the given
 * node). It then restarts a third node to trigger node-failure handling,
 * and finally brings the master and the restarted node back up.
 *
 * The test is skipped (reported as NDBT_OK) when the cluster has fewer
 * than 4 data nodes.
 *
 * @return NDBT_OK on success or skip, NDBT_FAILED if any restarter
 *         operation reports an error.
 */
int runBug25364(NDBT_Context* ctx, NDBT_Step* step){
  NdbRestarter restarter;
  int loops = ctx->getNumLoops();
  
  if (restarter.getNumDbNodes() < 4)
    return NDBT_OK;

  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };

  for (; loops; loops --)
  {
    int master = restarter.getMasterNodeId();
    // NOTE(review): judging by the helper names, victim is in a different
    // node group than the master and second shares the victim's node
    // group - confirm against NdbRestarter.
    int victim = restarter.getRandomNodeOtherNodeGroup(master, rand());
    int second = restarter.getRandomNodeSameNodeGroup(victim, rand());
    
    // Arm error insert 935 on the master with victim as the node to skip
    int dump[] = { DumpStateOrd::QmgrErr935, victim } ;
    if (restarter.dumpStateOneNode(master, dump, 2))
      return NDBT_FAILED;
  
    if (restarter.dumpStateOneNode(master, val2, 2))
      return NDBT_FAILED;
  
    // Taking down "second" is the node failure that triggers the insert
    if (restarter.restartOneDbNode(second, false, true, true))
      return NDBT_FAILED;

    int nodes[2] = { master, second };
    if (restarter.waitNodesNoStart(nodes, 2))
      return NDBT_FAILED;

    // Fail fast if the start request itself is rejected, instead of
    // only noticing it later in waitNodesStarted
    if (restarter.startNodes(nodes, 2))
      return NDBT_FAILED;

    if (restarter.waitNodesStarted(nodes, 2))
      return NDBT_FAILED;
  }
  
  return NDBT_OK;
}


NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", 
@@ -1271,6 +1311,9 @@ TESTCASE("Bug20185",
TESTCASE("Bug24717", ""){
  INITIALIZER(runBug24717);
}
// Registers test case "Bug25364" (empty description string) with
// runBug25364 as its INITIALIZER step.
TESTCASE("Bug25364", ""){
  INITIALIZER(runBug25364);
}
NDBT_TESTSUITE_END(testNodeRestart);

int main(int argc, const char** argv){
Loading