Commit d230d0e1 authored by unknown's avatar unknown
Browse files

ndb - wl2610, bug#18352

  Remove useless and tricky state fiddling in TC
    to synchronize NF_CompleteRep, as code is already present in DIH as well
  Keep broadcast of TAKE_OVERTCCONF for online upgrade


ndb/src/kernel/blocks/dblqh/DblqhMain.cpp:
  Add clever dump for showing active operations
ndb/src/kernel/blocks/dbtc/Dbtc.hpp:
  Remove useless and tricky state fiddling in TC
    to synchronize NF_CompleteRep, as code is already present in DIH as well
    Keep broadcast of TAKE_OVERTCCONF for online upgrade
ndb/src/kernel/blocks/dbtc/DbtcMain.cpp:
  Remove useless and tricky state fiddling in TC
    to synchronize NF_CompleteRep, as code is already present in DIH as well
    Keep broadcast of TAKE_OVERTCCONF for online upgrade
parent 51a093f1
Loading
Loading
Loading
Loading
+166 −0
Original line number Diff line number Diff line
@@ -18449,6 +18449,172 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal)
    SET_ERROR_INSERT_VALUE(5042);
  }
  TcConnectionrec *regTcConnectionrec = tcConnectionrec;
  Uint32 ttcConnectrecFileSize = ctcConnectrecFileSize;
  Uint32 arg = dumpState->args[0];
  if(arg == 2306)
  {
    for(Uint32 i = 0; i<1024; i++)
    {
      TcConnectionrecPtr tcRec;
      tcRec.i = ctransidHash[i];
      while(tcRec.i != RNIL)
      {
	ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec);
	ndbout << "TcConnectionrec " << tcRec.i;
	signal->theData[0] = 2307;
	signal->theData[1] = tcRec.i;
	execDUMP_STATE_ORD(signal);
	tcRec.i = tcRec.p->nextHashRec;
      }
    }
  }
  if(arg == 2307 || arg == 2308)
  {
    TcConnectionrecPtr tcRec;
    tcRec.i = signal->theData[1];
    ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec);
    
    ndbout << " transactionState = " << tcRec.p->transactionState<<endl;
    ndbout << " operation = " << tcRec.p->operation<<endl;
    ndbout << " tcNodeFailrec = " << tcRec.p->tcNodeFailrec
	   << " seqNoReplica = " << tcRec.p->seqNoReplica
	   << " simpleRead = " << tcRec.p->simpleRead
	   << endl;
    ndbout << " replicaType = " << tcRec.p->replicaType
	   << " reclenAiLqhkey = " << tcRec.p->reclenAiLqhkey
	   << " opExec = " << tcRec.p->opExec
	   << endl;
    ndbout << " opSimple = " << tcRec.p->opSimple
	   << " nextSeqNoReplica = " << tcRec.p->nextSeqNoReplica
	   << " lockType = " << tcRec.p->lockType
	   << endl;
    ndbout << " lastReplicaNo = " << tcRec.p->lastReplicaNo
	   << " indTakeOver = " << tcRec.p->indTakeOver
	   << " dirtyOp = " << tcRec.p->dirtyOp
	   << endl;
    ndbout << " activeCreat = " << tcRec.p->activeCreat
	   << " tcBlockref = " << hex << tcRec.p->tcBlockref
	   << " reqBlockref = " << hex << tcRec.p->reqBlockref
	   << " primKeyLen = " << tcRec.p->primKeyLen
	   << endl;
    ndbout << " nextReplica = " << tcRec.p->nextReplica
	   << " tcBlockref = " << hex << tcRec.p->tcBlockref
	   << " reqBlockref = " << hex << tcRec.p->reqBlockref
	   << " primKeyLen = " << tcRec.p->primKeyLen
	   << endl;
    ndbout << " logStopPageNo = " << tcRec.p->logStopPageNo
	   << " logStartPageNo = " << tcRec.p->logStartPageNo
	   << " logStartPageIndex = " << tcRec.p->logStartPageIndex
	   << endl;
    ndbout << " errorCode = " << tcRec.p->errorCode
	   << " clientBlockref = " << hex << tcRec.p->clientBlockref
	   << " applRef = " << hex << tcRec.p->applRef
	   << " totSendlenAi = " << tcRec.p->totSendlenAi
	   << endl;
    ndbout << " totReclenAi = " << tcRec.p->totReclenAi
	   << " tcScanRec = " << tcRec.p->tcScanRec
	   << " tcScanInfo = " << tcRec.p->tcScanInfo
	   << " tcOprec = " << hex << tcRec.p->tcOprec
	   << endl;
    ndbout << " tableref = " << tcRec.p->tableref
	   << " simpleTcConnect = " << tcRec.p->simpleTcConnect
	   << " storedProcId = " << tcRec.p->storedProcId
	   << " schemaVersion = " << tcRec.p->schemaVersion
	   << endl;
    ndbout << " reqinfo = " << tcRec.p->reqinfo
	   << " reqRef = " << tcRec.p->reqRef
	   << " readlenAi = " << tcRec.p->readlenAi
	   << " prevTc = " << tcRec.p->prevTc
	   << endl;
    ndbout << " prevLogTcrec = " << tcRec.p->prevLogTcrec
	   << " prevHashRec = " << tcRec.p->prevHashRec
	   << " nodeAfterNext0 = " << tcRec.p->nodeAfterNext[0]
	   << " nodeAfterNext1 = " << tcRec.p->nodeAfterNext[1]
	   << endl;
    ndbout << " nextTcConnectrec = " << tcRec.p->nextTcConnectrec
	   << " nextTc = " << tcRec.p->nextTc
	   << " nextTcLogQueue = " << tcRec.p->nextTcLogQueue
	   << " nextLogTcrec = " << tcRec.p->nextLogTcrec
	   << endl;
    ndbout << " nextHashRec = " << tcRec.p->nextHashRec
	   << " logWriteState = " << tcRec.p->logWriteState
	   << " logStartFileNo = " << tcRec.p->logStartFileNo
	   << " listState = " << tcRec.p->listState
	   << endl;
    ndbout << " lastAttrinbuf = " << tcRec.p->lastAttrinbuf
	   << " lastTupkeybuf = " << tcRec.p->lastTupkeybuf
	   << " hashValue = " << tcRec.p->hashValue
	   << endl;
    ndbout << " gci = " << tcRec.p->gci
	   << " fragmentptr = " << tcRec.p->fragmentptr
	   << " fragmentid = " << tcRec.p->fragmentid
	   << " firstTupkeybuf = " << tcRec.p->firstTupkeybuf
	   << endl;
    ndbout << " firstAttrinbuf = " << tcRec.p->firstAttrinbuf
	   << " currTupAiLen = " << tcRec.p->currTupAiLen
	   << " currReclenAi = " << tcRec.p->currReclenAi
	   << endl;
    ndbout << " tcTimer = " << tcRec.p->tcTimer
	   << " clientConnectrec = " << tcRec.p->clientConnectrec
	   << " applOprec = " << hex << tcRec.p->applOprec
	   << " abortState = " << tcRec.p->abortState
	   << endl;
    ndbout << " transid0 = " << hex << tcRec.p->transid[0]
	   << " transid1 = " << hex << tcRec.p->transid[1]
	   << " tupkeyData0 = " << tcRec.p->tupkeyData[0]
	   << " tupkeyData1 = " << tcRec.p->tupkeyData[1]
	   << endl;
    ndbout << " tupkeyData2 = " << tcRec.p->tupkeyData[2]
	   << " tupkeyData3 = " << tcRec.p->tupkeyData[3]
	   << endl;
    switch (tcRec.p->transactionState) {
	
    case TcConnectionrec::SCAN_STATE_USED:
      if (tcRec.p->tcScanRec < cscanrecFileSize){
	ScanRecordPtr TscanPtr;
	c_scanRecordPool.getPtr(TscanPtr, tcRec.p->tcScanRec);
	ndbout << " scanState = " << TscanPtr.p->scanState << endl;
	//TscanPtr.p->scanLocalref[2];
	ndbout << " copyPtr="<<TscanPtr.p->copyPtr
	       << " scanAccPtr="<<TscanPtr.p->scanAccPtr
	       << " scanAiLength="<<TscanPtr.p->scanAiLength
	       << endl;
	ndbout << " m_curr_batch_size_rows="<<
	  TscanPtr.p->m_curr_batch_size_rows
	       << " m_max_batch_size_rows="<<
	  TscanPtr.p->m_max_batch_size_rows
	       << " scanErrorCounter="<<TscanPtr.p->scanErrorCounter
	       << endl;
	ndbout << " scanSchemaVersion="<<TscanPtr.p->scanSchemaVersion
	       << "  scanStoredProcId="<<TscanPtr.p->scanStoredProcId
	       << "  scanTcrec="<<TscanPtr.p->scanTcrec
	       << endl;
	ndbout << "  scanType="<<TscanPtr.p->scanType
	       << "  scanApiBlockref="<<TscanPtr.p->scanApiBlockref
	       << "  scanNodeId="<<TscanPtr.p->scanNodeId
	       << "  scanCompletedStatus="<<TscanPtr.p->scanCompletedStatus
	       << endl;
	ndbout << "  scanFlag="<<TscanPtr.p->scanFlag
	       << "  scanLockHold="<<TscanPtr.p->scanLockHold
	       << "  scanLockMode="<<TscanPtr.p->scanLockMode
	       << "  scanNumber="<<TscanPtr.p->scanNumber
	       << endl;
	ndbout << "  scanReleaseCounter="<<TscanPtr.p->scanReleaseCounter
	       << "  scanTcWaiting="<<TscanPtr.p->scanTcWaiting
	       << "  scanKeyinfoFlag="<<TscanPtr.p->scanKeyinfoFlag
	       << endl;
      } else{
	ndbout << "No connected scan record found" << endl;
      }
      break;
    default:
      break;
    }
    ndbrequire(arg != 2308);
  }
  
}//Dblqh::execDUMP_STATE_ORD()
void Dblqh::execSET_VAR_REQ(Signal* signal) 
+0 −9
Original line number Diff line number Diff line
@@ -211,14 +211,6 @@ public:
    LTS_ACTIVE = 1
  };

  enum TakeOverState {
    TOS_NOT_DEFINED = 0,
    TOS_IDLE = 1,
    TOS_ACTIVE = 2,
    TOS_COMPLETED = 3,
    TOS_NODE_FAILED = 4
  };

  enum FailState {
    FS_IDLE = 0,
    FS_LISTENING = 1,
@@ -933,7 +925,6 @@ public:
  struct HostRecord {
    HostState hostStatus;
    LqhTransState lqhTransStatus;
    TakeOverState takeOverStatus;
    bool  inPackedList;
    UintR noOfPackedWordsLqh;
    UintR packedWordsLqh[26];
+42 −144
Original line number Diff line number Diff line
@@ -303,7 +303,6 @@ void Dbtc::execINCL_NODEREQ(Signal* signal)
  hostptr.i = signal->theData[1];
  ptrCheckGuard(hostptr, chostFilesize, hostRecord);
  hostptr.p->hostStatus = HS_ALIVE;
  hostptr.p->takeOverStatus = TOS_IDLE;
  signal->theData[0] = cownref;
  c_alive_nodes.set(hostptr.i);
  sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB);
@@ -856,8 +855,6 @@ void Dbtc::execREAD_NODESCONF(Signal* signal)
      hostptr.i = i;
      ptrCheckGuard(hostptr, chostFilesize, hostRecord);

      hostptr.p->takeOverStatus = TOS_IDLE;
      
      if (NodeBitmask::get(readNodes->inactiveNodes, i)) {
        jam();
        hostptr.p->hostStatus = HS_DEAD;
@@ -6826,21 +6823,27 @@ void Dbtc::execNODE_FAILREP(Signal* signal)
  const Uint32 tnewMasterId = nodeFail->masterNodeId;
  
  arrGuard(tnoOfNodes, MAX_NDB_NODES);
  Uint32 i;
  int index = 0;
  for (unsigned i = 1; i< MAX_NDB_NODES; i++) {
    if(NodeBitmask::get(nodeFail->theNodes, i)){
  for (i = 1; i< MAX_NDB_NODES; i++) 
  {
    if(NodeBitmask::get(nodeFail->theNodes, i))
    {
      cdata[index] = i;
      index++;
    }//if
  }//for

  cmasterNodeId = tnewMasterId;
  
  tcNodeFailptr.i = 0;
  ptrAss(tcNodeFailptr, tcFailRecord);
  Uint32 tindex;
  for (tindex = 0; tindex < tnoOfNodes; tindex++) {
  for (i = 0; i < tnoOfNodes; i++) 
  {
    jam();
    hostptr.i = cdata[tindex];
    hostptr.i = cdata[i];
    ptrCheckGuard(hostptr, chostFilesize, hostRecord);
    
    /*------------------------------------------------------------*/
    /*       SET STATUS OF THE FAILED NODE TO DEAD SINCE IT HAS   */
    /*       FAILED.                                              */
@@ -6849,30 +6852,15 @@ void Dbtc::execNODE_FAILREP(Signal* signal)
    hostptr.p->m_nf_bits = HostRecord::NF_NODE_FAIL_BITS;
    c_alive_nodes.clear(hostptr.i);

    if (hostptr.p->takeOverStatus == TOS_COMPLETED) {
      jam();
      /*------------------------------------------------------------*/
      /*       A VERY UNUSUAL SITUATION. THE TAKE OVER WAS COMPLETED*/
      /*       EVEN BEFORE WE HEARD ABOUT THE NODE FAILURE REPORT.  */
      /*       HOWEVER UNUSUAL THIS SITUATION IS POSSIBLE.          */
      /*------------------------------------------------------------*/
      /*       RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE    */
      /*       REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */
      /*       USED THEM IS COMPLETED.                              */
      /*------------------------------------------------------------*/
      hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER;
    } else {
      ndbrequire(hostptr.p->takeOverStatus == TOS_IDLE);
      hostptr.p->takeOverStatus = TOS_NODE_FAILED;
    }//if
    
    if (tcNodeFailptr.p->failStatus == FS_LISTENING) {
    if (tcNodeFailptr.p->failStatus == FS_LISTENING) 
    {
      jam();
      /*------------------------------------------------------------*/
      /*       THE CURRENT TAKE OVER CAN BE AFFECTED BY THIS NODE   */
      /*       FAILURE.                                             */
      /*------------------------------------------------------------*/
      if (hostptr.p->lqhTransStatus == LTS_ACTIVE) {
      if (hostptr.p->lqhTransStatus == LTS_ACTIVE) 
      {
	jam();
	/*------------------------------------------------------------*/
	/*       WE WERE WAITING FOR THE FAILED NODE IN THE TAKE OVER */
@@ -6884,78 +6872,25 @@ void Dbtc::execNODE_FAILREP(Signal* signal)
      }//if
    }//if
    
  }//for

  const bool masterFailed = (cmasterNodeId != tnewMasterId);
  cmasterNodeId = tnewMasterId;

  if(getOwnNodeId() == cmasterNodeId && masterFailed){
    if (getOwnNodeId() != tnewMasterId)
    {
      jam();
      /**
     * Master has failed and I'm the new master
       * Only master does takeover currently
       */
    jam();
    
    for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) {
      jam();
      ptrAss(hostptr, hostRecord);
      if (hostptr.p->hostStatus != HS_ALIVE) {
	jam();
	if (hostptr.p->takeOverStatus == TOS_COMPLETED) {
	  jam();
	  /*------------------------------------------------------------*/
	  /*       SEND TAKE OVER CONFIRMATION TO ALL ALIVE NODES IF    */
	  /*       TAKE OVER IS COMPLETED. THIS IS PERFORMED TO ENSURE  */
	  /*       THAT ALL NODES AGREE ON THE IDLE STATE OF THE TAKE   */
	  /*       OVER. THIS MIGHT BE MISSED IN AN ERROR SITUATION IF  */
	  /*       MASTER FAILS AFTER SENDING CONFIRMATION TO NEW       */
	  /*       MASTER BUT FAILING BEFORE SENDING TO ANOTHER NODE    */
	  /*       WHICH WAS NOT MASTER. IF THIS NODE LATER BECOMES     */
	  /*       MASTER IT MIGHT START A NEW TAKE OVER EVEN AFTER THE */
	  /*       CRASHED NODE HAVE ALREADY RECOVERED.                 */
	  /*------------------------------------------------------------*/
	  NodeReceiverGroup rg(DBTC, c_alive_nodes);
	  signal->theData[0] = hostptr.i;
	  sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB);
	}//if
      }//if
    }//for
      hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER;
    }

  if(getOwnNodeId() == cmasterNodeId){
    jam();
    for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) {
      jam();
      ptrAss(hostptr, hostRecord);
      if (hostptr.p->hostStatus != HS_ALIVE) {
        jam();
        if (hostptr.p->takeOverStatus == TOS_NODE_FAILED) {
    else
    {
      jam();
	  /*------------------------------------------------------------*/
	  /*       CONCLUDE ALL ACTIVITIES THE FAILED TC DID CONTROL    */
	  /*       SINCE WE ARE THE MASTER. THIS COULD HAVE BEEN STARTED*/
	  /*       BY A PREVIOUS MASTER BUT HAVE NOT BEEN CONCLUDED YET.*/
	  /*------------------------------------------------------------*/
          hostptr.p->takeOverStatus = TOS_ACTIVE;
      signal->theData[0] = hostptr.i;
      sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
        }//if
      }//if
    }//for
  }//if
  for (tindex = 0; tindex < tnoOfNodes; tindex++) {
    jam();
    hostptr.i = cdata[tindex];
    ptrCheckGuard(hostptr, chostFilesize, hostRecord);
    /*------------------------------------------------------------*/
    /*       LOOP THROUGH AND ABORT ALL SCANS THAT WHERE          */
    /*       CONTROLLED BY THIS TC AND ACTIVE IN THE FAILED       */
    /*       NODE'S LQH                                           */
    /*------------------------------------------------------------*/
    }

    checkScanActiveInFailedLqh(signal, 0, hostptr.i);
    checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid
    nodeFailCheckTransactions(signal, 0, hostptr.i);
  }//for

  }
}//Dbtc::execNODE_FAILREP()

void
@@ -7071,47 +7006,17 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* signal)
  tfailedNodeId = signal->theData[0];
  hostptr.i = tfailedNodeId;
  ptrCheckGuard(hostptr, chostFilesize, hostRecord);
  switch (hostptr.p->takeOverStatus) {
  case TOS_IDLE:
    jam();
    /*------------------------------------------------------------*/
    /*       THIS MESSAGE ARRIVED EVEN BEFORE THE NODE_FAILREP    */
    /*       MESSAGE. THIS IS POSSIBLE IN EXTREME SITUATIONS.     */
    /*       WE SET THE STATE TO TAKE_OVER_COMPLETED AND WAIT     */
    /*       FOR THE NODE_FAILREP MESSAGE.                        */
    /*------------------------------------------------------------*/
    hostptr.p->takeOverStatus = TOS_COMPLETED;
    break;
  case TOS_NODE_FAILED:
  case TOS_ACTIVE:
    jam();
    /*------------------------------------------------------------*/
    /*       WE ARE NOT MASTER AND THE TAKE OVER IS ACTIVE OR WE  */
    /*       ARE MASTER AND THE TAKE OVER IS ACTIVE. IN BOTH      */
    /*       WE SET THE STATE TO TAKE_OVER_COMPLETED.             */
    /*------------------------------------------------------------*/
    /*       RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE    */
    /*       REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */
    /*       USED THEM IS COMPLETED.                              */
    /*------------------------------------------------------------*/
    hostptr.p->takeOverStatus = TOS_COMPLETED;
    checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER);
    break;
  case TOS_COMPLETED:
    jam();
    /*------------------------------------------------------------*/
    /*       WE HAVE ALREADY RECEIVED THE CONF SIGNAL. IT IS MOST */
    /*       LIKELY SENT FROM A NEW MASTER WHICH WASN'T SURE IF   */
    /*       THIS NODE HEARD THE CONF SIGNAL FROM THE OLD MASTER. */
    /*       WE SIMPLY IGNORE THE MESSAGE.                        */
    /*------------------------------------------------------------*/
    /*empty*/;
    break;
  default:

  ndbout_c("received execTAKE_OVERTCCONF(%d) from %x (%x)",
	   tfailedNodeId, signal->getSendersBlockRef(), reference());
  if (signal->getSendersBlockRef() != reference())
  {
    jam();
    systemErrorLab(signal);
    return;
  }//switch
  }
  
  
  checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER);
}//Dbtc::execTAKE_OVERTCCONF()

void Dbtc::execTAKE_OVERTCREQ(Signal* signal) 
@@ -7351,16 +7256,10 @@ void Dbtc::completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd)
    /*       TO REPORT THE COMPLETION OF THE TAKE OVER TO ALL     */
    /*       NODES THAT ARE ALIVE.                                */
    /*------------------------------------------------------------*/
    for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) {
      jam();
      ptrAss(hostptr, hostRecord);
      if (hostptr.p->hostStatus == HS_ALIVE) {
        jam();
        tblockref = calcTcBlockRef(hostptr.i);
    NodeReceiverGroup rg(DBTC, c_alive_nodes);
    signal->theData[0] = tcNodeFailptr.p->takeOverNode;
        sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB);
      }//if
    }//for
    sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB);
    
    if (tcNodeFailptr.p->queueIndex > 0) {
      jam();
      /*------------------------------------------------------------*/
@@ -9937,7 +9836,6 @@ void Dbtc::inithost(Signal* signal)
    ptrAss(hostptr, hostRecord);
    hostptr.p->hostStatus = HS_DEAD;
    hostptr.p->inPackedList = false;
    hostptr.p->takeOverStatus = TOS_NOT_DEFINED;
    hostptr.p->lqhTransStatus = LTS_IDLE;
    hostptr.p->noOfWordsTCKEYCONF = 0;
    hostptr.p->noOfWordsTCINDXCONF = 0;