[op-conductor] part 2 - core control logic (#8854)

* Implement main control logic * Add more tests

[op-conductor] part 2 - core control logic (#8854)
* Implement main control logic * Add more tests
8ba2e1e1 · Francis Li · GitHub · fe6dfa6f · 8ba2e1e1 · 8ba2e1e1
Commit 8ba2e1e1 authored Jan 08, 2024 by Francis Li Committed by GitHub Jan 08, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 476 additions and 17 deletions

service.go op-conductor/conductor/service.go +130 -7

service_test.go op-conductor/conductor/service_test.go +346 -10

No files found.
--- a/op-conductor/conductor/service.go
+++ b/op-conductor/conductor/service.go
@@ -3,12 +3,15 @@ package conductor
 import (
 	"context"
 	"fmt"
+	"math/rand"
 	"sync"
 	"sync/atomic"
+	"time"
 	"github.com/ethereum/go-ethereum/log"
 	"github.com/ethereum/go-ethereum/rpc"
 	"github.com/hashicorp/go-multierror"
+	"github.com/hashicorp/raft"
 	"github.com/pkg/errors"
 	"github.com/ethereum-optimism/optimism/op-conductor/client"
@@ -136,6 +139,7 @@ func (c *OpConductor) initConsensus(ctx context.Context) error {
 		return errors.Wrap(err, "failed to create raft consensus")
 	}
 	c.cons = cons
+	c.leaderUpdateCh = c.cons.LeaderCh()
 	return nil
 }
@@ -165,6 +169,7 @@ func (c *OpConductor) initHealthMonitor(ctx context.Context) error {
 		node,
 		p2p,
 	)
+	c.healthUpdateCh = c.hmon.Subscribe()
 	return nil
 }
@@ -191,7 +196,9 @@ type OpConductor struct {
 	healthy   atomic.Bool
 	seqActive atomic.Bool
-	actionFn func() // actionFn defines the action to be executed to bring the sequencer to the desired state.
+	healthUpdateCh <-chan bool
+	leaderUpdateCh <-chan bool
+	actionFn       func() // actionFn defines the action to be executed to bring the sequencer to the desired state.
 	wg             sync.WaitGroup
 	pauseCh        chan struct{}
@@ -225,8 +232,12 @@ func (oc *OpConductor) Start(ctx context.Context) error {
 // Stop implements cliapp.Lifecycle.
 func (oc *OpConductor) Stop(ctx context.Context) error {
-	oc.log.Info("stopping OpConductor")
+	if oc.Stopped() {
+		oc.log.Info("OpConductor already stopped")
+		return nil
+	}
+	oc.log.Info("stopping OpConductor")
 	var result *multierror.Error
 	// close control loop
@@ -286,16 +297,14 @@ func (oc *OpConductor) Paused() bool {
 func (oc *OpConductor) loop() {
 	defer oc.wg.Done()
-	healthUpdate := oc.hmon.Subscribe()
-	leaderUpdate := oc.cons.LeaderCh()
 	for {
 		select {
 		// We process status update (health, leadership) first regardless of the paused state.
 		// This way we could properly bring the sequencer to the desired state when resumed.
-		case healthy := <-healthUpdate:
+		case healthy := <-oc.healthUpdateCh:
 			oc.handleHealthUpdate(healthy)
-		case leader := <-leaderUpdate:
+		case leader := <-oc.leaderUpdateCh:
 			oc.handleLeaderUpdate(leader)
 		case <-oc.pauseCh:
 			oc.paused.Store(true)
@@ -349,5 +358,119 @@ func (oc *OpConductor) action() {
 		return
 	}
-	// TODO: (https://github.com/ethereum-optimism/protocol-quest/issues/47) implement
+	var err error
+	// exhaust all cases below for completeness, 3 state, 8 cases.
+	switch status := struct{ leader, healthy, active bool }{oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load()}; {
+	case !status.leader && !status.healthy && !status.active:
+		// if follower is not healthy and not sequencing, just log an error
+		oc.log.Error("server (follower) is not healthy", "server", oc.cons.ServerID())
+	case !status.leader && !status.healthy && status.active:
+		// sequencer is not leader, not healthy, but it is sequencing, stop it
+		err = oc.stopSequencer()
+	case !status.leader && status.healthy && !status.active:
+		// normal follower, do nothing
+	case !status.leader && status.healthy && status.active:
+		// stop sequencer, this happens when current server steps down as leader.
+		err = oc.stopSequencer()
+	case status.leader && !status.healthy && !status.active:
+		// transfer leadership to another node
+		err = oc.transferLeader()
+	case status.leader && !status.healthy && status.active:
+		var result *multierror.Error
+		// Try to stop sequencer first, but since sequencer is not healthy, we may not be able to stop it.
+		// In this case, it's fine to continue to try to transfer leadership to another server. This is safe because
+		// 1. if leadership transfer succeeded, then we'll retry and enter case !status.leader && !status.healthy && status.active, which will try to stop sequencer.
+		// 2. even if the retry continues to fail and current server stays in active sequencing mode, it would be safe because our hook in op-node will prevent it from committing any new blocks to the network via p2p (if it's not leader any more)
+		if e := oc.stopSequencer(); e != nil {
+			result = multierror.Append(result, e)
+		}
+		// try to transfer leadership to another server regardless of whether the sequencer was stopped. There are 4 scenarios here:
+		// 1. [sequencer stopped, leadership transfer succeeded] which is the happy case and we handed over sequencing to another server.
+		// 2. [sequencer stopped, leadership transfer failed] we'll enter into case status.leader && !status.healthy && !status.active and retry transfer leadership.
+		// 3. [sequencer active, leadership transfer succeeded] we'll enter into case !status.leader && !status.healthy && status.active and retry stop sequencer.
+		// 4. [sequencer active, leadership transfer failed] we're in the same state and will retry here again.
+		if e := oc.transferLeader(); e != nil {
+			result = multierror.Append(result, e)
+		}
+		err = result.ErrorOrNil()
+	case status.leader && status.healthy && !status.active:
+		// start sequencer
+		err = oc.startSequencer()
+	case status.leader && status.healthy && status.active:
+		// normal leader, do nothing
+	}
+	if err != nil {
+		oc.log.Error("failed to execute step, queueing another one to retry", "err", err)
+		// randomly sleep for 0-200ms to avoid excessive retry
+		time.Sleep(time.Duration(rand.Intn(200)) * time.Millisecond)
+		oc.queueAction()
+	}
+}
+// transferLeader tries to transfer leadership to another server.
+//
+// It returns nil in two cases: the transfer succeeded (the cached leader flag is cleared),
+// or consensus reported raft.ErrNotLeader (a warning is logged and the cached leader flag is
+// left untouched). Any other error is logged and returned unchanged.
+func (oc *OpConductor) transferLeader() error {
+	// TransferLeader here will do round robin to try to transfer leadership to the next healthy node.
+	err := oc.cons.TransferLeader()
+	if err == nil {
+		// update the cached leadership flag immediately on success
+		oc.leader.Store(false)
+		return nil // success
+	}
+	switch {
+	case errors.Is(err, raft.ErrNotLeader):
+		// This node is not the leader, do nothing.
+		oc.log.Warn("cannot transfer leadership since current server is not the leader")
+		return nil
+	default:
+		oc.log.Error("failed to transfer leadership", "err", err)
+		return err
+	}
+}
+// stopSequencer asks the sequencer control client to stop sequencing.
+//
+// seqActive is set to false only after the call succeeds; on failure the error is wrapped
+// and returned, and seqActive keeps its previous value.
+func (oc *OpConductor) stopSequencer() error {
+	oc.log.Info("stopping sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
+	// The first return value of StopSequencer is discarded; only the error is inspected.
+	// The call uses context.Background(), so no caller deadline or cancellation applies to it.
+	if _, err := oc.ctrl.StopSequencer(context.Background()); err != nil {
+		return errors.Wrap(err, "failed to stop sequencer")
+	}
+	oc.seqActive.Store(false)
+	return nil
+}
+// startSequencer starts sequencing on top of the latest unsafe payload known to consensus.
+//
+// StartSequencer is only called when the block hash of that payload equals the hash of the latest
+// unsafe block reported by the sequencer control client. On a mismatch the function returns
+// ErrUnsafeHeadMismarch without starting the sequencer. seqActive is set to true only after
+// StartSequencer succeeds. All remote calls use context.Background().
+func (oc *OpConductor) startSequencer() error {
+	oc.log.Info("starting sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
+	// When starting sequencer, we need to make sure that the current node has the latest unsafe head from the consensus protocol
+	// If not, then we wait for the unsafe head to catch up or gossip it to op-node manually from op-conductor.
+	unsafeInCons := oc.cons.LatestUnsafePayload()
+	unsafeInNode, err := oc.ctrl.LatestUnsafeBlock(context.Background())
+	if err != nil {
+		return errors.Wrap(err, "failed to get latest unsafe block from EL during startSequencer phase")
+	}
+	if unsafeInCons.BlockHash != unsafeInNode.Hash() {
+		oc.log.Warn(
+			"latest unsafe block in consensus is not the same as the one in op-node",
+			"consensus_hash", unsafeInCons.BlockHash,
+			"consensus_block_num", unsafeInCons.BlockNumber,
+			"node_hash", unsafeInNode.Hash(),
+			"node_block_num", unsafeInNode.NumberU64(),
+		)
+		// Unsigned subtraction: the result is 1 only when consensus is exactly one block ahead of the node.
+		// If the node is ahead, the difference wraps around to a huge value and the branch is skipped.
+		if uint64(unsafeInCons.BlockNumber)-unsafeInNode.NumberU64() == 1 {
+			// tries to post the unsafe head to op-node when head is only 1 block behind (most likely due to gossip delay)
+			// A failed post is only logged; the mismatch error below is returned either way.
+			if err = oc.ctrl.PostUnsafePayload(context.Background(), &unsafeInCons); err != nil {
+				oc.log.Error("failed to post unsafe head payload to op-node", "err", err)
+			}
+		}
+		return ErrUnsafeHeadMismarch // return error to allow retry
+	}
+	if err := oc.ctrl.StartSequencer(context.Background(), unsafeInCons.BlockHash); err != nil {
+		return errors.Wrap(err, "failed to start sequencer")
+	}
+	oc.seqActive.Store(true)
+	return nil
 }
--- a/op-conductor/conductor/service_test.go
+++ b/op-conductor/conductor/service_test.go
@@ -2,13 +2,16 @@ package conductor
 import (
 	"context"
+	"errors"
 	"math/big"
-	"os"
+	"sync"
 	"testing"
 	"time"
+	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/common/hexutil"
 	"github.com/ethereum/go-ethereum/log"
-	"github.com/stretchr/testify/require"
+	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/suite"
 	clientmocks "github.com/ethereum-optimism/optimism/op-conductor/client/mocks"
@@ -17,20 +20,20 @@ import (
 	"github.com/ethereum-optimism/optimism/op-node/rollup"
 	"github.com/ethereum-optimism/optimism/op-service/eth"
 	"github.com/ethereum-optimism/optimism/op-service/testlog"
+	"github.com/ethereum-optimism/optimism/op-service/testutils"
 )
 func mockConfig(t *testing.T) Config {
 	now := uint64(time.Now().Unix())
-	dir, err := os.MkdirTemp("/tmp", "")
-	require.NoError(t, err)
 	return Config{
 		ConsensusAddr:  "127.0.0.1",
 		ConsensusPort:  50050,
 		RaftServerID:   "SequencerA",
-		RaftStorageDir: dir,
+		RaftStorageDir: "/tmp/raft",
 		RaftBootstrap:  false,
 		NodeRPC:        "http://node:8545",
 		ExecutionRPC:   "http://geth:8545",
+		Paused:         false,
 		HealthCheck: HealthCheckConfig{
 			Interval:     1,
 			SafeInterval: 5,
@@ -84,6 +87,9 @@ type OpConductorTestSuite struct {
 	ctrl    *clientmocks.SequencerControl
 	cons    *consensusmocks.Consensus
 	hmon    *healthmocks.HealthMonitor
+	next chan struct{}
+	wg   sync.WaitGroup
 }
 func (s *OpConductorTestSuite) SetupSuite() {
@@ -91,30 +97,70 @@ func (s *OpConductorTestSuite) SetupSuite() {
 	s.log = testlog.Logger(s.T(), log.LvlDebug)
 	s.cfg = mockConfig(s.T())
 	s.version = "v0.0.1"
+	s.next = make(chan struct{}, 1)
+}
+func (s *OpConductorTestSuite) SetupTest() {
+	// initialize for every test so that method call count starts from 0
 	s.ctrl = &clientmocks.SequencerControl{}
 	s.cons = &consensusmocks.Consensus{}
 	s.hmon = &healthmocks.HealthMonitor{}
 	s.cons.EXPECT().ServerID().Return("SequencerA")
-}
-func (s *OpConductorTestSuite) SetupTest() {
 	conductor, err := NewOpConductor(s.ctx, &s.cfg, s.log, s.version, s.ctrl, s.cons, s.hmon)
 	s.NoError(err)
 	s.conductor = conductor
 	s.healthUpdateCh = make(chan bool)
 	s.hmon.EXPECT().Start().Return(nil)
-	s.hmon.EXPECT().Subscribe().Return(s.healthUpdateCh)
+	s.conductor.healthUpdateCh = s.healthUpdateCh
 	s.leaderUpdateCh = make(chan bool)
-	s.cons.EXPECT().LeaderCh().Return(s.leaderUpdateCh)
+	s.conductor.leaderUpdateCh = s.leaderUpdateCh
 	err = s.conductor.Start(s.ctx)
 	s.NoError(err)
 	s.False(s.conductor.Stopped())
 }
+// TearDownTest stops the conductor and checks that it reports itself as stopped.
+// Tests that already stopped the conductor themselves still pass here, because Stop
+// returns nil early when the conductor is already stopped.
+func (s *OpConductorTestSuite) TearDownTest() {
+	// register the mock expectations that Stop relies on before calling it
+	s.hmon.EXPECT().Stop().Return(nil)
+	s.cons.EXPECT().Shutdown().Return(nil)
+	s.NoError(s.conductor.Stop(s.ctx))
+	s.True(s.conductor.Stopped())
+}
+// enableSynchronization wraps conductor actionFn with extra synchronization logic
+// so that we could control the execution of actionFn and observe the internal state transition in between.
+// Each invocation of the wrapped actionFn blocks until the test sends on s.next, then runs the real
+// action and signals completion through s.wg.
+func (s *OpConductorTestSuite) enableSynchronization() {
+	s.conductor.actionFn = func() {
+		<-s.next // wait until the test releases this action
+		s.conductor.action()
+		s.wg.Done() // paired with the s.wg.Add(1) in execute
+	}
+}
+// execute releases exactly one synchronized action and blocks until it has finished.
+// fn, when non-nil, is called after the release token has been queued on s.next; the tests use it
+// to send the status update that triggers the action. It is meant to be used after enableSynchronization.
+func (s *OpConductorTestSuite) execute(fn func()) {
+	s.wg.Add(1)
+	// s.next is created with a buffer of 1 in SetupSuite, so this send does not need a waiting receiver.
+	s.next <- struct{}{}
+	if fn != nil {
+		fn()
+	}
+	s.wg.Wait()
+}
+// updateStatusAndExecuteAction sends status on ch (one of the health / leader update channels in the tests)
+// and waits until one synchronized action has run to completion.
+func (s *OpConductorTestSuite) updateStatusAndExecuteAction(ch chan bool, status bool) {
+	fn := func() {
+		ch <- status
+	}
+	s.execute(fn)
+}
+// executeAction releases one synchronized action without sending any status update and waits for it to finish.
+// The tests use it to step through an action that the conductor queued as a retry after a failure.
+func (s *OpConductorTestSuite) executeAction() {
+	s.execute(nil)
+}
 // Scenario 1: pause -> resume -> stop
 func (s *OpConductorTestSuite) TestControlLoop1() {
 	// Pause
@@ -159,6 +205,13 @@ func (s *OpConductorTestSuite) TestControlLoop2() {
 	err = s.conductor.Resume(s.ctx)
 	s.NoError(err)
 	s.False(s.conductor.Paused())
+	// Stop
+	s.hmon.EXPECT().Stop().Return(nil)
+	s.cons.EXPECT().Shutdown().Return(nil)
+	err = s.conductor.Stop(s.ctx)
+	s.NoError(err)
+	s.True(s.conductor.Stopped())
 }
 // Scenario 3: pause -> stop
@@ -176,6 +229,289 @@ func (s *OpConductorTestSuite) TestControlLoop3() {
 	s.True(s.conductor.Stopped())
 }
+// In this test, we have a follower that is not healthy and not sequencing, it becomes leader through election and we expect it to transfer leadership to another node.
+// [follower, not healthy, not sequencing] -- become leader --> [leader, not healthy, not sequencing] -- transfer leadership --> [follower, not healthy, not sequencing]
+func (s *OpConductorTestSuite) TestScenario1() {
+	s.enableSynchronization()
+	// set initial state
+	s.conductor.leader.Store(false)
+	s.conductor.healthy.Store(false)
+	s.conductor.seqActive.Store(false)
+	// the mocked transfer succeeds, so the conductor is expected to clear its leader flag
+	s.cons.EXPECT().TransferLeader().Return(nil)
+	// become leader
+	s.updateStatusAndExecuteAction(s.leaderUpdateCh, true)
+	// expect to transfer leadership, go back to [follower, not healthy, not sequencing]
+	s.False(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	s.cons.AssertCalled(s.T(), "TransferLeader")
+}
+// In this test, we have a follower that is not healthy and not sequencing. It becomes healthy and we expect it to stay as follower and not start sequencing.
+// [follower, not healthy, not sequencing] -- become healthy --> [follower, healthy, not sequencing]
+// No mock expectations are registered in this test.
+func (s *OpConductorTestSuite) TestScenario2() {
+	s.enableSynchronization()
+	// set initial state
+	s.conductor.leader.Store(false)
+	s.conductor.healthy.Store(false)
+	s.conductor.seqActive.Store(false)
+	// become healthy
+	s.updateStatusAndExecuteAction(s.healthUpdateCh, true)
+	// expect to stay as follower, go to [follower, healthy, not sequencing]
+	s.False(s.conductor.leader.Load())
+	s.True(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+}
+// In this test, we have a follower that is healthy and not sequencing, we send a leader update to it and expect it to start sequencing.
+// [follower, healthy, not sequencing] -- become leader --> [leader, healthy, sequencing]
+// Unlike the other scenarios, the initial state is asserted rather than set explicitly.
+func (s *OpConductorTestSuite) TestScenario3() {
+	s.enableSynchronization()
+	// consensus payload and node block share the same number and hash, so the unsafe heads match
+	mockPayload := eth.ExecutionPayload{
+		BlockNumber: 1,
+		Timestamp:   hexutil.Uint64(time.Now().Unix()),
+		BlockHash:   [32]byte{1, 2, 3},
+	}
+	mockBlockInfo := &testutils.MockBlockInfo{
+		InfoNum:  1,
+		InfoHash: [32]byte{1, 2, 3},
+	}
+	s.cons.EXPECT().LatestUnsafePayload().Return(mockPayload).Times(1)
+	s.ctrl.EXPECT().LatestUnsafeBlock(mock.Anything).Return(mockBlockInfo, nil).Times(1)
+	s.ctrl.EXPECT().StartSequencer(mock.Anything, mock.Anything).Return(nil).Times(1)
+	// [follower, healthy, not sequencing]
+	s.False(s.conductor.leader.Load())
+	s.True(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	// become leader
+	s.updateStatusAndExecuteAction(s.leaderUpdateCh, true)
+	// [leader, healthy, sequencing]
+	s.True(s.conductor.leader.Load())
+	s.True(s.conductor.healthy.Load())
+	s.True(s.conductor.seqActive.Load())
+	s.ctrl.AssertCalled(s.T(), "StartSequencer", mock.Anything, mock.Anything)
+	s.ctrl.AssertCalled(s.T(), "LatestUnsafeBlock", mock.Anything)
+}
+// This test setup is the same as Scenario 3, the difference is that scenario 3 is all happy case and in this test, we exercise the unsafe head mismatch path and its retry.
+// [follower, healthy, not sequencing] -- become leader, unsafe head does not match, retry, eventually succeed --> [leader, healthy, sequencing]
+// As in Scenario 3, the initial state is not set explicitly here.
+func (s *OpConductorTestSuite) TestScenario4() {
+	s.enableSynchronization()
+	// unsafe in consensus is 1 block ahead of unsafe in sequencer, we try to post the unsafe payload to sequencer and return error to allow retry
+	// this is normal because the latest unsafe (in consensus) might not arrive at sequencer through p2p yet
+	mockPayload := eth.ExecutionPayload{
+		BlockNumber: 2,
+		Timestamp:   hexutil.Uint64(time.Now().Unix()),
+		BlockHash:   [32]byte{1, 2, 3},
+	}
+	mockBlockInfo := &testutils.MockBlockInfo{
+		InfoNum:  1,
+		InfoHash: [32]byte{2, 3, 4},
+	}
+	s.cons.EXPECT().LatestUnsafePayload().Return(mockPayload).Times(1)
+	s.ctrl.EXPECT().LatestUnsafeBlock(mock.Anything).Return(mockBlockInfo, nil).Times(1)
+	s.ctrl.EXPECT().PostUnsafePayload(mock.Anything, mock.Anything).Return(nil).Times(1)
+	s.updateStatusAndExecuteAction(s.leaderUpdateCh, true)
+	// [leader, healthy, not sequencing]
+	s.True(s.conductor.leader.Load())
+	s.True(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	s.ctrl.AssertNotCalled(s.T(), "StartSequencer", mock.Anything, mock.Anything)
+	s.ctrl.AssertNumberOfCalls(s.T(), "LatestUnsafeBlock", 1)
+	s.ctrl.AssertNumberOfCalls(s.T(), "PostUnsafePayload", 1)
+	s.cons.AssertNumberOfCalls(s.T(), "LatestUnsafePayload", 1)
+	// unsafe caught up, we try to start sequencer at specified block and succeeds
+	// mockBlockInfo is a pointer, so mutating it changes the block info the mock hands back
+	mockBlockInfo.InfoNum = 2
+	mockBlockInfo.InfoHash = [32]byte{1, 2, 3}
+	s.cons.EXPECT().LatestUnsafePayload().Return(mockPayload).Times(1)
+	s.ctrl.EXPECT().LatestUnsafeBlock(mock.Anything).Return(mockBlockInfo, nil).Times(1)
+	s.ctrl.EXPECT().StartSequencer(mock.Anything, mockBlockInfo.InfoHash).Return(nil).Times(1)
+	// run the retry that the conductor queued after the mismatch
+	s.executeAction()
+	// [leader, healthy, sequencing]
+	s.True(s.conductor.leader.Load())
+	s.True(s.conductor.healthy.Load())
+	s.True(s.conductor.seqActive.Load())
+	s.ctrl.AssertNumberOfCalls(s.T(), "LatestUnsafeBlock", 2)
+	s.ctrl.AssertNumberOfCalls(s.T(), "PostUnsafePayload", 1)
+	s.ctrl.AssertNumberOfCalls(s.T(), "StartSequencer", 1)
+	s.cons.AssertNumberOfCalls(s.T(), "LatestUnsafePayload", 2)
+}
+// In this test, we have a follower that is healthy and not sequencing, we send an unhealthy update to it and expect it to stay as follower and not start sequencing.
+// [follower, healthy, not sequencing] -- become unhealthy --> [follower, not healthy, not sequencing]
+// No mock expectations are registered in this test.
+func (s *OpConductorTestSuite) TestScenario5() {
+	s.enableSynchronization()
+	// set initial state
+	s.conductor.leader.Store(false)
+	s.conductor.healthy.Store(true)
+	s.conductor.seqActive.Store(false)
+	// become unhealthy
+	s.updateStatusAndExecuteAction(s.healthUpdateCh, false)
+	// expect to stay as follower, go to [follower, not healthy, not sequencing]
+	s.False(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+}
+// In this test, we have a leader that is healthy and sequencing, we send a leader update to it and expect it to stop sequencing.
+// [leader, healthy, sequencing] -- step down as leader --> [follower, healthy, not sequencing]
+func (s *OpConductorTestSuite) TestScenario6() {
+	s.enableSynchronization()
+	// set initial state
+	s.conductor.leader.Store(true)
+	s.conductor.healthy.Store(true)
+	s.conductor.seqActive.Store(true)
+	s.ctrl.EXPECT().StopSequencer(mock.Anything).Return(common.Hash{}, nil).Times(1)
+	// step down as leader
+	s.updateStatusAndExecuteAction(s.leaderUpdateCh, false)
+	// expect to become follower and stop sequencing, go to [follower, healthy, not sequencing]
+	s.False(s.conductor.leader.Load())
+	s.True(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	s.ctrl.AssertCalled(s.T(), "StopSequencer", mock.Anything)
+}
+// In this test, we have a leader that is healthy and sequencing, we send an unhealthy update to it and expect it to stop sequencing and transfer leadership.
+// 1. [leader, healthy, sequencing] -- become unhealthy -->
+// 2. [leader, unhealthy, sequencing] -- stop sequencing, transfer leadership --> [follower, unhealthy, not sequencing]
+func (s *OpConductorTestSuite) TestScenario7() {
+	s.enableSynchronization()
+	// set initial state
+	s.conductor.leader.Store(true)
+	s.conductor.healthy.Store(true)
+	s.conductor.seqActive.Store(true)
+	// both mocked calls succeed, so a single action is enough to reach the final state
+	s.cons.EXPECT().TransferLeader().Return(nil).Times(1)
+	s.ctrl.EXPECT().StopSequencer(mock.Anything).Return(common.Hash{}, nil).Times(1)
+	// become unhealthy
+	s.updateStatusAndExecuteAction(s.healthUpdateCh, false)
+	// expect to step down as leader and stop sequencing
+	s.False(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	s.ctrl.AssertCalled(s.T(), "StopSequencer", mock.Anything)
+	s.cons.AssertCalled(s.T(), "TransferLeader")
+}
+// In this test, we have a leader that is healthy and sequencing, we send an unhealthy update to it and expect it to stop sequencing and transfer leadership.
+// However, the action we needed to take failed temporarily, so we expect it to retry until it succeeds.
+// 1. [leader, healthy, sequencing] -- become unhealthy -->
+// 2. [leader, unhealthy, sequencing] -- stop sequencing failed, transfer leadership failed, retry -->
+// 3. [leader, unhealthy, sequencing] -- stop sequencing succeeded, transfer leadership failed, retry -->
+// 4. [leader, unhealthy, not sequencing] -- transfer leadership succeeded -->
+// 5. [follower, unhealthy, not sequencing]
+func (s *OpConductorTestSuite) TestFailureAndRetry1() {
+	s.enableSynchronization()
+	err := errors.New("failure")
+	// set initial state
+	s.conductor.leader.Store(true)
+	s.conductor.healthy.Store(true)
+	s.conductor.seqActive.Store(true)
+	// step 1 & 2: become unhealthy, stop sequencing failed, transfer leadership failed
+	s.cons.EXPECT().TransferLeader().Return(err).Times(1)
+	s.ctrl.EXPECT().StopSequencer(mock.Anything).Return(common.Hash{}, err).Times(1)
+	s.updateStatusAndExecuteAction(s.healthUpdateCh, false)
+	// only the health flag changed; leader and seqActive keep their values because both calls failed
+	s.True(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.True(s.conductor.seqActive.Load())
+	s.ctrl.AssertNumberOfCalls(s.T(), "StopSequencer", 1)
+	s.cons.AssertNumberOfCalls(s.T(), "TransferLeader", 1)
+	// step 3: [leader, unhealthy, sequencing] -- stop sequencing succeeded, transfer leadership failed, retry
+	s.ctrl.EXPECT().StopSequencer(mock.Anything).Return(common.Hash{}, nil).Times(1)
+	s.cons.EXPECT().TransferLeader().Return(err).Times(1)
+	// run the retry queued by the failed action
+	s.executeAction()
+	s.True(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	s.ctrl.AssertNumberOfCalls(s.T(), "StopSequencer", 2)
+	s.cons.AssertNumberOfCalls(s.T(), "TransferLeader", 2)
+	// step 4: [leader, unhealthy, not sequencing] -- transfer leadership succeeded
+	s.cons.EXPECT().TransferLeader().Return(nil).Times(1)
+	s.executeAction()
+	// [follower, unhealthy, not sequencing]
+	s.False(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	// the sequencer was already stopped, so only TransferLeader is called again in this step
+	s.ctrl.AssertNumberOfCalls(s.T(), "StopSequencer", 2)
+	s.cons.AssertNumberOfCalls(s.T(), "TransferLeader", 3)
+}
+// In this test, we have a leader that is healthy and sequencing, we send an unhealthy update to it and expect it to stop sequencing and transfer leadership.
+// However, the action we needed to take failed temporarily, so we expect it to retry until it succeeds.
+// 1. [leader, healthy, sequencing] -- become unhealthy -->
+// 2. [leader, unhealthy, sequencing] -- stop sequencing failed, transfer leadership succeeded, retry -->
+// 3. [follower, unhealthy, sequencing] -- stop sequencing succeeded -->
+// 4. [follower, unhealthy, not sequencing]
+func (s *OpConductorTestSuite) TestFailureAndRetry2() {
+	s.enableSynchronization()
+	err := errors.New("failure")
+	// set initial state
+	s.conductor.leader.Store(true)
+	s.conductor.healthy.Store(true)
+	s.conductor.seqActive.Store(true)
+	// step 1 & 2: become unhealthy, stop sequencing failed, transfer leadership succeeded, retry
+	s.cons.EXPECT().TransferLeader().Return(nil).Times(1)
+	s.ctrl.EXPECT().StopSequencer(mock.Anything).Return(common.Hash{}, err).Times(1)
+	s.updateStatusAndExecuteAction(s.healthUpdateCh, false)
+	// leadership was handed over even though the sequencer could not be stopped
+	s.False(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.True(s.conductor.seqActive.Load())
+	s.ctrl.AssertNumberOfCalls(s.T(), "StopSequencer", 1)
+	s.cons.AssertNumberOfCalls(s.T(), "TransferLeader", 1)
+	// step 3: [follower, unhealthy, sequencing] -- stop sequencing succeeded
+	s.ctrl.EXPECT().StopSequencer(mock.Anything).Return(common.Hash{}, nil).Times(1)
+	s.executeAction()
+	s.False(s.conductor.leader.Load())
+	s.False(s.conductor.healthy.Load())
+	s.False(s.conductor.seqActive.Load())
+	// as a follower only StopSequencer is retried; TransferLeader is not called again
+	s.ctrl.AssertNumberOfCalls(s.T(), "StopSequencer", 2)
+	s.cons.AssertNumberOfCalls(s.T(), "TransferLeader", 1)
+}
 // TestHealthMonitor is the go test entry point that runs OpConductorTestSuite.
 // NOTE(review): despite its name, it runs the conductor suite, not a health monitor test.
 func TestHealthMonitor(t *testing.T) {
 	suite.Run(t, new(OpConductorTestSuite))
 }