Commit 0c0b907b authored by Francis Li and committed by GitHub

[op-conductor] state transition bug fix (#9248)

* Fix state transition bug

* Finish state transition tests

* Fix monitor bugs

* update upon suggestion
parent f0f6aa7c
@@ -68,7 +68,7 @@ func NewOpConductor(
cons: cons,
hmon: hmon,
}
- oc.actionFn = oc.action
oc.loopActionFn = oc.loopAction
// explicitly set all atomic.Bool values
oc.leader.Store(false) // upon start, it should not be the leader unless specified otherwise by raft bootstrap, in that case, it'll receive a leadership update from consensus.
@@ -90,6 +90,7 @@ func NewOpConductor(
}
return nil, err
}
oc.prevState = NewState(oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load())
return oc, nil
}
@@ -252,10 +253,11 @@ type OpConductor struct {
seqActive atomic.Bool
healthy atomic.Bool
hcerr error // error from health check
prevState *state
healthUpdateCh <-chan error
leaderUpdateCh <-chan bool
- actionFn func() // actionFn defines the action to be executed to bring the sequencer to the desired state.
loopActionFn func() // loopActionFn defines the logic to be executed inside control loop.
wg sync.WaitGroup
pauseCh chan struct{}
@@ -271,6 +273,23 @@ type OpConductor struct {
rpcServer *oprpc.Server
}
type state struct {
leader, healthy, active bool
}
// NewState creates a new state instance.
func NewState(leader, healthy, active bool) *state {
return &state{
leader: leader,
healthy: healthy,
active: active,
}
}
func (s *state) Equal(other *state) bool {
return s.leader == other.leader && s.healthy == other.healthy && s.active == other.active
}
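The conductor now snapshots its three control booleans (leader, healthy, active) into this comparable struct so it can detect and log transitions between control-loop iterations. A minimal illustration of how Equal is used, with made-up values rather than anything taken from the commit:

prev := NewState(true, false, false) // leader, unhealthy, sequencer stopped
curr := NewState(true, false, true)  // leader, unhealthy, sequencer started
if !curr.Equal(prev) {
	// a transition happened; the conductor logs prev/new and stores curr as prevState
}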
var _ cliapp.Lifecycle = (*OpConductor)(nil)
// Start implements cliapp.Lifecycle.
@@ -430,29 +449,35 @@ func (oc *OpConductor) loop() {
for {
select {
- // We process status update (health, leadership) first regardless of the paused state.
- // This way we could properly bring the sequencer to the desired state when resumed.
- case healthy := <-oc.healthUpdateCh:
- oc.handleHealthUpdate(healthy)
- case leader := <-oc.leaderUpdateCh:
- oc.handleLeaderUpdate(leader)
- case <-oc.pauseCh:
- oc.paused.Store(true)
- oc.pauseDoneCh <- struct{}{}
- case <-oc.resumeCh:
- oc.paused.Store(false)
- oc.resumeDoneCh <- struct{}{}
- // queue an action to make sure sequencer is in the desired state after resume.
- oc.queueAction()
case <-oc.shutdownCtx.Done():
return
- // Handle control action last, so that when executing the action, we have the latest status and bring the sequencer to the desired state.
default:
- case <-oc.actionCh:
oc.loopActionFn()
- oc.actionFn()
}
}
}
func (oc *OpConductor) loopAction() {
select {
case healthy := <-oc.healthUpdateCh:
oc.handleHealthUpdate(healthy)
case leader := <-oc.leaderUpdateCh:
oc.handleLeaderUpdate(leader)
case <-oc.pauseCh:
oc.paused.Store(true)
oc.pauseDoneCh <- struct{}{}
case <-oc.resumeCh:
oc.paused.Store(false)
oc.resumeDoneCh <- struct{}{}
// queue an action to make sure sequencer is in the desired state after resume.
oc.queueAction()
case <-oc.shutdownCtx.Done():
return
case <-oc.actionCh:
oc.action()
}
}
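The loop body now lives behind the loopActionFn field instead of being inlined, presumably so tests (such as the repeated TestControlLoop run in the script further down) can substitute or wrap the step function. The snippet below is only a sketch of that idea, not code from this commit:

// hypothetical test hook: count loop iterations while still running the real logic
steps := 0
oc.loopActionFn = func() {
	steps++
	oc.loopAction()
}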
func (oc *OpConductor) queueAction() {
select {
case oc.actionCh <- struct{}{}:
@@ -472,6 +497,7 @@ func (oc *OpConductor) handleLeaderUpdate(leader bool) {
// handleHealthUpdate handles health update from health monitor.
func (oc *OpConductor) handleHealthUpdate(hcerr error) {
oc.log.Debug("received health update", "server", oc.cons.ServerID(), "error", hcerr)
healthy := hcerr == nil
if !healthy {
oc.log.Error("Sequencer is unhealthy", "server", oc.cons.ServerID(), "err", hcerr)
@@ -495,8 +521,11 @@ func (oc *OpConductor) action() {
}
var err error
status := NewState(oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load())
oc.log.Debug("entering action with status", "status", status)
// exhaust all cases below for completeness, 3 state, 8 cases.
- switch status := struct{ leader, healthy, active bool }{oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load()}; {
switch {
case !status.leader && !status.healthy && !status.active:
// if follower is not healthy and not sequencing, just log an error
oc.log.Error("server (follower) is not healthy", "server", oc.cons.ServerID())
@@ -509,9 +538,35 @@ func (oc *OpConductor) action() {
// stop sequencer, this happens when current server steps down as leader.
err = oc.stopSequencer()
case status.leader && !status.healthy && !status.active:
- // transfer leadership to another node
// There are 2 scenarios we need to handle:
// 1. current node is follower, active sequencer became unhealthy and started the leadership transfer process.
// however if leadership transfer took longer than the time for health monitor to treat the node as unhealthy,
// then basically the entire network is stalled and we need to start sequencing in this case.
if !oc.prevState.leader && !oc.prevState.active {
_, _, cerr := oc.compareUnsafeHead(oc.shutdownCtx)
if cerr == nil && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
// if unsafe in consensus is the same as unsafe in op-node, then it is scenario #1 and we should start sequencer.
err = oc.startSequencer()
break
}
}
// 2. for other cases, we should try to transfer leader to another node.
// for example, if follower became a leader and unhealthy at the same time (just unhealthy itself), then we should transfer leadership.
err = oc.transferLeader() err = oc.transferLeader()
case status.leader && !status.healthy && status.active: case status.leader && !status.healthy && status.active:
// There are two scenarios we need to handle here:
// 1. we transitioned from the case status.leader && !status.healthy && !status.active, see description above
// then we should continue to sequence blocks and try to bring ourselves back to healthy state.
// note: we need to also make sure that the health error is not due to ErrSequencerConnectionDown
// because in this case, we should stop sequencing and transfer leadership to other nodes.
if oc.prevState.leader && !oc.prevState.healthy && !oc.prevState.active && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
err = errors.New("waiting for sequencing to become healthy by itself")
break
}
// 2. we're here because a healthy leader became unhealthy itself
// then we should try to stop sequencing locally and transfer leadership.
var result *multierror.Error
// Try to stop sequencer first, but since sequencer is not healthy, we may not be able to stop it.
// In this case, it's fine to continue to try to transfer leadership to another server. This is safe because
@@ -536,17 +591,25 @@ func (oc *OpConductor) action() {
// normal leader, do nothing
}
oc.log.Debug("exiting action with status and error", "status", status, "err", err)
if err != nil {
oc.log.Error("failed to execute step, queueing another one to retry", "err", err)
// randomly sleep for 0-200ms to avoid excessive retry
time.Sleep(time.Duration(rand.Intn(200)) * time.Millisecond)
oc.queueAction()
return
}
if !status.Equal(oc.prevState) {
oc.log.Info("state changed", "prev_state", oc.prevState, "new_state", status)
oc.prevState = status
}
}
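To make the new prevState guard concrete, here is an illustrative walkthrough of scenario 1 described above; the values are made up, only the branch logic comes from the commit:

// step 1: this node is a plain follower, not sequencing
prev := NewState(false, true, false)
// step 2: the active sequencer stalls; raft hands us leadership, and by now our own
// health check also fails because no new unsafe blocks are arriving anywhere.
curr := NewState(true, false, false)
// action() sees curr, notices prev was a non-active follower, confirms that the unsafe
// head in consensus matches op-node's, and starts sequencing instead of transferring
// leadership around an already stalled cluster.
_ = prev
_ = curr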
// transferLeader tries to transfer leadership to another server.
func (oc *OpConductor) transferLeader() error {
// TransferLeader here will do round robin to try to transfer leadership to the next healthy node.
oc.log.Info("transferring leadership", "server", oc.cons.ServerID())
err := oc.cons.TransferLeader()
if err == nil {
oc.leader.Store(false)
@@ -567,15 +630,20 @@ func (oc *OpConductor) transferLeader() error {
func (oc *OpConductor) stopSequencer() error {
oc.log.Info("stopping sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
- if _, err := oc.ctrl.StopSequencer(context.Background()); err != nil {
_, err := oc.ctrl.StopSequencer(context.Background())
- return errors.Wrap(err, "failed to stop sequencer")
if err != nil {
if strings.Contains(err.Error(), driver.ErrSequencerAlreadyStopped.Error()) {
oc.log.Warn("sequencer already stopped.", "err", err)
} else {
return errors.Wrap(err, "failed to stop sequencer")
}
}
oc.seqActive.Store(false)
return nil
}
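The new tolerance for an already-stopped sequencer relies on string matching rather than errors.Is because the error comes back over JSON-RPC and loses its concrete type, the same caveat the start path notes below. A small illustration of why, using the sentinel added by this commit; the snippet itself is hypothetical:

rpcErr := errors.New("sequencer not running") // what the RPC client reconstructs from the message
fmt.Println(errors.Is(rpcErr, driver.ErrSequencerAlreadyStopped))                        // false: a different error value
fmt.Println(strings.Contains(rpcErr.Error(), driver.ErrSequencerAlreadyStopped.Error())) // true: the text still matches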
func (oc *OpConductor) startSequencer() error {
oc.log.Info("starting sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
ctx := context.Background()
// When starting sequencer, we need to make sure that the current node has the latest unsafe head from the consensus protocol
@@ -592,6 +660,7 @@ func (oc *OpConductor) startSequencer() error {
return err
}
oc.log.Info("starting sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
if err = oc.ctrl.StartSequencer(ctx, unsafeInCons.ExecutionPayload.BlockHash); err != nil {
// cannot directly compare using Errors.Is because the error is returned from an JSON RPC server which lost its type.
if !strings.Contains(err.Error(), driver.ErrSequencerAlreadyStarted.Error()) {
@@ -616,6 +685,7 @@ func (oc *OpConductor) compareUnsafeHead(ctx context.Context) (*eth.ExecutionPay
return unsafeInCons, nil, errors.Wrap(err, "failed to get latest unsafe block from EL during compareUnsafeHead phase")
}
oc.log.Debug("comparing unsafe head", "consensus", unsafeInCons.ExecutionPayload.BlockNumber, "node", unsafeInNode.NumberU64())
if unsafeInCons.ExecutionPayload.BlockHash != unsafeInNode.Hash() {
oc.log.Warn(
"latest unsafe block in consensus is not the same as the one in op-node",
@@ -636,6 +706,7 @@ func (oc *OpConductor) updateSequencerActiveStatus() error {
if err != nil {
return errors.Wrap(err, "failed to get sequencer active status")
}
oc.log.Info("sequencer active status updated", "active", active)
oc.seqActive.Store(active)
return nil
}
This diff is collapsed.
@@ -44,6 +44,7 @@ func NewSequencerHealthMonitor(log log.Logger, interval, unsafeInterval, safeInt
unsafeInterval: unsafeInterval,
safeInterval: safeInterval,
minPeerCount: minPeerCount,
timeProviderFn: currentTimeProvicer,
node: node,
p2p: p2p,
}
@@ -64,6 +65,8 @@ type SequencerHealthMonitor struct {
lastSeenUnsafeNum uint64
lastSeenUnsafeTime uint64
timeProviderFn func() uint64
node dial.RollupClientInterface
p2p p2p.API
}
@@ -125,23 +128,17 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
return ErrSequencerConnectionDown
}
- now := uint64(time.Now().Unix())
now := hm.timeProviderFn()
var timeDiff, blockDiff, expectedBlocks uint64
if hm.lastSeenUnsafeNum != 0 {
- diff := now - hm.lastSeenUnsafeTime
timeDiff = now - hm.lastSeenUnsafeTime
blockDiff = status.UnsafeL2.Number - hm.lastSeenUnsafeNum
// how many blocks do we expect to see, minus 1 to account for edge case with respect to time.
// for example, if diff = 2.001s and block time = 2s, expecting to see 1 block could potentially cause sequencer to be considered unhealthy.
- blocks := diff/hm.rollupCfg.BlockTime - 1
expectedBlocks = timeDiff / hm.rollupCfg.BlockTime
- if diff > hm.rollupCfg.BlockTime && blocks > status.UnsafeL2.Number-hm.lastSeenUnsafeNum {
if expectedBlocks > 0 {
- hm.log.Error(
expectedBlocks--
- "unsafe head is not progressing as expected",
- "now", now,
- "unsafe_head_num", status.UnsafeL2.Number,
- "last_seen_unsafe_num", hm.lastSeenUnsafeNum,
- "last_seen_unsafe_time", hm.lastSeenUnsafeTime,
- "unsafe_interval", hm.unsafeInterval,
- )
- return ErrSequencerNotHealthy
}
}
if status.UnsafeL2.Number > hm.lastSeenUnsafeNum {
@@ -149,6 +146,18 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
hm.lastSeenUnsafeTime = now
}
if timeDiff > hm.rollupCfg.BlockTime && expectedBlocks > blockDiff {
hm.log.Error(
"unsafe head is not progressing as expected",
"now", now,
"unsafe_head_num", status.UnsafeL2.Number,
"last_seen_unsafe_num", hm.lastSeenUnsafeNum,
"last_seen_unsafe_time", hm.lastSeenUnsafeTime,
"unsafe_interval", hm.unsafeInterval,
)
return ErrSequencerNotHealthy
}
if now-status.UnsafeL2.Time > hm.unsafeInterval {
hm.log.Error(
"unsafe head is not progressing as expected",
@@ -183,3 +192,7 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
return nil
}
func currentTimeProvicer() uint64 {
return uint64(time.Now().Unix())
}
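A quick worked example of the reworked progress check (illustrative numbers only, not taken from the commit): with a 2-second block time, if 5 seconds have passed since the last new unsafe block was recorded, expectedBlocks is 5/2 = 2, then minus one for the boundary case described in the comment above, i.e. 1; the head is only flagged as stalled when timeDiff exceeds the block time and fewer blocks than that actually arrived.

// sketch of the check with assumed values: blockTime = 2s
timeDiff := uint64(5)          // seconds since lastSeenUnsafeTime
blockDiff := uint64(1)         // unsafe blocks produced in that window
expectedBlocks := timeDiff / 2 // 2
if expectedBlocks > 0 {
	expectedBlocks-- // 1, tolerating the 2.001s-vs-2s edge case from the comment above
}
stalled := timeDiff > 2 && expectedBlocks > blockDiff // false here: 1 > 1 does not hold
_ = stalled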
@@ -26,130 +26,192 @@ const (
type HealthMonitorTestSuite struct {
suite.Suite
log log.Logger
- rc *testutils.MockRollupClient
- pc *p2pMocks.API
- interval uint64
- unsafeInterval uint64
- safeInterval uint64
- minPeerCount uint64
- rollupCfg *rollup.Config
- monitor HealthMonitor
interval uint64
minPeerCount uint64
rollupCfg *rollup.Config
}
func (s *HealthMonitorTestSuite) SetupSuite() {
- s.log = testlog.Logger(s.T(), log.LvlInfo)
s.log = testlog.Logger(s.T(), log.LvlDebug)
- s.rc = &testutils.MockRollupClient{}
- s.pc = &p2pMocks.API{}
s.interval = 1
- s.unsafeInterval = 3
- s.safeInterval = 5
s.minPeerCount = minPeerCount
s.rollupCfg = &rollup.Config{
BlockTime: blockTime,
}
}
- func (s *HealthMonitorTestSuite) SetupTest() {
- s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.unsafeInterval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc)
- err := s.monitor.Start()
- s.NoError(err)
- }
- func (s *HealthMonitorTestSuite) TearDownTest() {
- err := s.monitor.Stop()
- s.NoError(err)
- }
func (s *HealthMonitorTestSuite) SetupMonitor(
now, unsafeInterval, safeInterval uint64,
mockRollupClient *testutils.MockRollupClient,
mockP2P *p2pMocks.API,
) *SequencerHealthMonitor {
tp := &timeProvider{now: now}
if mockP2P == nil {
mockP2P = &p2pMocks.API{}
ps1 := &p2p.PeerStats{
Connected: healthyPeerCount,
}
mockP2P.EXPECT().PeerStats(context.Background()).Return(ps1, nil)
}
monitor := &SequencerHealthMonitor{
log: s.log,
done: make(chan struct{}),
interval: s.interval,
healthUpdateCh: make(chan error),
rollupCfg: s.rollupCfg,
unsafeInterval: unsafeInterval,
safeInterval: safeInterval,
minPeerCount: s.minPeerCount,
timeProviderFn: tp.Now,
node: mockRollupClient,
p2p: mockP2P,
}
err := monitor.Start()
s.NoError(err)
return monitor
} }
func (s *HealthMonitorTestSuite) TestUnhealthyLowPeerCount() {
s.T().Parallel()
now := uint64(time.Now().Unix())
- ss1 := &eth.SyncStatus{
- UnsafeL2: eth.L2BlockRef{
- Time: now - 1,
- },
- SafeL2: eth.L2BlockRef{
- Time: now - 2,
- },
- }
- s.rc.ExpectSyncStatus(ss1, nil)
rc := &testutils.MockRollupClient{}
ss1 := mockSyncStatus(now-1, 1, now-3, 0)
rc.ExpectSyncStatus(ss1, nil)
rc.ExpectSyncStatus(ss1, nil)
pc := &p2pMocks.API{}
ps1 := &p2p.PeerStats{
Connected: unhealthyPeerCount,
}
- s.pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(1)
pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(1)
monitor := s.SetupMonitor(now, 60, 60, rc, pc)
- healthUpdateCh := s.monitor.Subscribe()
healthUpdateCh := monitor.Subscribe()
healthy := <-healthUpdateCh
s.NotNil(healthy)
s.NoError(monitor.Stop())
}
func (s *HealthMonitorTestSuite) TestUnhealthyUnsafeHeadNotProgressing() {
s.T().Parallel()
now := uint64(time.Now().Unix())
- ps1 := &p2p.PeerStats{
- Connected: healthyPeerCount,
- }
- s.pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(3)
- ss1 := &eth.SyncStatus{
- UnsafeL2: eth.L2BlockRef{
- Number: 5,
- Time: now - 1,
- },
- SafeL2: eth.L2BlockRef{
- Number: 1,
- Time: now - 2,
- },
- }
- s.rc.ExpectSyncStatus(ss1, nil)
- s.rc.ExpectSyncStatus(ss1, nil)
- s.rc.ExpectSyncStatus(ss1, nil)
- healthUpdateCh := s.monitor.Subscribe()
- for i := 0; i < 3; i++ {
- if i < 2 {
rc := &testutils.MockRollupClient{}
ss1 := mockSyncStatus(now, 5, now-8, 1)
for i := 0; i < 6; i++ {
rc.ExpectSyncStatus(ss1, nil)
}
monitor := s.SetupMonitor(now, 60, 60, rc, nil)
healthUpdateCh := monitor.Subscribe()
for i := 0; i < 5; i++ {
healthy := <-healthUpdateCh
if i < 4 {
s.Nil(healthy)
s.Equal(now, monitor.lastSeenUnsafeTime)
s.Equal(uint64(5), monitor.lastSeenUnsafeNum)
} else {
s.NotNil(healthy)
}
}
s.NoError(monitor.Stop())
}
func (s *HealthMonitorTestSuite) TestUnhealthySafeHeadNotProgressing() {
s.T().Parallel()
now := uint64(time.Now().Unix())
- ps1 := &p2p.PeerStats{
- Connected: healthyPeerCount,
- }
- s.pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(6)
- syncStatusGenerator := func(unsafeTime uint64) *eth.SyncStatus {
- return &eth.SyncStatus{
- UnsafeL2: eth.L2BlockRef{
- Time: unsafeTime,
- },
- SafeL2: eth.L2BlockRef{
- Time: now,
- },
- }
- }
- s.rc.ExpectSyncStatus(syncStatusGenerator(now), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+2), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+2), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+4), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+4), nil)
- healthUpdateCh := s.monitor.Subscribe()
- for i := 0; i < 6; i++ {
- if i < 5 {
rc := &testutils.MockRollupClient{}
rc.ExpectSyncStatus(mockSyncStatus(now, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+2, 2, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+2, 2, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+4, 3, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+4, 3, now, 1), nil)
monitor := s.SetupMonitor(now, 60, 3, rc, nil)
healthUpdateCh := monitor.Subscribe()
for i := 0; i < 5; i++ {
healthy := <-healthUpdateCh
if i < 4 {
s.Nil(healthy)
} else {
s.NotNil(healthy)
}
}
s.NoError(monitor.Stop())
}
func (s *HealthMonitorTestSuite) TestHealthyWithUnsafeLag() {
s.T().Parallel()
now := uint64(time.Now().Unix())
rc := &testutils.MockRollupClient{}
// although unsafe has lag of 20 seconds, it's within the configured unsafe interval
// and it is advancing every block time, so it should be considered safe.
rc.ExpectSyncStatus(mockSyncStatus(now-10, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now-10, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now-8, 2, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now-8, 2, now, 1), nil)
monitor := s.SetupMonitor(now, 60, 60, rc, nil)
healthUpdateCh := monitor.Subscribe()
// confirm initial state
s.Zero(monitor.lastSeenUnsafeNum)
s.Zero(monitor.lastSeenUnsafeTime)
// confirm state after first check
healthy := <-healthUpdateCh
s.Nil(healthy)
lastSeenUnsafeTime := monitor.lastSeenUnsafeTime
s.NotZero(monitor.lastSeenUnsafeTime)
s.Equal(uint64(1), monitor.lastSeenUnsafeNum)
healthy = <-healthUpdateCh
s.Nil(healthy)
s.Equal(lastSeenUnsafeTime, monitor.lastSeenUnsafeTime)
s.Equal(uint64(1), monitor.lastSeenUnsafeNum)
healthy = <-healthUpdateCh
s.Nil(healthy)
s.Equal(lastSeenUnsafeTime+2, monitor.lastSeenUnsafeTime)
s.Equal(uint64(2), monitor.lastSeenUnsafeNum)
s.NoError(monitor.Stop())
}
func mockSyncStatus(unsafeTime, unsafeNum, safeTime, safeNum uint64) *eth.SyncStatus {
return &eth.SyncStatus{
UnsafeL2: eth.L2BlockRef{
Time: unsafeTime,
Number: unsafeNum,
},
SafeL2: eth.L2BlockRef{
Time: safeTime,
Number: safeNum,
},
}
} }
func TestHealthMonitor(t *testing.T) {
suite.Run(t, new(HealthMonitorTestSuite))
}
type timeProvider struct {
now uint64
}
func (tp *timeProvider) Now() uint64 {
now := tp.now
tp.now++
return now
}
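The test-only timeProvider acts as a clock that advances by one second per read; injected through timeProviderFn, it makes the interval arithmetic in healthCheck deterministic across repeated runs. For instance, with hypothetical values:

tp := &timeProvider{now: 1000}
_ = tp.Now() // 1000
_ = tp.Now() // 1001: each health check observes time moving forward by exactly one second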
set -e
for i in {1..100}; do
echo "======================="
echo "Running iteration $i"
gotestsum -- -run 'TestControlLoop' ./... --count=1 --timeout=5s -race
if [ $? -ne 0 ]; then
echo "Test failed"
exit 1
fi
done
@@ -22,7 +22,10 @@ import (
"github.com/ethereum-optimism/optimism/op-service/retry"
)
- var ErrSequencerAlreadyStarted = errors.New("sequencer already running")
var (
ErrSequencerAlreadyStarted = errors.New("sequencer already running")
ErrSequencerAlreadyStopped = errors.New("sequencer not running")
)
// Deprecated: use eth.SyncStatus instead.
type SyncStatus = eth.SyncStatus
@@ -429,7 +432,7 @@ func (s *Driver) eventLoop() {
}
case respCh := <-s.stopSequencer:
if s.driverConfig.SequencerStopped {
- respCh <- hashAndError{err: errors.New("sequencer not running")}
respCh <- hashAndError{err: ErrSequencerAlreadyStopped}
} else {
if err := s.sequencerNotifs.SequencerStopped(); err != nil {
respCh <- hashAndError{err: fmt.Errorf("sequencer start notification: %w", err)}
...