Commit 0c0b907b authored by Francis Li and committed by GitHub

[op-conductor] state transition bug fix (#9248)

* Fix state transition bug

* Finish state transition tests

* Fix monitor bugs

* update upon suggestion
parent f0f6aa7c
@@ -68,7 +68,7 @@ func NewOpConductor(
cons: cons,
hmon: hmon,
}
- oc.actionFn = oc.action
oc.loopActionFn = oc.loopAction
// explicitly set all atomic.Bool values
oc.leader.Store(false) // upon start, it should not be the leader unless specified otherwise by raft bootstrap, in that case, it'll receive a leadership update from consensus.
@@ -90,6 +90,7 @@ func NewOpConductor(
}
return nil, err
}
oc.prevState = NewState(oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load())
return oc, nil
}
@@ -252,10 +253,11 @@ type OpConductor struct {
seqActive atomic.Bool
healthy atomic.Bool
hcerr error // error from health check
prevState *state
healthUpdateCh <-chan error
leaderUpdateCh <-chan bool
- actionFn func() // actionFn defines the action to be executed to bring the sequencer to the desired state.
loopActionFn func() // loopActionFn defines the logic to be executed inside control loop.
wg sync.WaitGroup
pauseCh chan struct{}
@@ -271,6 +273,23 @@ type OpConductor struct {
rpcServer *oprpc.Server
}
type state struct {
leader, healthy, active bool
}
// NewState creates a new state instance.
func NewState(leader, healthy, active bool) *state {
return &state{
leader: leader,
healthy: healthy,
active: active,
}
}
func (s *state) Equal(other *state) bool {
return s.leader == other.leader && s.healthy == other.healthy && s.active == other.active
}
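The conductor now snapshots its three control booleans (leader, healthy, active) into this comparable struct so it can detect and log transitions between control-loop iterations. A minimal illustration of how Equal is used, with made-up values rather than anything taken from the commit:

prev := NewState(true, false, false) // leader, unhealthy, sequencer stopped
curr := NewState(true, false, true)  // leader, unhealthy, sequencer started
if !curr.Equal(prev) {
	// a transition happened; the conductor logs prev/new and stores curr as prevState
}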
var _ cliapp.Lifecycle = (*OpConductor)(nil)
// Start implements cliapp.Lifecycle.
@@ -430,29 +449,35 @@ func (oc *OpConductor) loop() {
for {
select {
- // We process status update (health, leadership) first regardless of the paused state.
- // This way we could properly bring the sequencer to the desired state when resumed.
- case healthy := <-oc.healthUpdateCh:
- oc.handleHealthUpdate(healthy)
- case leader := <-oc.leaderUpdateCh:
- oc.handleLeaderUpdate(leader)
- case <-oc.pauseCh:
- oc.paused.Store(true)
- oc.pauseDoneCh <- struct{}{}
- case <-oc.resumeCh:
- oc.paused.Store(false)
- oc.resumeDoneCh <- struct{}{}
- // queue an action to make sure sequencer is in the desired state after resume.
- oc.queueAction()
case <-oc.shutdownCtx.Done():
return
- // Handle control action last, so that when executing the action, we have the latest status and bring the sequencer to the desired state.
default:
- case <-oc.actionCh:
oc.loopActionFn()
- oc.actionFn()
}
}
}
func (oc *OpConductor) loopAction() {
select {
case healthy := <-oc.healthUpdateCh:
oc.handleHealthUpdate(healthy)
case leader := <-oc.leaderUpdateCh:
oc.handleLeaderUpdate(leader)
case <-oc.pauseCh:
oc.paused.Store(true)
oc.pauseDoneCh <- struct{}{}
case <-oc.resumeCh:
oc.paused.Store(false)
oc.resumeDoneCh <- struct{}{}
// queue an action to make sure sequencer is in the desired state after resume.
oc.queueAction()
case <-oc.shutdownCtx.Done():
return
case <-oc.actionCh:
oc.action()
}
}
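The loop body now lives behind the loopActionFn field instead of being inlined, presumably so tests (such as the repeated TestControlLoop run in the script further down) can substitute or wrap the step function. The snippet below is only a sketch of that idea, not code from this commit:

// hypothetical test hook: count loop iterations while still running the real logic
steps := 0
oc.loopActionFn = func() {
	steps++
	oc.loopAction()
}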
func (oc *OpConductor) queueAction() {
select {
case oc.actionCh <- struct{}{}:
@@ -472,6 +497,7 @@ func (oc *OpConductor) handleLeaderUpdate(leader bool) {
// handleHealthUpdate handles health update from health monitor.
func (oc *OpConductor) handleHealthUpdate(hcerr error) {
oc.log.Debug("received health update", "server", oc.cons.ServerID(), "error", hcerr)
healthy := hcerr == nil
if !healthy {
oc.log.Error("Sequencer is unhealthy", "server", oc.cons.ServerID(), "err", hcerr)
@@ -495,8 +521,11 @@ func (oc *OpConductor) action() {
}
var err error
status := NewState(oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load())
oc.log.Debug("entering action with status", "status", status)
// exhaust all cases below for completeness, 3 state, 8 cases.
- switch status := struct{ leader, healthy, active bool }{oc.leader.Load(), oc.healthy.Load(), oc.seqActive.Load()}; {
switch {
case !status.leader && !status.healthy && !status.active:
// if follower is not healthy and not sequencing, just log an error
oc.log.Error("server (follower) is not healthy", "server", oc.cons.ServerID())
@@ -509,9 +538,35 @@ func (oc *OpConductor) action() {
// stop sequencer, this happens when current server steps down as leader.
err = oc.stopSequencer()
case status.leader && !status.healthy && !status.active:
- // transfer leadership to another node
// There are 2 scenarios we need to handle:
// 1. current node is follower, active sequencer became unhealthy and started the leadership transfer process.
// however if leadership transfer took longer than the time for health monitor to treat the node as unhealthy,
// then basically the entire network is stalled and we need to start sequencing in this case.
if !oc.prevState.leader && !oc.prevState.active {
_, _, cerr := oc.compareUnsafeHead(oc.shutdownCtx)
if cerr == nil && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
// if unsafe in consensus is the same as unsafe in op-node, then it is scenario #1 and we should start sequencer.
err = oc.startSequencer()
break
}
}
// 2. for other cases, we should try to transfer leader to another node.
// for example, if follower became a leader and unhealthy at the same time (just unhealthy itself), then we should transfer leadership.
err = oc.transferLeader() err = oc.transferLeader()
case status.leader && !status.healthy && status.active: case status.leader && !status.healthy && status.active:
// There are two scenarios we need to handle here:
// 1. we transitioned from the case status.leader && !status.healthy && !status.active, see description above
// then we should continue to sequence blocks and try to bring ourselves back to healthy state.
// note: we need to also make sure that the health error is not due to ErrSequencerConnectionDown
// because in this case, we should stop sequencing and transfer leadership to other nodes.
if oc.prevState.leader && !oc.prevState.healthy && !oc.prevState.active && !errors.Is(oc.hcerr, health.ErrSequencerConnectionDown) {
err = errors.New("waiting for sequencing to become healthy by itself")
break
}
// 2. we're here because a healthy leader became unhealthy itself
// then we should try to stop sequencing locally and transfer leadership.
var result *multierror.Error
// Try to stop sequencer first, but since sequencer is not healthy, we may not be able to stop it.
// In this case, it's fine to continue to try to transfer leadership to another server. This is safe because
@@ -536,17 +591,25 @@ func (oc *OpConductor) action() {
// normal leader, do nothing
}
oc.log.Debug("exiting action with status and error", "status", status, "err", err)
if err != nil {
oc.log.Error("failed to execute step, queueing another one to retry", "err", err)
// randomly sleep for 0-200ms to avoid excessive retry
time.Sleep(time.Duration(rand.Intn(200)) * time.Millisecond)
oc.queueAction()
return
}
if !status.Equal(oc.prevState) {
oc.log.Info("state changed", "prev_state", oc.prevState, "new_state", status)
oc.prevState = status
}
}
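To make the new prevState guard concrete, here is an illustrative walkthrough of scenario 1 described above; the values are made up, only the branch logic comes from the commit:

// step 1: this node is a plain follower, not sequencing
prev := NewState(false, true, false)
// step 2: the active sequencer stalls; raft hands us leadership, and by now our own
// health check also fails because no new unsafe blocks are arriving anywhere.
curr := NewState(true, false, false)
// action() sees curr, notices prev was a non-active follower, confirms that the unsafe
// head in consensus matches op-node's, and starts sequencing instead of transferring
// leadership around an already stalled cluster.
_ = prev
_ = curr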
// transferLeader tries to transfer leadership to another server.
func (oc *OpConductor) transferLeader() error {
// TransferLeader here will do round robin to try to transfer leadership to the next healthy node.
oc.log.Info("transferring leadership", "server", oc.cons.ServerID())
err := oc.cons.TransferLeader()
if err == nil {
oc.leader.Store(false)
@@ -567,15 +630,20 @@ func (oc *OpConductor) transferLeader() error {
func (oc *OpConductor) stopSequencer() error {
oc.log.Info("stopping sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
- if _, err := oc.ctrl.StopSequencer(context.Background()); err != nil {
_, err := oc.ctrl.StopSequencer(context.Background())
- return errors.Wrap(err, "failed to stop sequencer")
if err != nil {
if strings.Contains(err.Error(), driver.ErrSequencerAlreadyStopped.Error()) {
oc.log.Warn("sequencer already stopped.", "err", err)
} else {
return errors.Wrap(err, "failed to stop sequencer")
}
}
oc.seqActive.Store(false)
return nil
}
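The new tolerance for an already-stopped sequencer relies on string matching rather than errors.Is because the error comes back over JSON-RPC and loses its concrete type, the same caveat the start path notes below. A small illustration of why, using the sentinel added by this commit; the snippet itself is hypothetical:

rpcErr := errors.New("sequencer not running") // what the RPC client reconstructs from the message
fmt.Println(errors.Is(rpcErr, driver.ErrSequencerAlreadyStopped))                        // false: a different error value
fmt.Println(strings.Contains(rpcErr.Error(), driver.ErrSequencerAlreadyStopped.Error())) // true: the text still matches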
func (oc *OpConductor) startSequencer() error {
oc.log.Info("starting sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
ctx := context.Background()
// When starting sequencer, we need to make sure that the current node has the latest unsafe head from the consensus protocol
@@ -592,6 +660,7 @@ func (oc *OpConductor) startSequencer() error {
return err
}
oc.log.Info("starting sequencer", "server", oc.cons.ServerID(), "leader", oc.leader.Load(), "healthy", oc.healthy.Load(), "active", oc.seqActive.Load())
if err = oc.ctrl.StartSequencer(ctx, unsafeInCons.ExecutionPayload.BlockHash); err != nil {
// cannot directly compare using Errors.Is because the error is returned from an JSON RPC server which lost its type.
if !strings.Contains(err.Error(), driver.ErrSequencerAlreadyStarted.Error()) {
@@ -616,6 +685,7 @@ func (oc *OpConductor) compareUnsafeHead(ctx context.Context) (*eth.ExecutionPay
return unsafeInCons, nil, errors.Wrap(err, "failed to get latest unsafe block from EL during compareUnsafeHead phase")
}
oc.log.Debug("comparing unsafe head", "consensus", unsafeInCons.ExecutionPayload.BlockNumber, "node", unsafeInNode.NumberU64())
if unsafeInCons.ExecutionPayload.BlockHash != unsafeInNode.Hash() {
oc.log.Warn(
"latest unsafe block in consensus is not the same as the one in op-node",
@@ -636,6 +706,7 @@ func (oc *OpConductor) updateSequencerActiveStatus() error {
if err != nil {
return errors.Wrap(err, "failed to get sequencer active status")
}
oc.log.Info("sequencer active status updated", "active", active)
oc.seqActive.Store(active)
return nil
}
This diff is collapsed.
@@ -44,6 +44,7 @@ func NewSequencerHealthMonitor(log log.Logger, interval, unsafeInterval, safeInt
unsafeInterval: unsafeInterval,
safeInterval: safeInterval,
minPeerCount: minPeerCount,
timeProviderFn: currentTimeProvicer,
node: node,
p2p: p2p,
}
@@ -64,6 +65,8 @@ type SequencerHealthMonitor struct {
lastSeenUnsafeNum uint64
lastSeenUnsafeTime uint64
timeProviderFn func() uint64
node dial.RollupClientInterface
p2p p2p.API
}
@@ -125,23 +128,17 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
return ErrSequencerConnectionDown
}
- now := uint64(time.Now().Unix())
now := hm.timeProviderFn()
var timeDiff, blockDiff, expectedBlocks uint64
if hm.lastSeenUnsafeNum != 0 {
- diff := now - hm.lastSeenUnsafeTime
timeDiff = now - hm.lastSeenUnsafeTime
blockDiff = status.UnsafeL2.Number - hm.lastSeenUnsafeNum
// how many blocks do we expect to see, minus 1 to account for edge case with respect to time.
// for example, if diff = 2.001s and block time = 2s, expecting to see 1 block could potentially cause sequencer to be considered unhealthy.
- blocks := diff/hm.rollupCfg.BlockTime - 1
expectedBlocks = timeDiff / hm.rollupCfg.BlockTime
- if diff > hm.rollupCfg.BlockTime && blocks > status.UnsafeL2.Number-hm.lastSeenUnsafeNum {
if expectedBlocks > 0 {
- hm.log.Error(
expectedBlocks--
- "unsafe head is not progressing as expected",
- "now", now,
- "unsafe_head_num", status.UnsafeL2.Number,
- "last_seen_unsafe_num", hm.lastSeenUnsafeNum,
- "last_seen_unsafe_time", hm.lastSeenUnsafeTime,
- "unsafe_interval", hm.unsafeInterval,
- )
- return ErrSequencerNotHealthy
}
}
if status.UnsafeL2.Number > hm.lastSeenUnsafeNum {
@@ -149,6 +146,18 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
hm.lastSeenUnsafeTime = now
}
if timeDiff > hm.rollupCfg.BlockTime && expectedBlocks > blockDiff {
hm.log.Error(
"unsafe head is not progressing as expected",
"now", now,
"unsafe_head_num", status.UnsafeL2.Number,
"last_seen_unsafe_num", hm.lastSeenUnsafeNum,
"last_seen_unsafe_time", hm.lastSeenUnsafeTime,
"unsafe_interval", hm.unsafeInterval,
)
return ErrSequencerNotHealthy
}
if now-status.UnsafeL2.Time > hm.unsafeInterval {
hm.log.Error(
"unsafe head is not progressing as expected",
@@ -183,3 +192,7 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
return nil
}
func currentTimeProvicer() uint64 {
return uint64(time.Now().Unix())
}
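A quick worked example of the reworked progress check (illustrative numbers only, not taken from the commit): with a 2-second block time, if 5 seconds have passed since the last new unsafe block was recorded, expectedBlocks is 5/2 = 2, then minus one for the boundary case described in the comment above, i.e. 1; the head is only flagged as stalled when timeDiff exceeds the block time and fewer blocks than that actually arrived.

// sketch of the check with assumed values: blockTime = 2s
timeDiff := uint64(5)          // seconds since lastSeenUnsafeTime
blockDiff := uint64(1)         // unsafe blocks produced in that window
expectedBlocks := timeDiff / 2 // 2
if expectedBlocks > 0 {
	expectedBlocks-- // 1, tolerating the 2.001s-vs-2s edge case from the comment above
}
stalled := timeDiff > 2 && expectedBlocks > blockDiff // false here: 1 > 1 does not hold
_ = stalled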
@@ -26,130 +26,192 @@ const (
type HealthMonitorTestSuite struct {
suite.Suite
log log.Logger
- rc *testutils.MockRollupClient
- pc *p2pMocks.API
- interval uint64
- unsafeInterval uint64
- safeInterval uint64
- minPeerCount uint64
- rollupCfg *rollup.Config
- monitor HealthMonitor
interval uint64
minPeerCount uint64
rollupCfg *rollup.Config
}
func (s *HealthMonitorTestSuite) SetupSuite() {
- s.log = testlog.Logger(s.T(), log.LvlInfo)
s.log = testlog.Logger(s.T(), log.LvlDebug)
- s.rc = &testutils.MockRollupClient{}
- s.pc = &p2pMocks.API{}
s.interval = 1
- s.unsafeInterval = 3
- s.safeInterval = 5
s.minPeerCount = minPeerCount
s.rollupCfg = &rollup.Config{
BlockTime: blockTime,
}
}
- func (s *HealthMonitorTestSuite) SetupTest() {
- s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.unsafeInterval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc)
- err := s.monitor.Start()
- s.NoError(err)
- }
- func (s *HealthMonitorTestSuite) TearDownTest() {
- err := s.monitor.Stop()
- s.NoError(err)
- }
func (s *HealthMonitorTestSuite) SetupMonitor(
now, unsafeInterval, safeInterval uint64,
mockRollupClient *testutils.MockRollupClient,
mockP2P *p2pMocks.API,
) *SequencerHealthMonitor {
tp := &timeProvider{now: now}
if mockP2P == nil {
mockP2P = &p2pMocks.API{}
ps1 := &p2p.PeerStats{
Connected: healthyPeerCount,
}
mockP2P.EXPECT().PeerStats(context.Background()).Return(ps1, nil)
}
monitor := &SequencerHealthMonitor{
log: s.log,
done: make(chan struct{}),
interval: s.interval,
healthUpdateCh: make(chan error),
rollupCfg: s.rollupCfg,
unsafeInterval: unsafeInterval,
safeInterval: safeInterval,
minPeerCount: s.minPeerCount,
timeProviderFn: tp.Now,
node: mockRollupClient,
p2p: mockP2P,
}
err := monitor.Start()
s.NoError(err)
return monitor
} }
func (s *HealthMonitorTestSuite) TestUnhealthyLowPeerCount() {
s.T().Parallel()
now := uint64(time.Now().Unix())
- ss1 := &eth.SyncStatus{
- UnsafeL2: eth.L2BlockRef{
- Time: now - 1,
- },
- SafeL2: eth.L2BlockRef{
- Time: now - 2,
- },
- }
- s.rc.ExpectSyncStatus(ss1, nil)
rc := &testutils.MockRollupClient{}
ss1 := mockSyncStatus(now-1, 1, now-3, 0)
rc.ExpectSyncStatus(ss1, nil)
rc.ExpectSyncStatus(ss1, nil)
pc := &p2pMocks.API{}
ps1 := &p2p.PeerStats{
Connected: unhealthyPeerCount,
}
- s.pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(1)
pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(1)
monitor := s.SetupMonitor(now, 60, 60, rc, pc)
- healthUpdateCh := s.monitor.Subscribe()
healthUpdateCh := monitor.Subscribe()
healthy := <-healthUpdateCh
s.NotNil(healthy)
s.NoError(monitor.Stop())
}
func (s *HealthMonitorTestSuite) TestUnhealthyUnsafeHeadNotProgressing() {
s.T().Parallel()
now := uint64(time.Now().Unix())
- ps1 := &p2p.PeerStats{
- Connected: healthyPeerCount,
- }
- s.pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(3)
- ss1 := &eth.SyncStatus{
- UnsafeL2: eth.L2BlockRef{
- Number: 5,
- Time: now - 1,
- },
- SafeL2: eth.L2BlockRef{
- Number: 1,
- Time: now - 2,
- },
- }
- s.rc.ExpectSyncStatus(ss1, nil)
- s.rc.ExpectSyncStatus(ss1, nil)
- s.rc.ExpectSyncStatus(ss1, nil)
- healthUpdateCh := s.monitor.Subscribe()
- for i := 0; i < 3; i++ {
- if i < 2 {
rc := &testutils.MockRollupClient{}
ss1 := mockSyncStatus(now, 5, now-8, 1)
for i := 0; i < 6; i++ {
rc.ExpectSyncStatus(ss1, nil)
}
monitor := s.SetupMonitor(now, 60, 60, rc, nil)
healthUpdateCh := monitor.Subscribe()
for i := 0; i < 5; i++ {
healthy := <-healthUpdateCh
if i < 4 {
s.Nil(healthy)
s.Equal(now, monitor.lastSeenUnsafeTime)
s.Equal(uint64(5), monitor.lastSeenUnsafeNum)
} else {
s.NotNil(healthy)
}
}
s.NoError(monitor.Stop())
}
func (s *HealthMonitorTestSuite) TestUnhealthySafeHeadNotProgressing() {
s.T().Parallel()
now := uint64(time.Now().Unix())
- ps1 := &p2p.PeerStats{
- Connected: healthyPeerCount,
- }
- s.pc.EXPECT().PeerStats(context.Background()).Return(ps1, nil).Times(6)
- syncStatusGenerator := func(unsafeTime uint64) *eth.SyncStatus {
- return &eth.SyncStatus{
- UnsafeL2: eth.L2BlockRef{
- Time: unsafeTime,
- },
- SafeL2: eth.L2BlockRef{
- Time: now,
- },
- }
- }
- s.rc.ExpectSyncStatus(syncStatusGenerator(now), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+2), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+2), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+4), nil)
- s.rc.ExpectSyncStatus(syncStatusGenerator(now+4), nil)
- healthUpdateCh := s.monitor.Subscribe()
- for i := 0; i < 6; i++ {
- if i < 5 {
rc := &testutils.MockRollupClient{}
rc.ExpectSyncStatus(mockSyncStatus(now, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+2, 2, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+2, 2, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+4, 3, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now+4, 3, now, 1), nil)
monitor := s.SetupMonitor(now, 60, 3, rc, nil)
healthUpdateCh := monitor.Subscribe()
for i := 0; i < 5; i++ {
healthy := <-healthUpdateCh
if i < 4 {
s.Nil(healthy)
} else {
s.NotNil(healthy)
}
}
s.NoError(monitor.Stop())
}
func (s *HealthMonitorTestSuite) TestHealthyWithUnsafeLag() {
s.T().Parallel()
now := uint64(time.Now().Unix())
rc := &testutils.MockRollupClient{}
// although unsafe has lag of 20 seconds, it's within the configured unsafe interval
// and it is advancing every block time, so it should be considered safe.
rc.ExpectSyncStatus(mockSyncStatus(now-10, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now-10, 1, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now-8, 2, now, 1), nil)
rc.ExpectSyncStatus(mockSyncStatus(now-8, 2, now, 1), nil)
monitor := s.SetupMonitor(now, 60, 60, rc, nil)
healthUpdateCh := monitor.Subscribe()
// confirm initial state
s.Zero(monitor.lastSeenUnsafeNum)
s.Zero(monitor.lastSeenUnsafeTime)
// confirm state after first check
healthy := <-healthUpdateCh
s.Nil(healthy)
lastSeenUnsafeTime := monitor.lastSeenUnsafeTime
s.NotZero(monitor.lastSeenUnsafeTime)
s.Equal(uint64(1), monitor.lastSeenUnsafeNum)
healthy = <-healthUpdateCh
s.Nil(healthy)
s.Equal(lastSeenUnsafeTime, monitor.lastSeenUnsafeTime)
s.Equal(uint64(1), monitor.lastSeenUnsafeNum)
healthy = <-healthUpdateCh
s.Nil(healthy)
s.Equal(lastSeenUnsafeTime+2, monitor.lastSeenUnsafeTime)
s.Equal(uint64(2), monitor.lastSeenUnsafeNum)
s.NoError(monitor.Stop())
}
func mockSyncStatus(unsafeTime, unsafeNum, safeTime, safeNum uint64) *eth.SyncStatus {
return &eth.SyncStatus{
UnsafeL2: eth.L2BlockRef{
Time: unsafeTime,
Number: unsafeNum,
},
SafeL2: eth.L2BlockRef{
Time: safeTime,
Number: safeNum,
},
}
} }
func TestHealthMonitor(t *testing.T) {
suite.Run(t, new(HealthMonitorTestSuite))
}
type timeProvider struct {
now uint64
}
func (tp *timeProvider) Now() uint64 {
now := tp.now
tp.now++
return now
}
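The test-only timeProvider acts as a clock that advances by one second per read; injected through timeProviderFn, it makes the interval arithmetic in healthCheck deterministic across repeated runs. For instance, with hypothetical values:

tp := &timeProvider{now: 1000}
_ = tp.Now() // 1000
_ = tp.Now() // 1001: each health check observes time moving forward by exactly one second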
set -e
for i in {1..100}; do
echo "======================="
echo "Running iteration $i"
gotestsum -- -run 'TestControlLoop' ./... --count=1 --timeout=5s -race
if [ $? -ne 0 ]; then
echo "Test failed"
exit 1
fi
done
@@ -22,7 +22,10 @@ import (
"github.com/ethereum-optimism/optimism/op-service/retry"
)
- var ErrSequencerAlreadyStarted = errors.New("sequencer already running")
var (
ErrSequencerAlreadyStarted = errors.New("sequencer already running")
ErrSequencerAlreadyStopped = errors.New("sequencer not running")
)
// Deprecated: use eth.SyncStatus instead.
type SyncStatus = eth.SyncStatus
@@ -429,7 +432,7 @@ func (s *Driver) eventLoop() {
}
case respCh := <-s.stopSequencer:
if s.driverConfig.SequencerStopped {
- respCh <- hashAndError{err: errors.New("sequencer not running")}
respCh <- hashAndError{err: ErrSequencerAlreadyStopped}
} else {
if err := s.sequencerNotifs.SequencerStopped(); err != nil {
respCh <- hashAndError{err: fmt.Errorf("sequencer start notification: %w", err)}
...