op-node: handle sequencer errors with derivation-levels, reset when L1 origins...

op-node: handle sequencer errors with derivation-levels, reset when L1 origins are inconsistent, test reset handling

op-node: handle sequencer errors with derivation-levels, reset when L1 origins...
op-node: handle sequencer errors with derivation-levels, reset when L1 origins are inconsistent, test reset handling
61c0b14e · protolambda · a340d593 · 61c0b14e · 61c0b14e · 61c0b14e
Commit 61c0b14e authored Feb 21, 2023 by protolambda
8 changed files
--- a/op-e2e/actions/l2_sequencer.go
+++ b/op-e2e/actions/l2_sequencer.go
@@ -2,6 +2,7 @@ package actions
 import (
 	"context"
+	"errors"
 	"github.com/ethereum/go-ethereum/log"
 	"github.com/stretchr/testify/require"
@@ -54,6 +55,10 @@ func NewL2Sequencer(t Testing, log log.Logger, l1 derive.L1Fetcher, eng L2API, c
 // ActL2StartBlock starts building of a new L2 block on top of the head
 func (s *L2Sequencer) ActL2StartBlock(t Testing) {
+	s.ActL2StartBlockCheckErr(t, nil)
+}
+func (s *L2Sequencer) ActL2StartBlockCheckErr(t Testing, checkErr error) {
 	if !s.l2PipelineIdle {
 		t.InvalidAction("cannot start L2 build when derivation is not idle")
 		return
@@ -64,9 +69,19 @@ func (s *L2Sequencer) ActL2StartBlock(t Testing) {
 	}
 	err := s.sequencer.StartBuildingBlock(t.Ctx())
+	if checkErr == nil {
 		require.NoError(t, err, "failed to start block building")
+	} else {
+		require.ErrorIs(t, err, checkErr, "expected typed error")
+	}
+	if errors.Is(err, derive.ErrReset) {
+		s.derivation.Reset()
+	}
+	if err == nil {
 		s.l2Building = true
+	}
 }
 // ActL2EndBlock completes a new L2 block and applies it to the L2 chain as new canonical unsafe head
@@ -103,7 +118,16 @@ func (s *L2Sequencer) ActBuildToL1Head(t Testing) {
 	}
 }
-// ActBuildToL1HeadExcl builds empty blocks until (excl.) the L1 head becomes the L2 origin
+// ActBuildToL1HeadUnsafe builds empty blocks until (incl.) the L1 head becomes the L1 origin of the L2 head
+func (s *L2Sequencer) ActBuildToL1HeadUnsafe(t Testing) {
+	for s.derivation.UnsafeL2Head().L1Origin.Number < s.l1State.L1Head().Number {
+		// Note: the
+		s.ActL2StartBlock(t)
+		s.ActL2EndBlock(t)
+	}
+}
+// ActBuildToL1HeadExcl builds empty blocks until (excl.) the L1 head becomes the L1 origin of the L2 head
 func (s *L2Sequencer) ActBuildToL1HeadExcl(t Testing) {
 	for {
 		s.ActL2PipelineFull(t)
@@ -116,3 +140,16 @@ func (s *L2Sequencer) ActBuildToL1HeadExcl(t Testing) {
 		s.ActL2EndBlock(t)
 	}
 }
+// ActBuildToL1HeadExclUnsafe builds empty blocks until (excl.) the L1 head becomes the L1 origin of the L2 head, without safe-head progression.
+func (s *L2Sequencer) ActBuildToL1HeadExclUnsafe(t Testing) {
+	for {
+		nextOrigin, err := s.mockL1OriginSelector.FindL1Origin(t.Ctx(), s.derivation.UnsafeL2Head())
+		require.NoError(t, err)
+		if nextOrigin.Number >= s.l1State.L1Head().Number {
+			break
+		}
+		s.ActL2StartBlock(t)
+		s.ActL2EndBlock(t)
+	}
+}
--- a/op-e2e/actions/l2_sequencer_test.go
+++ b/op-e2e/actions/l2_sequencer_test.go
@@ -4,6 +4,8 @@ import (
 	"math/big"
 	"testing"
+	"github.com/ethereum-optimism/optimism/op-node/rollup/derive"
 	"github.com/ethereum/go-ethereum/core/types"
 	"github.com/ethereum/go-ethereum/params"
@@ -100,3 +102,72 @@ func TestL2Sequencer_SequencerDrift(gt *testing.T) {
 	sequencer.ActL2StartBlock(t)
 	require.True(t, engine.l2ForceEmpty, "engine should not be allowed to include anything after sequencer drift is surpassed")
 }
+// TestL2Sequencer_SequencerOnlyReorg regression-tests a Goerli halt where the sequencer
+// would build an unsafe L2 block with a L1 origin that then gets reorged out,
+// while the verifier-codepath only ever sees the valid post-reorg L1 chain.
+func TestL2Sequencer_SequencerOnlyReorg(gt *testing.T) {
+	t := NewDefaultTesting(gt)
+	p := &e2eutils.TestParams{
+		MaxSequencerDrift:   20, // larger than L1 block time we simulate in this test (12)
+		SequencerWindowSize: 24,
+		ChannelTimeout:      20,
+	}
+	dp := e2eutils.MakeDeployParams(t, p)
+	sd := e2eutils.Setup(t, dp, defaultAlloc)
+	log := testlog.Logger(t, log.LvlDebug)
+	miner, _, sequencer := setupSequencerTest(t, sd, log)
+	// Sequencer at first only recognizes the genesis as safe.
+	// The rest of the L1 chain will be incorporated as L1 origins into unsafe L2 blocks.
+	sequencer.ActL2PipelineFull(t)
+	// build L1 block with coinbase A
+	miner.ActL1SetFeeRecipient(common.Address{'A'})
+	miner.ActEmptyBlock(t)
+	// sequencer builds L2 blocks, until (incl.) it creates a L2 block with a L1 origin that has A as coinbase address
+	sequencer.ActL1HeadSignal(t)
+	sequencer.ActBuildToL1HeadUnsafe(t)
+	status := sequencer.SyncStatus()
+	require.Zero(t, status.SafeL2.L1Origin.Number, "no safe head progress")
+	require.Equal(t, status.HeadL1.Hash, status.UnsafeL2.L1Origin.Hash, "have head L1 origin")
+	// reorg out block with coinbase A, and make a block with coinbase B
+	miner.ActL1RewindToParent(t)
+	miner.ActL1SetFeeRecipient(common.Address{'B'})
+	miner.ActEmptyBlock(t)
+	// and a second block, for derivation to pick up on the new L1 chain
+	// (height is used as heuristic to not flip-flop between chains too frequently)
+	miner.ActEmptyBlock(t)
+	// Make the sequencer aware of the new head, and try to sync it.
+	// Since the safe chain never incorporated the now reorged L1 block with coinbase A,
+	// it will sync the new L1 chain fine.
+	// No batches are submitted yet however,
+	// so it'll keep the L2 block with the old L1 origin, since no conflict is detected.
+	sequencer.ActL1HeadSignal(t)
+	sequencer.ActL2PipelineFull(t)
+	// TODO: CLI-3405 we can detect the inconsistency of the L1 origin of the unsafe L2 head:
+	//  as verifier, there is no need to wait for sequencer to recognize it.
+	newStatus := sequencer.SyncStatus()
+	require.Equal(t, status.HeadL1.Hash, newStatus.UnsafeL2.L1Origin.Hash, "still have old bad L1 origin")
+	require.NotEqual(t, status.HeadL1.Hash, newStatus.HeadL1.Hash, "did see the new L1 head change")
+	require.Equal(t, newStatus.HeadL1.Hash, newStatus.CurrentL1.Hash, "did sync the new L1 head as verifier")
+	// the block N+1 cannot build on the old N which still refers to the now orphaned L1 origin
+	require.Equal(t, status.UnsafeL2.L1Origin.Number, newStatus.HeadL1.Number-1, "seeing N+1 to attempt to build on N")
+	require.NotEqual(t, status.UnsafeL2.L1Origin.Hash, newStatus.HeadL1.ParentHash, "but N+1 cannot fit on N")
+	sequencer.ActL1HeadSignal(t)
+	// sequence more L2 blocks, until we actually need the next L1 origin
+	sequencer.ActBuildToL1HeadExclUnsafe(t)
+	// We expect block building to fail when the next L1 block is not consistent with the existing L1 origin
+	sequencer.ActL2StartBlockCheckErr(t, derive.ErrReset)
+	// After hitting a reset error, it reset derivation, and drops the old L1 chain
+	sequencer.ActL2PipelineFull(t)
+	require.Zero(t, sequencer.SyncStatus().UnsafeL2.L1Origin.Number, "back to genesis block with good L1 origin, drop old unsafe L2 chain with bad L1 origins")
+	// Can build new L2 blocks with good L1 origin
+	sequencer.ActBuildToL1HeadUnsafe(t)
+	require.Equal(t, newStatus.HeadL1.Hash, sequencer.SyncStatus().UnsafeL2.L1Origin.Hash, "build L2 chain with new correct L1 origins")
+}
--- a/op-node/rollup/derive/pipeline.go
+++ b/op-node/rollup/derive/pipeline.go
@@ -25,6 +25,11 @@ type L1Fetcher interface {
 	L1TransactionFetcher
 }
+type ResettableEngineControl interface {
+	EngineControl
+	Reset()
+}
 type ResetableStage interface {
 	// Reset resets a pull stage. `base` refers to the L1 Block Reference to reset to, with corresponding configuration.
 	Reset(ctx context.Context, base eth.L1BlockRef, baseCfg eth.SystemConfig) error

--- a/op-node/rollup/driver/driver.go
+++ b/op-node/rollup/driver/driver.go
@@ -69,7 +69,7 @@ type SequencerIface interface {
 	StartBuildingBlock(ctx context.Context) error
 	CompleteBuildingBlock(ctx context.Context) (*eth.ExecutionPayload, error)
 	PlanNextSequencerAction() time.Duration
-	RunNextSequencerAction(ctx context.Context) *eth.ExecutionPayload
+	RunNextSequencerAction(ctx context.Context) (*eth.ExecutionPayload, error)
 	BuildingOnto() eth.L2BlockRef
 }
@@ -93,7 +93,6 @@ func NewDriver(driverCfg *Config, cfg *rollup.Config, l2 L2Chain, l1 L1Chain, ne
 	return &Driver{
 		l1State:          l1State,
 		derivation:       derivationPipeline,
-		idleDerivation:   false,
 		stateReq:         make(chan chan struct{}),
 		forceReset:       make(chan chan struct{}, 10),
 		startSequencer:   make(chan hashAndErrorChannel, 10),

--- a/op-node/rollup/driver/metered_engine.go
+++ b/op-node/rollup/driver/metered_engine.go
@@ -21,7 +21,7 @@ type EngineMetrics interface {
 // MeteredEngine wraps an EngineControl and adds metrics such as block building time diff and sealing time
 type MeteredEngine struct {
-	inner derive.EngineControl
+	inner derive.ResettableEngineControl
 	cfg     *rollup.Config
 	metrics EngineMetrics
@@ -31,9 +31,9 @@ type MeteredEngine struct {
 }
 // MeteredEngine implements derive.EngineControl
-var _ derive.EngineControl = (*MeteredEngine)(nil)
+var _ derive.ResettableEngineControl = (*MeteredEngine)(nil)
-func NewMeteredEngine(cfg *rollup.Config, inner derive.EngineControl, metrics EngineMetrics, log log.Logger) *MeteredEngine {
+func NewMeteredEngine(cfg *rollup.Config, inner derive.ResettableEngineControl, metrics EngineMetrics, log log.Logger) *MeteredEngine {
 	return &MeteredEngine{
 		inner:   inner,
 		cfg:     cfg,
@@ -93,3 +93,7 @@ func (m *MeteredEngine) CancelPayload(ctx context.Context, force bool) error {
 func (m *MeteredEngine) BuildingPayload() (onto eth.L2BlockRef, id eth.PayloadID, safe bool) {
 	return m.inner.BuildingPayload()
 }
+func (m *MeteredEngine) Reset() {
+	m.inner.Reset()
+}
--- a/op-node/rollup/driver/sequencer.go
+++ b/op-node/rollup/driver/sequencer.go
@@ -2,6 +2,7 @@ package driver
 import (
 	"context"
+	"errors"
 	"fmt"
 	"time"
@@ -28,7 +29,7 @@ type Sequencer struct {
 	log    log.Logger
 	config *rollup.Config
-	engine derive.EngineControl
+	engine derive.ResettableEngineControl
 	attrBuilder      derive.AttributesBuilder
 	l1OriginSelector L1OriginSelectorIface
@@ -39,7 +40,7 @@ type Sequencer struct {
 	nextAction time.Time
 }
-func NewSequencer(log log.Logger, cfg *rollup.Config, engine derive.EngineControl, attributesBuilder derive.AttributesBuilder, l1OriginSelector L1OriginSelectorIface) *Sequencer {
+func NewSequencer(log log.Logger, cfg *rollup.Config, engine derive.ResettableEngineControl, attributesBuilder derive.AttributesBuilder, l1OriginSelector L1OriginSelectorIface) *Sequencer {
 	return &Sequencer{
 		log:              log,
 		config:           cfg,
@@ -62,7 +63,7 @@ func (d *Sequencer) StartBuildingBlock(ctx context.Context) error {
 	}
 	if !(l2Head.L1Origin.Hash == l1Origin.ParentHash || l2Head.L1Origin.Hash == l1Origin.Hash) {
-		return fmt.Errorf("cannot build new L2 block with L1 origin %s (parent L1 %s) on current L2 head %s with L1 origin %s", l1Origin, l1Origin.ParentHash, l2Head, l2Head.L1Origin)
+		return derive.NewResetError(fmt.Errorf("cannot build new L2 block with L1 origin %s (parent L1 %s) on current L2 head %s with L1 origin %s", l1Origin, l1Origin.ParentHash, l2Head, l2Head.L1Origin))
 	}
 	d.log.Info("creating new block", "parent", l2Head, "l1Origin", l1Origin)
@@ -159,29 +160,55 @@ func (d *Sequencer) BuildingOnto() eth.L2BlockRef {
 // RunNextSequencerAction starts new block building work, or seals existing work,
 // and is best timed by first awaiting the delay returned by PlanNextSequencerAction.
 // If a new block is successfully sealed, it will be returned for publishing, nil otherwise.
-func (d *Sequencer) RunNextSequencerAction(ctx context.Context) *eth.ExecutionPayload {
+// Only critical errors are bubbled up, other errors are handled internally.
+func (d *Sequencer) RunNextSequencerAction(ctx context.Context) (*eth.ExecutionPayload, error) {
 	if _, buildingID, _ := d.engine.BuildingPayload(); buildingID != (eth.PayloadID{}) {
 		payload, err := d.CompleteBuildingBlock(ctx)
 		if err != nil {
-			d.log.Error("sequencer failed to seal new block", "err", err)
+			if errors.Is(err, derive.ErrCritical) {
+				return nil, err // bubble up critical errors.
+			} else if errors.Is(err, derive.ErrReset) {
+				d.log.Error("sequencer failed to seal new block, requiring derivation reset", "err", err)
+				d.nextAction = d.timeNow().Add(time.Second * time.Duration(d.config.BlockTime)) // hold off from sequencing for a full block
+				if buildingID != (eth.PayloadID{}) {                                            // cancel what we were doing
+					d.CancelBuildingBlock(ctx)
+				}
+			} else if errors.Is(err, derive.ErrTemporary) {
+				d.log.Error("sequencer failed temporarily to seal new block", "err", err)
 				d.nextAction = d.timeNow().Add(time.Second)
+				// We don't explicitly cancel block building jobs upon temporary errors: we may still finish the block.
+				// Any unfinished block building work eventually times out, and will be cleaned up that way.
+			} else {
+				d.log.Error("sequencer failed to seal block with unclassified error", "err", err)
 				if buildingID != (eth.PayloadID{}) { // don't keep stale block building jobs around, try to cancel them
 					d.CancelBuildingBlock(ctx)
 				}
-			return nil
+			}
+			return nil, nil
 		} else {
 			d.log.Info("sequencer successfully built a new block", "block", payload.ID(), "time", uint64(payload.Timestamp), "txs", len(payload.Transactions))
-			return payload
+			return payload, nil
 		}
 	} else {
 		err := d.StartBuildingBlock(ctx)
 		if err != nil {
-			d.log.Error("sequencer failed to start building new block", "err", err)
+			if errors.Is(err, derive.ErrCritical) {
+				return nil, err
+			} else if errors.Is(err, derive.ErrReset) {
+				d.log.Error("sequencer failed to seal new block, requiring derivation reset", "err", err)
+				d.nextAction = d.timeNow().Add(time.Second * time.Duration(d.config.BlockTime)) // hold off from sequencing for a full block
+				d.engine.Reset()
+			} else if errors.Is(err, derive.ErrTemporary) {
+				d.log.Error("sequencer temporarily failed to start building new block", "err", err)
 				d.nextAction = d.timeNow().Add(time.Second)
+			} else {
+				d.log.Error("sequencer failed to start building new block with unclassified error", "err", err)
+				d.nextAction = d.timeNow().Add(time.Second)
+			}
 		} else {
 			parent, buildingID, _ := d.engine.BuildingPayload() // we should have a new payload ID now that we're building a block
 			d.log.Info("sequencer started building new block", "payload_id", buildingID, "l2_parent_block", parent, "l2_parent_block_time", parent.Time)
 		}
-		return nil
+		return nil, nil
 	}
 }
--- a/op-node/rollup/driver/sequencer_test.go
+++ b/op-node/rollup/driver/sequencer_test.go
@@ -23,6 +23,8 @@ import (
 	"github.com/ethereum-optimism/optimism/op-node/testutils"
 )
+var mockResetErr = fmt.Errorf("mock reset err: %w", derive.ErrReset)
 type FakeEngineControl struct {
 	finalized eth.L2BlockRef
 	safe      eth.L2BlockRef
@@ -122,7 +124,11 @@ func (m *FakeEngineControl) resetBuildingState() {
 	m.buildingAttrs = nil
 }
-var _ derive.EngineControl = (*FakeEngineControl)(nil)
+func (m *FakeEngineControl) Reset() {
+	m.err = nil
+}
+var _ derive.ResettableEngineControl = (*FakeEngineControl)(nil)
 type testAttrBuilderFn func(ctx context.Context, l2Parent eth.L2BlockRef, epoch eth.BlockID) (attrs *eth.PayloadAttributes, err error)
@@ -319,25 +325,30 @@ func TestSequencerChaosMonkey(t *testing.T) {
 		// reset errors
 		originErr = nil
 		attrsErr = nil
+		if engControl.err != mockResetErr { // the mockResetErr requires the sequencer to Reset() to recover.
 			engControl.err = nil
+		}
 		engControl.errTyp = derive.BlockInsertOK
 		// maybe make something maybe fail, or try a new L1 origin
-		switch rng.Intn(10) { // 40% chance to fail sequencer action (!!!)
+		switch rng.Intn(20) { // 9/20 = 45% chance to fail sequencer action (!!!)
-		case 0:
+		case 0, 1:
 			originErr = errors.New("mock origin error")
-		case 1:
+		case 2, 3:
 			attrsErr = errors.New("mock attributes error")
-		case 2:
+		case 4, 5:
 			engControl.err = errors.New("mock temporary engine error")
 			engControl.errTyp = derive.BlockInsertTemporaryErr
-		case 3:
+		case 6, 7:
 			engControl.err = errors.New("mock prestate engine error")
 			engControl.errTyp = derive.BlockInsertPrestateErr
+		case 8:
+			engControl.err = mockResetErr
 		default:
 			// no error
 		}
-		payload := seq.RunNextSequencerAction(context.Background())
+		payload, err := seq.RunNextSequencerAction(context.Background())
+		require.NoError(t, err)
 		if payload != nil {
 			require.Equal(t, engControl.UnsafeL2Head().ID(), payload.ID(), "head must stay in sync with emitted payloads")
 			var tx types.Transaction

--- a/op-node/rollup/driver/state.go
+++ b/op-node/rollup/driver/state.go
@@ -32,9 +32,6 @@ type Driver struct {
 	// The derivation pipeline determines the new l2Safe.
 	derivation DerivationPipeline
-	// When the derivation pipeline is waiting for new data to do anything
-	idleDerivation bool
 	// Requests to block the event loop for synchronous execution to avoid reading an inconsistent state
 	stateReq chan chan struct{}
@@ -212,7 +209,11 @@ func (s *Driver) eventLoop() {
 		select {
 		case <-sequencerCh:
-			payload := s.sequencer.RunNextSequencerAction(ctx)
+			payload, err := s.sequencer.RunNextSequencerAction(ctx)
+			if err != nil {
+				s.log.Error("Sequencer critical error", "err", err)
+				return
+			}
 			if s.network != nil && payload != nil {
 				// Publishing of unsafe data via p2p is optional.
 				// Errors are not severe enough to change/halt sequencing but should be logged and metered.
@@ -244,13 +245,11 @@ func (s *Driver) eventLoop() {
 			step()
 		case <-stepReqCh:
 			s.metrics.SetDerivationIdle(false)
-			s.idleDerivation = false
 			s.log.Debug("Derivation process step", "onto_origin", s.derivation.Origin(), "attempts", stepAttempts)
 			err := s.derivation.Step(context.Background())
 			stepAttempts += 1 // count as attempt by default. We reset to 0 if we are making healthy progress.
 			if err == io.EOF {
 				s.log.Debug("Derivation process went idle", "progress", s.derivation.Origin())
-				s.idleDerivation = true
 				stepAttempts = 0
 				s.metrics.SetDerivationIdle(true)
 				continue