Commit 743e782b authored by George Knee, committed by GitHub

op-batcher: lock state mutex while computing sync actions (#13330)

* lock state mutex while computing sync actions

* rename batcher.state to batcher.channelManager

Closes https://github.com/ethereum-optimism/optimism/issues/13280

* move channel manager mutex up into driver

encapsulate imperative code and mutex locking/unlocking into new method executeSyncActions
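
A condensed, self-contained sketch of that encapsulation (the method ends up named `syncAndPrune` in the final diff below). The `driver`, `channelManager` and `blockRange` types here are simplified stand-ins for the real op-batcher types; the point is only that the lock/unlock pair and all state mutation live in a single method, while the returned work is performed by the caller without holding the lock.

```go
package main

import (
	"fmt"
	"sync"
)

// Illustrative stand-ins for the real op-batcher types.
type blockRange struct{ start, end uint64 }

type channelManager struct{ pendingBlocks int }

type driver struct {
	mu     sync.Mutex // guards chMgr and prevL1
	chMgr  *channelManager
	prevL1 uint64
}

// syncAndPrune keeps the lock/unlock pair and all state mutation in one
// method, and returns the follow-up work (blocks to load) so the caller
// can do slow I/O without holding the lock.
func (d *driver) syncAndPrune(currentL1 uint64) *blockRange {
	d.mu.Lock()
	defer d.mu.Unlock()

	if currentL1 < d.prevL1 { // stand-in for the "sequencer out of sync" check
		return nil // do nothing and wait for the next tick
	}
	d.prevL1 = currentL1
	if d.chMgr.pendingBlocks > 0 { // stand-in for pruning safe blocks/channels
		d.chMgr.pendingBlocks--
	}
	return &blockRange{start: currentL1 + 1, end: currentL1 + 6}
}

func main() {
	d := &driver{chMgr: &channelManager{pendingBlocks: 3}}
	if r := d.syncAndPrune(10); r != nil {
		fmt.Printf("load blocks %d..%d outside the lock\n", r.start, r.end)
	}
}
```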

* don't use pointer to mutex

* rename

* throttlingLoop uses channelMgrMutex

* unblock throttling loop by sending pendingBytes on a channel
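
A minimal runnable sketch of that non-blocking hand-off, assuming an unbuffered channel like the driver's `pendingBytesUpdated`; the byte values and sleeps are demo-only.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	pendingBytesUpdated := make(chan int64) // unbuffered, as in the driver

	// Throttling-loop side: consumes updates whenever it is ready.
	go func() {
		for pb := range pendingBytesUpdated {
			fmt.Println("throttling loop saw pending bytes:", pb)
			time.Sleep(50 * time.Millisecond) // pretend recomputing throttle params is slow
		}
	}()
	time.Sleep(10 * time.Millisecond) // demo only: let the receiver start

	// Main-loop side: the send must never stall block loading, so it is a
	// non-blocking select; if the throttling loop is busy the update is dropped.
	for pb := int64(1000); pb <= 5000; pb += 1000 {
		select {
		case pendingBytesUpdated <- pb:
		default: // receiver busy; skip this update rather than block
		}
		time.Sleep(20 * time.Millisecond)
	}
	time.Sleep(100 * time.Millisecond)
}
```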

* disable proposer in 4844 system test

it mostly adds noise to the logs

* remove unused ticker

* Revert "remove unused ticker"

This reverts commit b2e9762c5e6e92e8b62771439d864d9985dcc943.

* reinstate throttle on tick

use cached value
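
A runnable sketch of the resulting select loop, under the assumption of a single updates channel carrying pending-byte counts: fresh values are cached when they arrive, and plain ticker ticks reuse the cached value instead of reading shared state under the mutex. The interval and values are illustrative.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// throttlingLoop sketches the tick-plus-update pattern: fresh values arrive
// on the updates channel, and ticker ticks reuse the last cached value
// rather than touching the channel manager (which would need its mutex).
func throttlingLoop(ctx context.Context, updates <-chan int64) {
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	applyThrottle := func(pendingBytes int64) {
		fmt.Println("recomputing throttle for pending bytes:", pendingBytes)
	}

	cached := int64(0)
	for {
		select {
		case pb := <-updates:
			cached = pb // remember the latest value...
			applyThrottle(pb)
		case <-ticker.C:
			applyThrottle(cached) // ...and reuse it on plain ticks
		case <-ctx.Done():
			return
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()
	updates := make(chan int64, 1)
	updates <- 4096
	throttlingLoop(ctx, updates)
}
```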

* make the main loop release the chMgr lock more often

* simplify

* add some more mutex calls

* simplify some more

* push mutex locking down into l.publishTxToL1

* do not signal the throttling loop inside publishTxToL1

This is a change in behaviour that is outside the scope of this PR.

* prefer deferring mutex unlock
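
A small sketch contrasting the two locking styles visible in the diff below: a deferred unlock where the whole method body is the critical section (e.g. `Clear`), and an explicit unlock where only a short call needs protection so slow I/O runs outside the lock (as around `TxData` in `publishTxToL1`). The `batcher` type and its fields are simplified stand-ins.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type batcher struct {
	mu    sync.Mutex
	state int
}

// clearState treats the whole method body as the critical section, so a
// deferred unlock is the safest choice: it runs on every return path.
func (b *batcher) clearState() {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.state = 0
}

// publishOnce only needs the lock around the short state read; it unlocks
// explicitly so the slow "network" call below runs without holding the lock.
func (b *batcher) publishOnce() {
	b.mu.Lock()
	data := b.state
	b.mu.Unlock()

	time.Sleep(10 * time.Millisecond) // stand-in for submitting a tx to L1
	fmt.Println("published", data)
}

func main() {
	b := &batcher{state: 42}
	b.publishOnce()
	b.clearState()
}
```
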
parent 2932a181
......@@ -5,7 +5,6 @@ import (
"fmt"
"io"
"math"
"sync"
"github.com/ethereum-optimism/optimism/op-batcher/metrics"
"github.com/ethereum-optimism/optimism/op-node/rollup"
......@@ -29,7 +28,6 @@ type ChannelOutFactory func(cfg ChannelConfig, rollupCfg *rollup.Config) (derive
// channel.
// Public functions on channelManager are safe for concurrent access.
type channelManager struct {
mu sync.Mutex
log log.Logger
metr metrics.Metricer
cfgProvider ChannelConfigProvider
......@@ -78,8 +76,6 @@ func (s *channelManager) SetChannelOutFactory(outFactory ChannelOutFactory) {
// Clear clears the entire state of the channel manager.
// It is intended to be used before launching op-batcher and after an L2 reorg.
func (s *channelManager) Clear(l1OriginLastSubmittedChannel eth.BlockID) {
s.mu.Lock()
defer s.mu.Unlock()
s.log.Trace("clearing channel manager state")
s.blocks.Clear()
s.blockCursor = 0
......@@ -97,8 +93,6 @@ func (s *channelManager) pendingBlocks() int {
// TxFailed records a transaction as failed. It will attempt to resubmit the data
// in the failed transaction.
func (s *channelManager) TxFailed(_id txID) {
s.mu.Lock()
defer s.mu.Unlock()
id := _id.String()
if channel, ok := s.txChannels[id]; ok {
delete(s.txChannels, id)
......@@ -111,8 +105,7 @@ func (s *channelManager) TxFailed(_id txID) {
// TxConfirmed marks a transaction as confirmed on L1. Only if the channel timed out
// the channelManager's state is modified.
func (s *channelManager) TxConfirmed(_id txID, inclusionBlock eth.BlockID) {
s.mu.Lock()
defer s.mu.Unlock()
id := _id.String()
if channel, ok := s.txChannels[id]; ok {
delete(s.txChannels, id)
......@@ -155,7 +148,7 @@ func (s *channelManager) handleChannelInvalidated(c *channel) {
}
s.rewindToBlock(blockID)
} else {
s.log.Debug("channelManager.handleChanneInvalidated: channel had no blocks")
s.log.Debug("channelManager.handleChannelInvalidated: channel had no blocks")
}
// Trim provided channel and any older channels:
......@@ -198,8 +191,6 @@ func (s *channelManager) nextTxData(channel *channel) (txData, error) {
// When switching DA type, the channelManager state will be rebuilt
// with a new ChannelConfig.
func (s *channelManager) TxData(l1Head eth.BlockID) (txData, error) {
s.mu.Lock()
defer s.mu.Unlock()
channel, err := s.getReadyChannel(l1Head)
if err != nil {
return emptyTxData, err
......@@ -437,9 +428,6 @@ func (s *channelManager) outputFrames() error {
// if the block does not extend the last block loaded into the state. If no
// blocks were added yet, the parent hash check is skipped.
func (s *channelManager) AddL2Block(block *types.Block) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.tip != (common.Hash{}) && s.tip != block.ParentHash() {
return ErrReorg
}
......
......@@ -105,7 +105,7 @@ type BatchSubmitter struct {
killCtx context.Context
cancelKillCtx context.CancelFunc
l2BlockAdded chan struct{} // notifies the throttling loop whenever an l2 block is added
pendingBytesUpdated chan int64 // notifies the throttling loop of the new pending bytes
mutex sync.Mutex
running bool
......@@ -114,8 +114,9 @@ type BatchSubmitter struct {
txpoolState TxPoolState
txpoolBlockedBlob bool
state *channelManager
prevCurrentL1 eth.L1BlockRef // cached CurrentL1 from the last syncStatus
channelMgrMutex sync.Mutex // guards channelMgr and prevCurrentL1
channelMgr *channelManager
prevCurrentL1 eth.L1BlockRef // cached CurrentL1 from the last syncStatus
}
// NewBatchSubmitter initializes the BatchSubmitter driver from a preconfigured DriverSetup
......@@ -126,7 +127,7 @@ func NewBatchSubmitter(setup DriverSetup) *BatchSubmitter {
}
return &BatchSubmitter{
DriverSetup: setup,
state: state,
channelMgr: state,
}
}
......@@ -293,13 +294,15 @@ func (l *BatchSubmitter) loadBlockIntoState(ctx context.Context, blockNumber uin
return nil, fmt.Errorf("getting L2 block: %w", err)
}
if err := l.state.AddL2Block(block); err != nil {
l.channelMgrMutex.Lock()
defer l.channelMgrMutex.Unlock()
if err := l.channelMgr.AddL2Block(block); err != nil {
return nil, fmt.Errorf("adding L2 block to state: %w", err)
}
// notify the throttling loop it may be time to initiate throttling without blocking
select {
case l.l2BlockAdded <- struct{}{}:
case l.pendingBytesUpdated <- l.channelMgr.PendingDABytes():
default:
}
......@@ -387,6 +390,35 @@ func (l *BatchSubmitter) setTxPoolState(txPoolState TxPoolState, txPoolBlockedBl
l.txpoolMutex.Unlock()
}
// syncAndPrune computes actions to take based on the current sync status, prunes the channel manager state
// and returns blocks to load.
func (l *BatchSubmitter) syncAndPrune(syncStatus *eth.SyncStatus) *inclusiveBlockRange {
l.channelMgrMutex.Lock()
defer l.channelMgrMutex.Unlock()
// Decide appropriate actions
syncActions, outOfSync := computeSyncActions(*syncStatus, l.prevCurrentL1, l.channelMgr.blocks, l.channelMgr.channelQueue, l.Log)
if outOfSync {
// If the sequencer is out of sync
// do nothing and wait to see if it has
// got in sync on the next tick.
l.Log.Warn("Sequencer is out of sync, retrying next tick.")
return syncActions.blocksToLoad
}
l.prevCurrentL1 = syncStatus.CurrentL1
// Manage existing state / garbage collection
if syncActions.clearState != nil {
l.channelMgr.Clear(*syncActions.clearState)
} else {
l.channelMgr.pruneSafeBlocks(syncActions.blocksToPrune)
l.channelMgr.pruneChannels(syncActions.channelsToPrune)
}
return syncActions.blocksToLoad
}
// mainLoop periodically:
// - polls the sequencer,
// - prunes the channel manager state (i.e. safe blocks)
......@@ -410,8 +442,8 @@ func (l *BatchSubmitter) mainLoop(ctx context.Context, receiptsCh chan txmgr.TxR
l.txpoolState = TxpoolGood
l.txpoolMutex.Unlock()
l.l2BlockAdded = make(chan struct{})
defer close(l.l2BlockAdded)
l.pendingBytesUpdated = make(chan int64)
defer close(l.pendingBytesUpdated)
ticker := time.NewTicker(l.Config.PollInterval)
defer ticker.Stop()
......@@ -430,30 +462,11 @@ func (l *BatchSubmitter) mainLoop(ctx context.Context, receiptsCh chan txmgr.TxR
continue
}
// Decide appropriate actions
syncActions, outOfSync := computeSyncActions(*syncStatus, l.prevCurrentL1, l.state.blocks, l.state.channelQueue, l.Log)
blocksToLoad := l.syncAndPrune(syncStatus)
if outOfSync {
// If the sequencer is out of sync
// do nothing and wait to see if it has
// got in sync on the next tick.
l.Log.Warn("Sequencer is out of sync, retrying next tick.")
continue
}
l.prevCurrentL1 = syncStatus.CurrentL1
// Manage existing state / garbage collection
if syncActions.clearState != nil {
l.state.Clear(*syncActions.clearState)
} else {
l.state.pruneSafeBlocks(syncActions.blocksToPrune)
l.state.pruneChannels(syncActions.channelsToPrune)
}
if syncActions.blocksToLoad != nil {
if blocksToLoad != nil {
// Get fresh unsafe blocks
if err := l.loadBlocksIntoState(l.shutdownCtx, syncActions.blocksToLoad.start, syncActions.blocksToLoad.end); errors.Is(err, ErrReorg) {
if err := l.loadBlocksIntoState(l.shutdownCtx, blocksToLoad.start, blocksToLoad.end); errors.Is(err, ErrReorg) {
l.Log.Warn("error loading blocks, clearing state and waiting for node sync", "err", err)
l.waitNodeSyncAndClearState()
continue
......@@ -461,6 +474,7 @@ func (l *BatchSubmitter) mainLoop(ctx context.Context, receiptsCh chan txmgr.TxR
}
l.publishStateToL1(queue, receiptsCh, daGroup, l.Config.PollInterval)
case <-ctx.Done():
if err := queue.Wait(); err != nil {
l.Log.Error("error waiting for transactions to complete", "err", err)
......@@ -506,7 +520,7 @@ func (l *BatchSubmitter) throttlingLoop(ctx context.Context) {
ticker := time.NewTicker(l.Config.ThrottleInterval)
defer ticker.Stop()
updateParams := func() {
updateParams := func(pendingBytes int64) {
ctx, cancel := context.WithTimeout(l.shutdownCtx, l.Config.NetworkTimeout)
defer cancel()
cl, err := l.EndpointProvider.EthClient(ctx)
......@@ -514,7 +528,7 @@ func (l *BatchSubmitter) throttlingLoop(ctx context.Context) {
l.Log.Error("Can't reach sequencer execution RPC", "err", err)
return
}
pendingBytes := l.state.PendingDABytes()
maxTxSize := uint64(0)
maxBlockSize := l.Config.ThrottleAlwaysBlockSize
if pendingBytes > int64(l.Config.ThrottleThreshold) {
......@@ -550,12 +564,14 @@ func (l *BatchSubmitter) throttlingLoop(ctx context.Context) {
}
}
cachedPendingBytes := int64(0)
for {
select {
case <-l.l2BlockAdded:
updateParams()
case <-ticker.C:
updateParams()
updateParams(int64(cachedPendingBytes))
case pendingBytes := <-l.pendingBytesUpdated:
cachedPendingBytes = pendingBytes
updateParams(pendingBytes)
case <-ctx.Done():
l.Log.Info("DA throttling loop done")
return
......@@ -621,7 +637,9 @@ func (l *BatchSubmitter) publishStateToL1(queue *txmgr.Queue[txRef], receiptsCh
l.Log.Info("txpool state is not good, aborting state publishing")
return
}
err := l.publishTxToL1(l.killCtx, queue, receiptsCh, daGroup)
if err != nil {
if err != io.EOF {
l.Log.Error("Error publishing tx to l1", "err", err)
......@@ -647,7 +665,9 @@ func (l *BatchSubmitter) clearState(ctx context.Context) {
return false
} else {
l.Log.Info("Clearing state with safe L1 origin", "origin", l1SafeOrigin)
l.state.Clear(l1SafeOrigin)
l.channelMgrMutex.Lock()
defer l.channelMgrMutex.Unlock()
l.channelMgr.Clear(l1SafeOrigin)
return true
}
}
......@@ -668,7 +688,9 @@ func (l *BatchSubmitter) clearState(ctx context.Context) {
}
case <-ctx.Done():
l.Log.Warn("Clearing state cancelled")
l.state.Clear(eth.BlockID{})
l.channelMgrMutex.Lock()
defer l.channelMgrMutex.Unlock()
l.channelMgr.Clear(eth.BlockID{})
return
}
}
......@@ -676,6 +698,7 @@ func (l *BatchSubmitter) clearState(ctx context.Context) {
// publishTxToL1 submits a single state tx to the L1
func (l *BatchSubmitter) publishTxToL1(ctx context.Context, queue *txmgr.Queue[txRef], receiptsCh chan txmgr.TxReceipt[txRef], daGroup *errgroup.Group) error {
// send all available transactions
l1tip, err := l.l1Tip(ctx)
if err != nil {
......@@ -686,7 +709,9 @@ func (l *BatchSubmitter) publishTxToL1(ctx context.Context, queue *txmgr.Queue[t
// Collect next transaction data. This pulls data out of the channel, so we need to make sure
// to put it back if ever da or txmgr requests fail, by calling l.recordFailedDARequest/recordFailedTx.
txdata, err := l.state.TxData(l1tip.ID())
l.channelMgrMutex.Lock()
txdata, err := l.channelMgr.TxData(l1tip.ID())
l.channelMgrMutex.Unlock()
if err == io.EOF {
l.Log.Trace("No transaction data available")
......@@ -870,18 +895,22 @@ func (l *BatchSubmitter) recordFailedDARequest(id txID, err error) {
if err != nil {
l.Log.Warn("DA request failed", logFields(id, err)...)
}
l.state.TxFailed(id)
l.channelMgr.TxFailed(id)
}
func (l *BatchSubmitter) recordFailedTx(id txID, err error) {
l.channelMgrMutex.Lock()
defer l.channelMgrMutex.Unlock()
l.Log.Warn("Transaction failed to send", logFields(id, err)...)
l.state.TxFailed(id)
l.channelMgr.TxFailed(id)
}
func (l *BatchSubmitter) recordConfirmedTx(id txID, receipt *types.Receipt) {
l.channelMgrMutex.Lock()
defer l.channelMgrMutex.Unlock()
l.Log.Info("Transaction confirmed", logFields(id, receipt)...)
l1block := eth.ReceiptBlockID(receipt)
l.state.TxConfirmed(id, l1block)
l.channelMgr.TxConfirmed(id, l1block)
}
// l1Tip gets the current L1 tip as a L1BlockRef. The passed context is assumed
......
......@@ -476,6 +476,6 @@ func (bs *BatcherService) ThrottlingTestDriver() *TestBatchSubmitter {
tbs := &TestBatchSubmitter{
BatchSubmitter: bs.driver,
}
tbs.BatchSubmitter.state.metr = new(metrics.ThrottlingMetrics)
tbs.BatchSubmitter.channelMgr.metr = new(metrics.ThrottlingMetrics)
return tbs
}
......@@ -27,7 +27,7 @@ func (l *TestBatchSubmitter) JamTxPool(ctx context.Context) error {
}
var candidate *txmgr.TxCandidate
var err error
cc := l.state.cfgProvider.ChannelConfig()
cc := l.channelMgr.cfgProvider.ChannelConfig()
if cc.UseBlobs {
candidate = l.calldataTxCandidate([]byte{})
} else if candidate, err = l.blobTxCandidate(emptyTxData); err != nil {
......
......@@ -92,6 +92,7 @@ func testSystem4844E2E(t *testing.T, multiBlob bool, daType batcherFlags.DataAva
}
}()
cfg.DisableProposer = true // disable L2 output submission for this test
sys, err := cfg.Start(t, action)
require.NoError(t, err, "Error starting up system")
......