Commit a34a46aa authored by Michael de Hoog

Decouple transaction submission and receipt handling

parent 8a64179f
@@ -8,6 +8,7 @@ import (
 	"math/big"
 	_ "net/http/pprof"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/ethereum-optimism/optimism/op-batcher/metrics"
@@ -40,6 +41,9 @@ type BatchSubmitter struct {
 	lastL1Tip eth.L1BlockRef
 
 	state *channelManager
+
+	txWg       sync.WaitGroup
+	pendingTxs atomic.Uint64
 }
 
 // NewBatchSubmitterFromCLIConfig initializes the BatchSubmitter, gathering any resources
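The new pendingTxs field is a sync/atomic.Uint64, added to the standard library in Go 1.19. The type has no Sub method, so the diff below decrements it by adding the two's-complement of 1, Add(^uint64(0)), the idiom the sync/atomic documentation recommends for subtraction. A tiny self-contained sketch of that idiom:

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var pending atomic.Uint64
	fmt.Println(pending.Add(1))          // 1: a transaction goes in flight
	fmt.Println(pending.Add(^uint64(0))) // 0: adding ^uint64(0) subtracts 1
}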
@@ -282,82 +286,133 @@ func (l *BatchSubmitter) calculateL2BlockRangeToStore(ctx context.Context) (eth.
 // Submitted batch, but it is not valid
 // Missed L2 block somehow.
 
+type txReceipt struct {
+	id      txID
+	receipt *types.Receipt
+	err     error
+}
+
 func (l *BatchSubmitter) loop() {
 	defer l.wg.Done()
 
-	ticker := time.NewTicker(l.PollInterval)
-	defer ticker.Stop()
+	loadTicker := time.NewTicker(l.PollInterval)
+	defer loadTicker.Stop()
+	publishTicker := time.NewTicker(100 * time.Millisecond)
+	defer publishTicker.Stop()
 
+	receiptsCh := make(chan txReceipt)
 	for {
 		select {
-		case <-ticker.C:
+		case <-loadTicker.C:
 			l.loadBlocksIntoState(l.shutdownCtx)
-			l.publishStateToL1(l.killCtx)
+		case <-publishTicker.C:
+			_ = l.publishStateToL1(l.killCtx, receiptsCh)
+		case res := <-receiptsCh:
+			// Record TX Status
+			if res.err != nil {
+				l.recordFailedTx(res.id, res.err)
+			} else {
+				l.recordConfirmedTx(res.id, res.receipt)
+			}
 		case <-l.shutdownCtx.Done():
-			l.publishStateToL1(l.killCtx)
+			l.drainState(receiptsCh)
 			return
 		}
 	}
 }
 
-// publishStateToL1 loops through the block data loaded into `state` and
-// submits the associated data to the L1 in the form of channel frames.
-func (l *BatchSubmitter) publishStateToL1(ctx context.Context) {
-	maxPending := l.MaxPendingTransactions
-	if maxPending == 0 {
-		maxPending = 1<<64 - 1
-	}
-	for {
-		// Attempt to gracefully terminate the current channel, ensuring that no new frames will be
-		// produced. Any remaining frames must still be published to the L1 to prevent stalling.
-		select {
-		case <-ctx.Done():
-			err := l.state.Close()
-			if err != nil {
-				l.log.Error("error closing the channel manager", "err", err)
-			}
-		case <-l.shutdownCtx.Done():
-			err := l.state.Close()
-			if err != nil {
-				l.log.Error("error closing the channel manager", "err", err)
-			}
-		default:
-		}
-
-		l1tip, err := l.l1Tip(ctx)
-		if err != nil {
-			l.log.Error("Failed to query L1 tip", "error", err)
-			return
-		}
-		l.recordL1Tip(l1tip)
-
-		// Collect next transaction data
-		var wg sync.WaitGroup
-		for i := uint64(0); i < maxPending; i++ {
-			var txdata txData
-			txdata, err = l.state.TxData(l1tip.ID())
-			if err == io.EOF {
-				l.log.Trace("no transaction data available")
-				break
-			} else if err != nil {
-				l.log.Error("unable to get tx data", "err", err)
-				break
-			}
-			wg.Add(1)
-			go func() {
-				defer wg.Done()
-				// Record TX Status
-				if receipt, err := l.sendTransaction(ctx, txdata.Bytes()); err != nil {
-					l.recordFailedTx(txdata.ID(), err)
-				} else {
-					l.recordConfirmedTx(txdata.ID(), receipt)
-				}
-			}()
-		}
-		wg.Wait()
-	}
-}
+func (l *BatchSubmitter) drainState(receiptsCh chan txReceipt) {
+	err := l.state.Close()
+	if err != nil {
+		l.log.Error("error closing the channel manager", "err", err)
+	}
+
+	func() {
+		// keep publishing state until we've drained all pending data (EOF), or an error occurs
+		for {
+			select {
+			case <-l.killCtx.Done():
+				return
+			default:
+				err := l.publishStateToL1(l.killCtx, receiptsCh)
+				if err != nil {
+					if err != io.EOF {
+						l.log.Error("error while publishing state on shutdown", "err", err)
+					}
+					return
+				}
+			}
+		}
+	}()
+
+	var receipts []txReceipt
+	receiptsDone := make(chan struct{})
+	go func() {
+		for {
+			select {
+			case res := <-receiptsCh:
+				receipts = append(receipts, res)
+			case <-receiptsDone:
+				return
+			}
+		}
+	}()
+
+	// wait for all transactions to complete
+	l.txWg.Wait()
+	close(receiptsDone)
+
+	// process the receipts
+	for _, res := range receipts {
+		if res.err != nil {
+			l.recordFailedTx(res.id, res.err)
+		} else {
+			l.recordConfirmedTx(res.id, res.receipt)
+		}
+	}
+}
+
+// publishStateToL1 pulls the block data loaded into `state` and
+// submits the associated data to the L1 in the form of channel frames.
+func (l *BatchSubmitter) publishStateToL1(ctx context.Context, receiptsCh chan txReceipt) error {
+	pending := l.pendingTxs.Load()
+	if l.MaxPendingTransactions > 0 && pending >= l.MaxPendingTransactions {
+		l.log.Trace("skipping publish due to pending transactions")
+		return nil
+	}
+
+	l1tip, err := l.l1Tip(ctx)
+	if err != nil {
+		l.log.Error("Failed to query L1 tip", "error", err)
+		return err
+	}
+	l.recordL1Tip(l1tip)
+
+	// Collect next transaction data
+	txdata, err := l.state.TxData(l1tip.ID())
+	if err == io.EOF {
+		l.log.Trace("no transaction data available")
+		return err
+	} else if err != nil {
+		l.log.Error("unable to get tx data", "err", err)
+		return err
+	}
+
+	pending = l.pendingTxs.Add(1)
+	l.metr.RecordPendingTx(pending)
+	l.txWg.Add(1)
+	go func() {
+		defer func() {
+			l.txWg.Done()
+			pending = l.pendingTxs.Add(^uint64(0)) // -1
+			l.metr.RecordPendingTx(pending)
+		}()
+		receipt, err := l.sendTransaction(ctx, txdata.Bytes())
+		receiptsCh <- txReceipt{
+			id:      txdata.ID(),
+			receipt: receipt,
+			err:     err,
+		}
+	}()
+
+	return nil
+}
 
 // sendTransaction creates & submits a transaction to the batch inbox address with the given `data`.
 // It currently uses the underlying `txmgr` to handle transaction sending & price management.
 // This is a blocking method. It should not be called concurrently.
...
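Taken together, the new loop/drainState/publishStateToL1 split means submission only starts a goroutine and returns immediately, while receipts flow back to the event loop over a channel. Below is a minimal, self-contained sketch of that pattern outside the batcher; the submitter and result names are hypothetical, and a time.Sleep stands in for the real sendTransaction call.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// result pairs a submission id with its outcome, in the spirit of the
// txReceipt struct added by this commit.
type result struct {
	id  int
	err error
}

type submitter struct {
	maxPending uint64        // 0 means no limit
	pending    atomic.Uint64 // submissions still awaiting a receipt
	wg         sync.WaitGroup
}

// trySubmit starts one asynchronous send and returns immediately; the
// receipt arrives later on resCh. Returns false when at capacity.
func (s *submitter) trySubmit(id int, resCh chan<- result) bool {
	if s.maxPending > 0 && s.pending.Load() >= s.maxPending {
		return false // throttled, like the pendingTxs check in publishStateToL1
	}
	s.pending.Add(1)
	s.wg.Add(1)
	go func() {
		defer func() {
			s.wg.Done()
			s.pending.Add(^uint64(0)) // decrement the in-flight count
		}()
		time.Sleep(50 * time.Millisecond) // stand-in for sendTransaction
		resCh <- result{id: id}
	}()
	return true
}

func main() {
	s := &submitter{maxPending: 2}
	resCh := make(chan result)
	publishTicker := time.NewTicker(10 * time.Millisecond)
	defer publishTicker.Stop()

	submitted, confirmed := 0, 0
	for confirmed < 5 {
		select {
		case <-publishTicker.C: // submission side of the loop
			if submitted < 5 && s.trySubmit(submitted, resCh) {
				submitted++
			}
		case res := <-resCh: // receipt side, handled independently
			fmt.Println("confirmed", res.id, "err:", res.err)
			confirmed++
		}
	}
	s.wg.Wait()
}

Throttling lives entirely on the submission side (the pending counter), so a slow receipt never blocks new submissions up to the cap.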
@@ -34,6 +34,7 @@ type Metricer interface {
 	RecordChannelFullySubmitted(id derive.ChannelID)
 	RecordChannelTimedOut(id derive.ChannelID)
+	RecordPendingTx(pending uint64)
 
 	RecordBatchTxSubmitted()
 	RecordBatchTxSuccess()
 	RecordBatchTxFailed()
@@ -67,6 +68,7 @@ type Metrics struct {
 	ChannelInputBytesTotal  prometheus.Counter
 	ChannelOutputBytesTotal prometheus.Counter
 
+	PendingTxs prometheus.Gauge
 
 	BatcherTxEvs opmetrics.EventVec
 }
@@ -157,6 +159,12 @@ func NewMetrics(procName string) *Metrics {
 			Help: "Total number of compressed output bytes from a channel.",
 		}),
 
+		PendingTxs: factory.NewGauge(prometheus.GaugeOpts{
+			Namespace: ns,
+			Name:      "pending_txs",
+			Help:      "Number of transactions pending receipts.",
+		}),
+
 		BatcherTxEvs: opmetrics.NewEventVec(factory, ns, "", "batcher_tx", "BatcherTx", []string{"stage"}),
 	}
 }
@@ -256,6 +264,10 @@ func (m *Metrics) RecordChannelTimedOut(id derive.ChannelID) {
 	m.ChannelEvs.Record(StageTimedOut)
 }
 
+func (m *Metrics) RecordPendingTx(pending uint64) {
+	m.PendingTxs.Set(float64(pending))
+}
+
 func (m *Metrics) RecordBatchTxSubmitted() {
 	m.BatcherTxEvs.Record(TxStageSubmitted)
 }
...
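For reference, the PendingTxs gauge added above is an ordinary client_golang gauge. A standalone sketch using the stock prometheus registry instead of the repo's metrics factory (the op_batcher namespace string is an assumption here; the diff uses the ns variable):

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Equivalent registration to the factory.NewGauge call in the diff.
	pendingTxs := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "op_batcher",
		Name:      "pending_txs",
		Help:      "Number of transactions pending receipts.",
	})
	prometheus.MustRegister(pendingTxs)

	// RecordPendingTx simply mirrors the in-flight count into the gauge.
	pendingTxs.Set(float64(3))

	// Expose /metrics so the value can be scraped.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":2112", nil))
}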
@@ -29,6 +29,7 @@ func (*noopMetrics) RecordChannelClosed(derive.ChannelID, int, int, int, int, er
 func (*noopMetrics) RecordChannelFullySubmitted(derive.ChannelID) {}
 func (*noopMetrics) RecordChannelTimedOut(derive.ChannelID)       {}
+func (*noopMetrics) RecordPendingTx(uint64)                       {}
 
 func (*noopMetrics) RecordBatchTxSubmitted() {}
 func (*noopMetrics) RecordBatchTxSuccess()   {}
 func (*noopMetrics) RecordBatchTxFailed()    {}
@@ -593,17 +593,18 @@ func (cfg SystemConfig) Start(_opts ...SystemConfigOption) (*System, error) {
 	// Batch Submitter
 	sys.BatchSubmitter, err = bss.NewBatchSubmitterFromCLIConfig(bss.CLIConfig{
 		L1EthRpc:  sys.Nodes["l1"].WSEndpoint(),
 		L2EthRpc:  sys.Nodes["sequencer"].WSEndpoint(),
 		RollupRpc: sys.RollupNodes["sequencer"].HTTPEndpoint(),
-		MaxChannelDuration: 1,
-		MaxL1TxSize:        120_000,
-		TargetL1TxSize:     100_000,
-		TargetNumFrames:    1,
-		ApproxComprRatio:   0.4,
-		SubSafetyMargin:    4,
-		PollInterval:       50 * time.Millisecond,
-		TxMgrConfig:        newTxMgrConfig(sys.Nodes["l1"].WSEndpoint(), cfg.Secrets.Batcher),
+		MaxPendingTransactions: 1,
+		MaxChannelDuration:     1,
+		MaxL1TxSize:            120_000,
+		TargetL1TxSize:         100_000,
+		TargetNumFrames:        1,
+		ApproxComprRatio:       0.4,
+		SubSafetyMargin:        4,
+		PollInterval:           50 * time.Millisecond,
+		TxMgrConfig:            newTxMgrConfig(sys.Nodes["l1"].WSEndpoint(), cfg.Secrets.Batcher),
 		LogConfig: oplog.CLIConfig{
 			Level:  "info",
 			Format: "text",
...
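The e2e config now pins MaxPendingTransactions to 1, which serializes batcher submissions in tests. Its semantics follow the capacity check added in publishStateToL1 above; a small illustrative helper (the atCapacity name is hypothetical):

package main

import "fmt"

// atCapacity mirrors the check publishStateToL1 performs before queuing
// another transaction: MaxPendingTransactions == 0 disables the limit.
func atCapacity(maxPending, pending uint64) bool {
	return maxPending > 0 && pending >= maxPending
}

func main() {
	fmt.Println(atCapacity(0, 10)) // false: 0 means unlimited
	fmt.Println(atCapacity(1, 1))  // true: the e2e config serializes submissions
}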