op-challenger: Fix highest acted L1 block metric for completed games (#10549)

* op-challenger: Fix highest acted L1 block metric for completed games Adds an e2e test to prove it really does work. * op-challenger: Add more context to comment.

op-challenger: Fix highest acted L1 block metric for completed games (#10549)
* op-challenger: Fix highest acted L1 block metric for completed games Adds an e2e test to prove it really does work. * op-challenger: Add more context to comment.
27d2f163 · Adrian Sutton · GitHub · fa88d9c3 · 27d2f163 · 27d2f163
Commit 27d2f163 authored May 16, 2024 by Adrian Sutton Committed by GitHub May 15, 2024
9 changed files
--- a/op-challenger/challenger.go
+++ b/op-challenger/challenger.go
@@ -3,6 +3,7 @@ package op_challenger
 import (
 	"context"
+	"github.com/ethereum-optimism/optimism/op-challenger/metrics"
 	"github.com/ethereum/go-ethereum/log"
 	"github.com/ethereum-optimism/optimism/op-challenger/config"
@@ -11,9 +12,9 @@ import (
 )
 // Main is the programmatic entry-point for running op-challenger with a given configuration.
-func Main(ctx context.Context, logger log.Logger, cfg *config.Config) (cliapp.Lifecycle, error) {
+func Main(ctx context.Context, logger log.Logger, cfg *config.Config, m metrics.Metricer) (cliapp.Lifecycle, error) {
 	if err := cfg.Check(); err != nil {
 		return nil, err
 	}
-	return game.NewService(ctx, logger, cfg)
+	return game.NewService(ctx, logger, cfg, m)
 }
--- a/op-challenger/challenger_test.go
+++ b/op-challenger/challenger_test.go
@@ -5,6 +5,7 @@ import (
 	"testing"
 	"github.com/ethereum-optimism/optimism/op-challenger/config"
+	"github.com/ethereum-optimism/optimism/op-challenger/metrics"
 	"github.com/ethereum-optimism/optimism/op-service/testlog"
 	"github.com/ethereum/go-ethereum/log"
 	"github.com/stretchr/testify/require"
@@ -12,7 +13,7 @@ import (
 func TestMainShouldReturnErrorWhenConfigInvalid(t *testing.T) {
 	cfg := &config.Config{}
-	app, err := Main(context.Background(), testlog.Logger(t, log.LevelInfo), cfg)
+	app, err := Main(context.Background(), testlog.Logger(t, log.LevelInfo), cfg, metrics.NoopMetrics)
 	require.ErrorIs(t, err, cfg.Check())
 	require.Nil(t, app)
 }
--- a/op-challenger/cmd/main.go
+++ b/op-challenger/cmd/main.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"os"
+	"github.com/ethereum-optimism/optimism/op-challenger/metrics"
 	"github.com/urfave/cli/v2"
 	"github.com/ethereum/go-ethereum/log"
@@ -29,7 +30,9 @@ var VersionWithMeta = opservice.FormatVersion(version.Version, GitCommit, GitDat
 func main() {
 	args := os.Args
 	ctx := opio.WithInterruptBlocker(context.Background())
-	if err := run(ctx, args, challenger.Main); err != nil {
+	if err := run(ctx, args, func(ctx context.Context, l log.Logger, config *config.Config) (cliapp.Lifecycle, error) {
+		return challenger.Main(ctx, l, config, metrics.NewMetrics())
+	}); err != nil {
 		log.Crit("Application failed", "err", err)
 	}
 }

--- a/op-challenger/game/scheduler/coordinator.go
+++ b/op-challenger/game/scheduler/coordinator.go
@@ -143,12 +143,12 @@ func (c *coordinator) createJob(ctx context.Context, game types.GameMetadata, bl
 		state.player = player
 		state.status = player.Status()
 	}
-	state.inflight = true
 	if state.status != types.GameStatusInProgress {
 		c.logger.Debug("Not rescheduling resolved game", "game", game.Proxy, "status", state.status)
 		state.lastProcessedBlockNum = blockNumber
 		return nil, nil
 	}
+	state.inflight = true
 	return newJob(blockNumber, game.Proxy, state.player, state.status), nil
 }

--- a/op-challenger/game/scheduler/coordinator_test.go
+++ b/op-challenger/game/scheduler/coordinator_test.go
@@ -204,29 +204,63 @@ func TestDeleteDataForResolvedGames(t *testing.T) {
 	require.True(t, disk.gameDirExists[gameAddr1], "game 1 data should be preserved (not resolved)")
 	require.False(t, disk.gameDirExists[gameAddr2], "game 2 data should be deleted")
-	require.True(t, disk.gameDirExists[gameAddr3], "game 3 data should be preserved (inflight)")
+	// Game 3 never got marked as in-flight because it was already resolved so got skipped.
+	// We shouldn't be able to have a known-resolved game that is also in-flight because we always skip processing it.
+	require.False(t, disk.gameDirExists[gameAddr3], "game 3 data should be deleted")
 }
 func TestSchedule_RecordActedL1Block(t *testing.T) {
 	c, workQueue, _, _, _, _ := setupCoordinatorTest(t, 10)
-	gameAddr3 := common.Address{0xcc}
+	gameAddr1 := common.Address{0xaa}
+	gameAddr2 := common.Address{0xcc}
 	ctx := context.Background()
 	// The first game should be tracked
-	require.NoError(t, c.schedule(ctx, asGames(gameAddr3), 1))
+	require.NoError(t, c.schedule(ctx, asGames(gameAddr1, gameAddr2), 1))
 	// Process the result
-	require.Len(t, workQueue, 1)
+	require.Len(t, workQueue, 2)
 	j := <-workQueue
+	require.Equal(t, gameAddr1, j.addr)
 	j.status = types.GameStatusDefenderWon
 	require.NoError(t, c.processResult(j))
+	j = <-workQueue
+	require.Equal(t, gameAddr2, j.addr)
+	j.status = types.GameStatusInProgress
+	require.NoError(t, c.processResult(j))
+	// Schedule another block
+	require.NoError(t, c.schedule(ctx, asGames(gameAddr1, gameAddr2), 2))
+	// Process the result (only the in-progress game gets rescheduled)
+	require.Len(t, workQueue, 1)
+	j = <-workQueue
+	require.Equal(t, gameAddr2, j.addr)
+	require.Equal(t, uint64(2), j.block)
+	j.status = types.GameStatusInProgress
+	require.NoError(t, c.processResult(j))
+	// Schedule a third block
+	require.NoError(t, c.schedule(ctx, asGames(gameAddr1, gameAddr2), 3))
+	// Process the result (only the in-progress game gets rescheduled)
+	// This is deliberately done a third time, because there was actually a bug where it worked for the first two
+	// cycles and failed on the third. This was because the first cycle the game status was unknown so it was processed
+	// the second cycle was the first time the game was known to be complete so was skipped but crucially it left it
+	// marked as in-flight.  On the third update the was incorrectly skipped as in-flight and the l1 block number
+	// wasn't updated. From then on the block number would never be updated.
+	require.Len(t, workQueue, 1)
+	j = <-workQueue
+	require.Equal(t, gameAddr2, j.addr)
+	require.Equal(t, uint64(3), j.block)
+	j.status = types.GameStatusInProgress
+	require.NoError(t, c.processResult(j))
 	// Schedule so that the metric is updated
-	require.NoError(t, c.schedule(ctx, asGames(gameAddr3), 2))
+	require.NoError(t, c.schedule(ctx, asGames(gameAddr1, gameAddr2), 4))
 	// Verify that the block number is recorded by the metricer as acted upon
-	// The one game is now complete so its block number is updated immediately because it doesn't get scheduled
+	require.Equal(t, uint64(3), c.m.(*stubSchedulerMetrics).actedL1Blocks)
-	require.Equal(t, uint64(2), c.m.(*stubSchedulerMetrics).actedL1Blocks)
 }
 func TestSchedule_RecordActedL1BlockMultipleGames(t *testing.T) {

--- a/op-challenger/game/service.go
+++ b/op-challenger/game/service.go
@@ -69,12 +69,12 @@ type Service struct {
 }
 // NewService creates a new Service.
-func NewService(ctx context.Context, logger log.Logger, cfg *config.Config) (*Service, error) {
+func NewService(ctx context.Context, logger log.Logger, cfg *config.Config, m metrics.Metricer) (*Service, error) {
 	s := &Service{
 		systemClock: clock.SystemClock,
 		l1Clock:     clock.NewSimpleClock(),
 		logger:      logger,
-		metrics:     metrics.NewMetrics(),
+		metrics:     m,
 	}
 	if err := s.initFromConfig(ctx, cfg); err != nil {

--- a/op-e2e/e2eutils/challenger/helper.go
+++ b/op-e2e/e2eutils/challenger/helper.go
@@ -12,6 +12,7 @@ import (
 	"time"
 	"github.com/ethereum-optimism/optimism/op-service/metrics"
+	"github.com/ethereum/go-ethereum/ethclient"
 	"github.com/stretchr/testify/require"
 	"github.com/ethereum/go-ethereum/common"
@@ -39,15 +40,17 @@ type Helper struct {
 	require *require.Assertions
 	dir     string
 	chl     cliapp.Lifecycle
+	metrics *CapturingMetrics
 }
-func NewHelper(log log.Logger, t *testing.T, require *require.Assertions, dir string, chl cliapp.Lifecycle) *Helper {
+func NewHelper(log log.Logger, t *testing.T, require *require.Assertions, dir string, chl cliapp.Lifecycle, m *CapturingMetrics) *Helper {
 	return &Helper{
 		log:     log,
 		t:       t,
 		require: require,
 		dir:     dir,
 		chl:     chl,
+		metrics: m,
 	}
 }
@@ -134,17 +137,13 @@ func NewChallenger(t *testing.T, ctx context.Context, sys EndpointProvider, name
 	log := testlog.Logger(t, log.LevelDebug).New("role", name)
 	log.Info("Creating challenger")
 	cfg := NewChallengerConfig(t, sys, "sequencer", options...)
-	chl, err := challenger.Main(ctx, log, cfg)
+	cfg.MetricsConfig.Enabled = false // Don't start the metrics server
+	m := NewCapturingMetrics()
+	chl, err := challenger.Main(ctx, log, cfg, m)
 	require.NoError(t, err, "must init challenger")
 	require.NoError(t, chl.Start(ctx), "must start challenger")
-	return &Helper{
+	return NewHelper(log, t, require.New(t), cfg.Datadir, chl, m)
-		log:     log,
-		t:       t,
-		require: require.New(t),
-		dir:     cfg.Datadir,
-		chl:     chl,
-	}
 }
 func NewChallengerConfig(t *testing.T, sys EndpointProvider, l2NodeName string, options ...Option) *config.Config {
@@ -234,3 +233,21 @@ func (h *Helper) WaitForGameDataDeletion(ctx context.Context, games ...GameAddr)
 func (h *Helper) gameDataDir(addr common.Address) string {
 	return filepath.Join(h.dir, "game-"+addr.Hex())
 }
+func (h *Helper) WaitL1HeadActedOn(ctx context.Context, client *ethclient.Client) {
+	l1Head, err := client.BlockNumber(ctx)
+	h.require.NoError(err)
+	h.WaitForHighestActedL1Block(ctx, l1Head)
+}
+func (h *Helper) WaitForHighestActedL1Block(ctx context.Context, head uint64) {
+	timedCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+	var actual uint64
+	err := wait.For(timedCtx, time.Second, func() (bool, error) {
+		actual = h.metrics.HighestActedL1Block.Load()
+		h.log.Info("Waiting for highest acted L1 block", "target", head, "actual", actual)
+		return actual >= head, nil
+	})
+	h.require.NoErrorf(err, "Highest acted L1 block did not reach %v, was: %v", head, actual)
+}
--- a/op-e2e/e2eutils/challenger/metrics.go
+++ b/op-e2e/e2eutils/challenger/metrics.go
+package challenger
+import (
+	"sync/atomic"
+	"github.com/ethereum-optimism/optimism/op-challenger/metrics"
+)
+type CapturingMetrics struct {
+	metrics.NoopMetricsImpl
+	HighestActedL1Block atomic.Uint64
+}
+func NewCapturingMetrics() *CapturingMetrics {
+	return &CapturingMetrics{}
+}
+var _ metrics.Metricer = (*CapturingMetrics)(nil)
+func (c *CapturingMetrics) RecordActedL1Block(block uint64) {
+	c.HighestActedL1Block.Store(block)
+}
--- a/op-e2e/faultproofs/output_alphabet_test.go
+++ b/op-e2e/faultproofs/output_alphabet_test.go
@@ -293,3 +293,27 @@ func TestOutputAlphabetGame_FreeloaderEarnsNothing(t *testing.T) {
 	amt := game.Credit(ctx, freeloaderOpts.From)
 	require.True(t, amt.BitLen() == 0, "freeloaders should not be rewarded")
 }
+func TestHighestActedL1BlockMetric(t *testing.T) {
+	op_e2e.InitParallel(t)
+	ctx := context.Background()
+	sys, l1Client := StartFaultDisputeSystem(t)
+	t.Cleanup(sys.Close)
+	disputeGameFactory := disputegame.NewFactoryHelper(t, ctx, sys)
+	honestChallenger := disputeGameFactory.StartChallenger(ctx, "Honest", challenger.WithAlphabet(), challenger.WithPrivKey(sys.Cfg.Secrets.Alice))
+	game1 := disputeGameFactory.StartOutputAlphabetGame(ctx, "sequencer", 1, common.Hash{0xaa})
+	sys.AdvanceTime(game1.MaxClockDuration(ctx))
+	require.NoError(t, wait.ForNextBlock(ctx, l1Client))
+	game1.WaitForGameStatus(ctx, disputegame.StatusDefenderWins)
+	disputeGameFactory.StartOutputAlphabetGame(ctx, "sequencer", 2, common.Hash{0xaa})
+	disputeGameFactory.StartOutputAlphabetGame(ctx, "sequencer", 3, common.Hash{0xaa})
+	honestChallenger.WaitL1HeadActedOn(ctx, l1Client)
+	require.NoError(t, wait.ForNextBlock(ctx, l1Client))
+	honestChallenger.WaitL1HeadActedOn(ctx, l1Client)
+}