[op-conductor] fix flaky e2e tests (#9144)

* op-conductor: fix ha test flakiness * 3rd try * 4th try * 1st try with everything ready * 2nd try * 3rd try * 4th try

[op-conductor] fix flaky e2e tests (#9144)
* op-conductor: fix ha test flakiness * 3rd try * 4th try * 1st try with everything ready * 2nd try * 3rd try * 4th try
5e0dc463 · Francis Li · GitHub · 73f92f90 · 5e0dc463 · 5e0dc463
Commit 5e0dc463 authored Jan 24, 2024 by Francis Li Committed by GitHub Jan 24, 2024
12 changed files
--- a/op-conductor/conductor/config.go
+++ b/op-conductor/conductor/config.go
@@ -115,9 +115,10 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*Config, error) {
 		ExecutionRPC:   ctx.String(flags.ExecutionRPC.Name),
 		Paused:         ctx.Bool(flags.Paused.Name),
 		HealthCheck: HealthCheckConfig{
-			Interval:     ctx.Uint64(flags.HealthCheckInterval.Name),
-			SafeInterval: ctx.Uint64(flags.HealthCheckSafeInterval.Name),
-			MinPeerCount: ctx.Uint64(flags.HealthCheckMinPeerCount.Name),
+			Interval:       ctx.Uint64(flags.HealthCheckInterval.Name),
+			UnsafeInterval: ctx.Uint64(flags.HealthCheckUnsafeInterval.Name),
+			SafeInterval:   ctx.Uint64(flags.HealthCheckSafeInterval.Name),
+			MinPeerCount:   ctx.Uint64(flags.HealthCheckMinPeerCount.Name),
 		},
 		RollupCfg:      *rollupCfg,
 		RPCEnableProxy: ctx.Bool(flags.RPCEnableProxy.Name),
@@ -133,6 +134,9 @@ type HealthCheckConfig struct {
 	// Interval is the interval (in seconds) to check the health of the sequencer.
 	Interval uint64

+	// UnsafeInterval is the interval allowed between unsafe head and now in seconds.
+	UnsafeInterval uint64
+
 	// SafeInterval is the interval between safe head progression measured in seconds.
 	SafeInterval uint64


--- a/op-conductor/conductor/service.go
+++ b/op-conductor/conductor/service.go
@@ -175,6 +175,7 @@ func (c *OpConductor) initHealthMonitor(ctx context.Context) error {
 	c.hmon = health.NewSequencerHealthMonitor(
 		c.log,
 		c.cfg.HealthCheck.Interval,
+		c.cfg.HealthCheck.UnsafeInterval,
 		c.cfg.HealthCheck.SafeInterval,
 		c.cfg.HealthCheck.MinPeerCount,
 		&c.cfg.RollupCfg,
@@ -220,6 +221,12 @@ func (oc *OpConductor) initRPCServer(ctx context.Context) error {
 			Namespace: conductorrpc.NodeRPCNamespace,
 			Service:   nodeProxy,
 		})
+
+		nodeAdminProxy := conductorrpc.NewNodeAdminProxyBackend(oc.log, oc, nodeClient)
+		server.AddAPI(rpc.API{
+			Namespace: conductorrpc.NodeAdminRPCNamespace,
+			Service:   nodeAdminProxy,
+		})
 	}

 	oc.rpcServer = server
@@ -572,7 +579,6 @@ func (oc *OpConductor) startSequencer() error {
 		return errors.Wrap(err, "failed to get latest unsafe block from EL during startSequencer phase")
 	}

-	//if unsafeInCons.BlockHash != unsafeInNode.Hash() {
 	if unsafeInCons.ExecutionPayload.BlockHash != unsafeInNode.Hash() {
 		oc.log.Warn(
 			"latest unsafe block in consensus is not the same as the one in op-node",

--- a/op-conductor/conductor/service_test.go
+++ b/op-conductor/conductor/service_test.go
@@ -36,9 +36,10 @@ func mockConfig(t *testing.T) Config {
 		ExecutionRPC:   "http://geth:8545",
 		Paused:         false,
 		HealthCheck: HealthCheckConfig{
-			Interval:     1,
-			SafeInterval: 5,
-			MinPeerCount: 1,
+			Interval:       1,
+			UnsafeInterval: 3,
+			SafeInterval:   5,
+			MinPeerCount:   1,
 		},
 		RollupCfg: rollup.Config{
 			Genesis: rollup.Genesis{

--- a/op-conductor/flags/flags.go
+++ b/op-conductor/flags/flags.go
@@ -53,6 +53,11 @@ var (
 		Usage:   "Interval between health checks",
 		EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "HEALTHCHECK_INTERVAL"),
 	}
+	HealthCheckUnsafeInterval = &cli.Uint64Flag{
+		Name:    "healthcheck.unsafe-interval",
+		Usage:   "Interval allowed between unsafe head and now measured in seconds",
+		EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "HEALTHCHECK_UNSAFE_INTERVAL"),
+	}
 	HealthCheckSafeInterval = &cli.Uint64Flag{
 		Name:    "healthcheck.safe-interval",
 		Usage:   "Interval between safe head progression measured in seconds",

--- a/op-conductor/health/monitor.go
+++ b/op-conductor/health/monitor.go
@@ -28,13 +28,14 @@ type HealthMonitor interface {
 // interval is the interval between health checks measured in seconds.
 // safeInterval is the interval between safe head progress measured in seconds.
 // minPeerCount is the minimum number of peers required for the sequencer to be healthy.
-func NewSequencerHealthMonitor(log log.Logger, interval, safeInterval, minPeerCount uint64, rollupCfg *rollup.Config, node dial.RollupClientInterface, p2p p2p.API) HealthMonitor {
+func NewSequencerHealthMonitor(log log.Logger, interval, unsafeInterval, safeInterval, minPeerCount uint64, rollupCfg *rollup.Config, node dial.RollupClientInterface, p2p p2p.API) HealthMonitor {
 	return &SequencerHealthMonitor{
 		log:            log,
 		done:           make(chan struct{}),
 		interval:       interval,
 		healthUpdateCh: make(chan bool),
 		rollupCfg:      rollupCfg,
+		unsafeInterval: unsafeInterval,
 		safeInterval:   safeInterval,
 		minPeerCount:   minPeerCount,
 		node:           node,
@@ -48,11 +49,14 @@ type SequencerHealthMonitor struct {
 	done chan struct{}
 	wg   sync.WaitGroup

-	rollupCfg      *rollup.Config
-	safeInterval   uint64
-	minPeerCount   uint64
-	interval       uint64
-	healthUpdateCh chan bool
+	rollupCfg          *rollup.Config
+	unsafeInterval     uint64
+	safeInterval       uint64
+	minPeerCount       uint64
+	interval           uint64
+	healthUpdateCh     chan bool
+	lastSeenUnsafeNum  uint64
+	lastSeenUnsafeTime uint64

 	node dial.RollupClientInterface
 	p2p  p2p.API
@@ -104,8 +108,9 @@ func (hm *SequencerHealthMonitor) loop() {

 // healthCheck checks the health of the sequencer by 3 criteria:
 // 1. unsafe head is progressing per block time
-// 2. safe head is progressing every configured batch submission interval
-// 3. peer count is above the configured minimum
+// 2. unsafe head is not too far behind now (measured by unsafeInterval)
+// 3. safe head is progressing every configured batch submission interval
+// 4. peer count is above the configured minimum
 func (hm *SequencerHealthMonitor) healthCheck() bool {
 	ctx := context.Background()
 	status, err := hm.node.SyncStatus(ctx)
@@ -115,14 +120,48 @@ func (hm *SequencerHealthMonitor) healthCheck() bool {
 	}

 	now := uint64(time.Now().Unix())
-	// allow at most one block drift for unsafe head
-	if now-status.UnsafeL2.Time > hm.interval+hm.rollupCfg.BlockTime {
-		hm.log.Error("unsafe head is not progressing", "lastSeenUnsafeBlock", status.UnsafeL2)
+
+	if hm.lastSeenUnsafeNum != 0 {
+		diff := now - hm.lastSeenUnsafeTime
+		// how many blocks do we expect to see, minus 1 to account for edge case with respect to time.
+		// for example, if diff = 2.001s and block time = 2s, expecting to see 1 block could potentially cause sequencer to be considered unhealthy.
+		blocks := diff/hm.rollupCfg.BlockTime - 1
+		if diff > hm.rollupCfg.BlockTime && blocks > status.UnsafeL2.Number-hm.lastSeenUnsafeNum {
+			hm.log.Error(
+				"unsafe head is not progressing as expected",
+				"now", now,
+				"unsafe_head_num", status.UnsafeL2.Number,
+				"last_seen_unsafe_num", hm.lastSeenUnsafeNum,
+				"last_seen_unsafe_time", hm.lastSeenUnsafeTime,
+				"unsafe_interval", hm.unsafeInterval,
+			)
+			return false
+		}
+	}
+	if status.UnsafeL2.Number > hm.lastSeenUnsafeNum {
+		hm.lastSeenUnsafeNum = status.UnsafeL2.Number
+		hm.lastSeenUnsafeTime = now
+	}
+
+	if now-status.UnsafeL2.Time > hm.unsafeInterval {
+		hm.log.Error(
+			"unsafe head is not progressing as expected",
+			"now", now,
+			"unsafe_head_num", status.UnsafeL2.Number,
+			"unsafe_head_time", status.UnsafeL2.Time,
+			"unsafe_interval", hm.unsafeInterval,
+		)
 		return false
 	}

 	if now-status.SafeL2.Time > hm.safeInterval {
-		hm.log.Error("safe head is not progressing", "safe_head_time", status.SafeL2.Time, "now", now)
+		hm.log.Error(
+			"safe head is not progressing as expected",
+			"now", now,
+			"safe_head_num", status.SafeL2.Number,
+			"safe_head_time", status.SafeL2.Time,
+			"safe_interval", hm.safeInterval,
+		)
 		return false
 	}


--- a/op-conductor/health/monitor_test.go
+++ b/op-conductor/health/monitor_test.go
@@ -26,14 +26,15 @@ const (
 type HealthMonitorTestSuite struct {
 	suite.Suite

-	log          log.Logger
-	rc           *testutils.MockRollupClient
-	pc           *p2pMocks.API
-	interval     uint64
-	safeInterval uint64
-	minPeerCount uint64
-	rollupCfg    *rollup.Config
-	monitor      HealthMonitor
+	log            log.Logger
+	rc             *testutils.MockRollupClient
+	pc             *p2pMocks.API
+	interval       uint64
+	unsafeInterval uint64
+	safeInterval   uint64
+	minPeerCount   uint64
+	rollupCfg      *rollup.Config
+	monitor        HealthMonitor
 }

 func (s *HealthMonitorTestSuite) SetupSuite() {
@@ -41,6 +42,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() {
 	s.rc = &testutils.MockRollupClient{}
 	s.pc = &p2pMocks.API{}
 	s.interval = 1
+	s.unsafeInterval = 3
 	s.safeInterval = 5
 	s.minPeerCount = minPeerCount
 	s.rollupCfg = &rollup.Config{
@@ -49,7 +51,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() {
 }

 func (s *HealthMonitorTestSuite) SetupTest() {
-	s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc)
+	s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.unsafeInterval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc)
 	err := s.monitor.Start()
 	s.NoError(err)
 }
@@ -90,10 +92,12 @@ func (s *HealthMonitorTestSuite) TestUnhealthyUnsafeHeadNotProgressing() {
 	now := uint64(time.Now().Unix())
 	ss1 := &eth.SyncStatus{
 		UnsafeL2: eth.L2BlockRef{
-			Time: now - 1,
+			Number: 5,
+			Time:   now - 1,
 		},
 		SafeL2: eth.L2BlockRef{
-			Time: now - 2,
+			Number: 1,
+			Time:   now - 2,
 		},
 	}
 	s.rc.ExpectSyncStatus(ss1, nil)

--- a/op-conductor/rpc/api.go
+++ b/op-conductor/rpc/api.go
@@ -3,15 +3,14 @@ package rpc
 import (
 	"context"
 	"errors"
-	"math/big"

+	"github.com/ethereum/go-ethereum/rpc"
+
+	"github.com/ethereum-optimism/optimism/op-node/rollup"
 	"github.com/ethereum-optimism/optimism/op-service/eth"
-	"github.com/ethereum/go-ethereum/core/types"
 )

-var (
-	ErrNotLeader = errors.New("refusing to proxy request to non-leader sequencer")
-)
+var ErrNotLeader = errors.New("refusing to proxy request to non-leader sequencer")

 type ServerInfo struct {
 	ID   string `json:"id"`
@@ -53,13 +52,19 @@ type API interface {
 // ExecutionProxyAPI defines the methods proxied to the execution rpc backend
 // This should include all methods that are called by op-batcher or op-proposer
 type ExecutionProxyAPI interface {
-	BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error)
+	GetBlockByNumber(ctx context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error)
 }

 // NodeProxyAPI defines the methods proxied to the node rpc backend
 // This should include all methods that are called by op-batcher or op-proposer
 type NodeProxyAPI interface {
 	OutputAtBlock(ctx context.Context, blockNum uint64) (*eth.OutputResponse, error)
-	SequencerActive(ctx context.Context) (bool, error)
 	SyncStatus(ctx context.Context) (*eth.SyncStatus, error)
+	RollupConfig(ctx context.Context) (*rollup.Config, error)
+}
+
+// NodeProxyAPI defines the methods proxied to the node rpc backend
+// This should include all methods that are called by op-batcher or op-proposer
+type NodeAdminProxyAPI interface {
+	SequencerActive(ctx context.Context) (bool, error)
 }
--- a/op-conductor/rpc/execution_proxy.go
+++ b/op-conductor/rpc/execution_proxy.go
@@ -2,11 +2,10 @@ package rpc

 import (
 	"context"
-	"math/big"

-	"github.com/ethereum/go-ethereum/core/types"
 	"github.com/ethereum/go-ethereum/ethclient"
 	"github.com/ethereum/go-ethereum/log"
+	"github.com/ethereum/go-ethereum/rpc"
 )

 var ExecutionRPCNamespace = "eth"
@@ -28,13 +27,14 @@ func NewExecutionProxyBackend(log log.Logger, con conductor, client *ethclient.C
 	}
 }

-func (api *ExecutionProxyBackend) BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error) {
-	block, err := api.client.BlockByNumber(ctx, number)
+func (api *ExecutionProxyBackend) GetBlockByNumber(ctx context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error) {
+	var result map[string]interface{}
+	err := api.client.Client().Call(&result, "eth_getBlockByNumber", number, fullTx)
 	if err != nil {
 		return nil, err
 	}
 	if !api.con.Leader(ctx) {
 		return nil, ErrNotLeader
 	}
-	return block, nil
+	return result, nil
 }
--- a/op-conductor/rpc/node_admin_proxy.go
+++ b/op-conductor/rpc/node_admin_proxy.go
+package rpc
+
+import (
+	"context"
+
+	"github.com/ethereum/go-ethereum/log"
+
+	"github.com/ethereum-optimism/optimism/op-service/sources"
+)
+
+var NodeAdminRPCNamespace = "admin"
+
+// NodeAdminProxyAPI implements a node admin rpc proxy with a leadership check to make sure only leader returns the result.
+type NodeAdminProxyBackend struct {
+	log    log.Logger
+	con    conductor
+	client *sources.RollupClient
+}
+
+var _ NodeAdminProxyAPI = (*NodeAdminProxyBackend)(nil)
+
+// NewNodeAdminProxyBackend creates a new NodeAdminProxyBackend instance.
+func NewNodeAdminProxyBackend(log log.Logger, con conductor, client *sources.RollupClient) NodeAdminProxyAPI {
+	return &NodeAdminProxyBackend{
+		log:    log,
+		con:    con,
+		client: client,
+	}
+}
+
+func (api *NodeAdminProxyBackend) SequencerActive(ctx context.Context) (bool, error) {
+	active, err := api.client.SequencerActive(ctx)
+	if err != nil {
+		return false, err
+	}
+	if !api.con.Leader(ctx) {
+		return false, ErrNotLeader
+	}
+	return active, err
+}
--- a/op-conductor/rpc/node_proxy.go
+++ b/op-conductor/rpc/node_proxy.go
@@ -3,9 +3,11 @@ package rpc
 import (
 	"context"

+	"github.com/ethereum/go-ethereum/log"
+
+	"github.com/ethereum-optimism/optimism/op-node/rollup"
 	"github.com/ethereum-optimism/optimism/op-service/eth"
 	"github.com/ethereum-optimism/optimism/op-service/sources"
-	"github.com/ethereum/go-ethereum/log"
 )

 var NodeRPCNamespace = "optimism"
@@ -49,13 +51,13 @@ func (api *NodeProxyBackend) OutputAtBlock(ctx context.Context, blockNum uint64)
 	return output, nil
 }

-func (api *NodeProxyBackend) SequencerActive(ctx context.Context) (bool, error) {
-	active, err := api.client.SequencerActive(ctx)
+func (api *NodeProxyBackend) RollupConfig(ctx context.Context) (*rollup.Config, error) {
+	config, err := api.client.RollupConfig(ctx)
 	if err != nil {
-		return false, err
+		return nil, err
 	}
 	if !api.con.Leader(ctx) {
-		return false, ErrNotLeader
+		return nil, ErrNotLeader
 	}
-	return active, err
+	return config, nil
 }
--- a/op-e2e/sequencer_failover_setup.go
+++ b/op-e2e/sequencer_failover_setup.go
--- a/op-e2e/sequencer_failover_test.go
+++ b/op-e2e/sequencer_failover_test.go
@@ -9,7 +9,6 @@ import (
 // [Category: Initial Setup]
 // In this test, we test that we can successfully setup a working cluster.
 func TestSequencerFailover_SetupCluster(t *testing.T) {
-	t.Skip("temporarily disable due to flakiness for now")
 	sys, conductors := setupSequencerFailoverTest(t)
 	defer sys.Close()