Commit 5e0dc463 authored by Francis Li's avatar Francis Li Committed by GitHub

[op-conductor] fix flaky e2e tests (#9144)

* op-conductor: fix ha test flakiness

* 3rd try

* 4th try

* 1st try with everything ready

* 2nd try

* 3rd try

* 4th try
parent 73f92f90
...@@ -115,9 +115,10 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*Config, error) { ...@@ -115,9 +115,10 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*Config, error) {
ExecutionRPC: ctx.String(flags.ExecutionRPC.Name), ExecutionRPC: ctx.String(flags.ExecutionRPC.Name),
Paused: ctx.Bool(flags.Paused.Name), Paused: ctx.Bool(flags.Paused.Name),
HealthCheck: HealthCheckConfig{ HealthCheck: HealthCheckConfig{
Interval: ctx.Uint64(flags.HealthCheckInterval.Name), Interval: ctx.Uint64(flags.HealthCheckInterval.Name),
SafeInterval: ctx.Uint64(flags.HealthCheckSafeInterval.Name), UnsafeInterval: ctx.Uint64(flags.HealthCheckUnsafeInterval.Name),
MinPeerCount: ctx.Uint64(flags.HealthCheckMinPeerCount.Name), SafeInterval: ctx.Uint64(flags.HealthCheckSafeInterval.Name),
MinPeerCount: ctx.Uint64(flags.HealthCheckMinPeerCount.Name),
}, },
RollupCfg: *rollupCfg, RollupCfg: *rollupCfg,
RPCEnableProxy: ctx.Bool(flags.RPCEnableProxy.Name), RPCEnableProxy: ctx.Bool(flags.RPCEnableProxy.Name),
...@@ -133,6 +134,9 @@ type HealthCheckConfig struct { ...@@ -133,6 +134,9 @@ type HealthCheckConfig struct {
// Interval is the interval (in seconds) to check the health of the sequencer. // Interval is the interval (in seconds) to check the health of the sequencer.
Interval uint64 Interval uint64
// UnsafeInterval is the interval allowed between unsafe head and now in seconds.
UnsafeInterval uint64
// SafeInterval is the interval between safe head progression measured in seconds. // SafeInterval is the interval between safe head progression measured in seconds.
SafeInterval uint64 SafeInterval uint64
......
...@@ -175,6 +175,7 @@ func (c *OpConductor) initHealthMonitor(ctx context.Context) error { ...@@ -175,6 +175,7 @@ func (c *OpConductor) initHealthMonitor(ctx context.Context) error {
c.hmon = health.NewSequencerHealthMonitor( c.hmon = health.NewSequencerHealthMonitor(
c.log, c.log,
c.cfg.HealthCheck.Interval, c.cfg.HealthCheck.Interval,
c.cfg.HealthCheck.UnsafeInterval,
c.cfg.HealthCheck.SafeInterval, c.cfg.HealthCheck.SafeInterval,
c.cfg.HealthCheck.MinPeerCount, c.cfg.HealthCheck.MinPeerCount,
&c.cfg.RollupCfg, &c.cfg.RollupCfg,
...@@ -220,6 +221,12 @@ func (oc *OpConductor) initRPCServer(ctx context.Context) error { ...@@ -220,6 +221,12 @@ func (oc *OpConductor) initRPCServer(ctx context.Context) error {
Namespace: conductorrpc.NodeRPCNamespace, Namespace: conductorrpc.NodeRPCNamespace,
Service: nodeProxy, Service: nodeProxy,
}) })
nodeAdminProxy := conductorrpc.NewNodeAdminProxyBackend(oc.log, oc, nodeClient)
server.AddAPI(rpc.API{
Namespace: conductorrpc.NodeAdminRPCNamespace,
Service: nodeAdminProxy,
})
} }
oc.rpcServer = server oc.rpcServer = server
...@@ -572,7 +579,6 @@ func (oc *OpConductor) startSequencer() error { ...@@ -572,7 +579,6 @@ func (oc *OpConductor) startSequencer() error {
return errors.Wrap(err, "failed to get latest unsafe block from EL during startSequencer phase") return errors.Wrap(err, "failed to get latest unsafe block from EL during startSequencer phase")
} }
//if unsafeInCons.BlockHash != unsafeInNode.Hash() {
if unsafeInCons.ExecutionPayload.BlockHash != unsafeInNode.Hash() { if unsafeInCons.ExecutionPayload.BlockHash != unsafeInNode.Hash() {
oc.log.Warn( oc.log.Warn(
"latest unsafe block in consensus is not the same as the one in op-node", "latest unsafe block in consensus is not the same as the one in op-node",
......
...@@ -36,9 +36,10 @@ func mockConfig(t *testing.T) Config { ...@@ -36,9 +36,10 @@ func mockConfig(t *testing.T) Config {
ExecutionRPC: "http://geth:8545", ExecutionRPC: "http://geth:8545",
Paused: false, Paused: false,
HealthCheck: HealthCheckConfig{ HealthCheck: HealthCheckConfig{
Interval: 1, Interval: 1,
SafeInterval: 5, UnsafeInterval: 3,
MinPeerCount: 1, SafeInterval: 5,
MinPeerCount: 1,
}, },
RollupCfg: rollup.Config{ RollupCfg: rollup.Config{
Genesis: rollup.Genesis{ Genesis: rollup.Genesis{
......
...@@ -53,6 +53,11 @@ var ( ...@@ -53,6 +53,11 @@ var (
Usage: "Interval between health checks", Usage: "Interval between health checks",
EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "HEALTHCHECK_INTERVAL"), EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "HEALTHCHECK_INTERVAL"),
} }
HealthCheckUnsafeInterval = &cli.Uint64Flag{
Name: "healthcheck.unsafe-interval",
Usage: "Interval allowed between unsafe head and now measured in seconds",
EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "HEALTHCHECK_UNSAFE_INTERVAL"),
}
HealthCheckSafeInterval = &cli.Uint64Flag{ HealthCheckSafeInterval = &cli.Uint64Flag{
Name: "healthcheck.safe-interval", Name: "healthcheck.safe-interval",
Usage: "Interval between safe head progression measured in seconds", Usage: "Interval between safe head progression measured in seconds",
......
...@@ -28,13 +28,14 @@ type HealthMonitor interface { ...@@ -28,13 +28,14 @@ type HealthMonitor interface {
// interval is the interval between health checks measured in seconds. // interval is the interval between health checks measured in seconds.
// safeInterval is the interval between safe head progress measured in seconds. // safeInterval is the interval between safe head progress measured in seconds.
// minPeerCount is the minimum number of peers required for the sequencer to be healthy. // minPeerCount is the minimum number of peers required for the sequencer to be healthy.
func NewSequencerHealthMonitor(log log.Logger, interval, safeInterval, minPeerCount uint64, rollupCfg *rollup.Config, node dial.RollupClientInterface, p2p p2p.API) HealthMonitor { func NewSequencerHealthMonitor(log log.Logger, interval, unsafeInterval, safeInterval, minPeerCount uint64, rollupCfg *rollup.Config, node dial.RollupClientInterface, p2p p2p.API) HealthMonitor {
return &SequencerHealthMonitor{ return &SequencerHealthMonitor{
log: log, log: log,
done: make(chan struct{}), done: make(chan struct{}),
interval: interval, interval: interval,
healthUpdateCh: make(chan bool), healthUpdateCh: make(chan bool),
rollupCfg: rollupCfg, rollupCfg: rollupCfg,
unsafeInterval: unsafeInterval,
safeInterval: safeInterval, safeInterval: safeInterval,
minPeerCount: minPeerCount, minPeerCount: minPeerCount,
node: node, node: node,
...@@ -48,11 +49,14 @@ type SequencerHealthMonitor struct { ...@@ -48,11 +49,14 @@ type SequencerHealthMonitor struct {
done chan struct{} done chan struct{}
wg sync.WaitGroup wg sync.WaitGroup
rollupCfg *rollup.Config rollupCfg *rollup.Config
safeInterval uint64 unsafeInterval uint64
minPeerCount uint64 safeInterval uint64
interval uint64 minPeerCount uint64
healthUpdateCh chan bool interval uint64
healthUpdateCh chan bool
lastSeenUnsafeNum uint64
lastSeenUnsafeTime uint64
node dial.RollupClientInterface node dial.RollupClientInterface
p2p p2p.API p2p p2p.API
...@@ -104,8 +108,9 @@ func (hm *SequencerHealthMonitor) loop() { ...@@ -104,8 +108,9 @@ func (hm *SequencerHealthMonitor) loop() {
// healthCheck checks the health of the sequencer by 3 criteria: // healthCheck checks the health of the sequencer by 3 criteria:
// 1. unsafe head is progressing per block time // 1. unsafe head is progressing per block time
// 2. safe head is progressing every configured batch submission interval // 2. unsafe head is not too far behind now (measured by unsafeInterval)
// 3. peer count is above the configured minimum // 3. safe head is progressing every configured batch submission interval
// 4. peer count is above the configured minimum
func (hm *SequencerHealthMonitor) healthCheck() bool { func (hm *SequencerHealthMonitor) healthCheck() bool {
ctx := context.Background() ctx := context.Background()
status, err := hm.node.SyncStatus(ctx) status, err := hm.node.SyncStatus(ctx)
...@@ -115,14 +120,48 @@ func (hm *SequencerHealthMonitor) healthCheck() bool { ...@@ -115,14 +120,48 @@ func (hm *SequencerHealthMonitor) healthCheck() bool {
} }
now := uint64(time.Now().Unix()) now := uint64(time.Now().Unix())
// allow at most one block drift for unsafe head
if now-status.UnsafeL2.Time > hm.interval+hm.rollupCfg.BlockTime { if hm.lastSeenUnsafeNum != 0 {
hm.log.Error("unsafe head is not progressing", "lastSeenUnsafeBlock", status.UnsafeL2) diff := now - hm.lastSeenUnsafeTime
// how many blocks do we expect to see, minus 1 to account for edge case with respect to time.
// for example, if diff = 2.001s and block time = 2s, expecting to see 1 block could potentially cause sequencer to be considered unhealthy.
blocks := diff/hm.rollupCfg.BlockTime - 1
if diff > hm.rollupCfg.BlockTime && blocks > status.UnsafeL2.Number-hm.lastSeenUnsafeNum {
hm.log.Error(
"unsafe head is not progressing as expected",
"now", now,
"unsafe_head_num", status.UnsafeL2.Number,
"last_seen_unsafe_num", hm.lastSeenUnsafeNum,
"last_seen_unsafe_time", hm.lastSeenUnsafeTime,
"unsafe_interval", hm.unsafeInterval,
)
return false
}
}
if status.UnsafeL2.Number > hm.lastSeenUnsafeNum {
hm.lastSeenUnsafeNum = status.UnsafeL2.Number
hm.lastSeenUnsafeTime = now
}
if now-status.UnsafeL2.Time > hm.unsafeInterval {
hm.log.Error(
"unsafe head is not progressing as expected",
"now", now,
"unsafe_head_num", status.UnsafeL2.Number,
"unsafe_head_time", status.UnsafeL2.Time,
"unsafe_interval", hm.unsafeInterval,
)
return false return false
} }
if now-status.SafeL2.Time > hm.safeInterval { if now-status.SafeL2.Time > hm.safeInterval {
hm.log.Error("safe head is not progressing", "safe_head_time", status.SafeL2.Time, "now", now) hm.log.Error(
"safe head is not progressing as expected",
"now", now,
"safe_head_num", status.SafeL2.Number,
"safe_head_time", status.SafeL2.Time,
"safe_interval", hm.safeInterval,
)
return false return false
} }
......
...@@ -26,14 +26,15 @@ const ( ...@@ -26,14 +26,15 @@ const (
type HealthMonitorTestSuite struct { type HealthMonitorTestSuite struct {
suite.Suite suite.Suite
log log.Logger log log.Logger
rc *testutils.MockRollupClient rc *testutils.MockRollupClient
pc *p2pMocks.API pc *p2pMocks.API
interval uint64 interval uint64
safeInterval uint64 unsafeInterval uint64
minPeerCount uint64 safeInterval uint64
rollupCfg *rollup.Config minPeerCount uint64
monitor HealthMonitor rollupCfg *rollup.Config
monitor HealthMonitor
} }
func (s *HealthMonitorTestSuite) SetupSuite() { func (s *HealthMonitorTestSuite) SetupSuite() {
...@@ -41,6 +42,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() { ...@@ -41,6 +42,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() {
s.rc = &testutils.MockRollupClient{} s.rc = &testutils.MockRollupClient{}
s.pc = &p2pMocks.API{} s.pc = &p2pMocks.API{}
s.interval = 1 s.interval = 1
s.unsafeInterval = 3
s.safeInterval = 5 s.safeInterval = 5
s.minPeerCount = minPeerCount s.minPeerCount = minPeerCount
s.rollupCfg = &rollup.Config{ s.rollupCfg = &rollup.Config{
...@@ -49,7 +51,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() { ...@@ -49,7 +51,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() {
} }
func (s *HealthMonitorTestSuite) SetupTest() { func (s *HealthMonitorTestSuite) SetupTest() {
s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc) s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.unsafeInterval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc)
err := s.monitor.Start() err := s.monitor.Start()
s.NoError(err) s.NoError(err)
} }
...@@ -90,10 +92,12 @@ func (s *HealthMonitorTestSuite) TestUnhealthyUnsafeHeadNotProgressing() { ...@@ -90,10 +92,12 @@ func (s *HealthMonitorTestSuite) TestUnhealthyUnsafeHeadNotProgressing() {
now := uint64(time.Now().Unix()) now := uint64(time.Now().Unix())
ss1 := &eth.SyncStatus{ ss1 := &eth.SyncStatus{
UnsafeL2: eth.L2BlockRef{ UnsafeL2: eth.L2BlockRef{
Time: now - 1, Number: 5,
Time: now - 1,
}, },
SafeL2: eth.L2BlockRef{ SafeL2: eth.L2BlockRef{
Time: now - 2, Number: 1,
Time: now - 2,
}, },
} }
s.rc.ExpectSyncStatus(ss1, nil) s.rc.ExpectSyncStatus(ss1, nil)
......
...@@ -3,15 +3,14 @@ package rpc ...@@ -3,15 +3,14 @@ package rpc
import ( import (
"context" "context"
"errors" "errors"
"math/big"
"github.com/ethereum/go-ethereum/rpc"
"github.com/ethereum-optimism/optimism/op-node/rollup"
"github.com/ethereum-optimism/optimism/op-service/eth" "github.com/ethereum-optimism/optimism/op-service/eth"
"github.com/ethereum/go-ethereum/core/types"
) )
var ( var ErrNotLeader = errors.New("refusing to proxy request to non-leader sequencer")
ErrNotLeader = errors.New("refusing to proxy request to non-leader sequencer")
)
type ServerInfo struct { type ServerInfo struct {
ID string `json:"id"` ID string `json:"id"`
...@@ -53,13 +52,19 @@ type API interface { ...@@ -53,13 +52,19 @@ type API interface {
// ExecutionProxyAPI defines the methods proxied to the execution rpc backend // ExecutionProxyAPI defines the methods proxied to the execution rpc backend
// This should include all methods that are called by op-batcher or op-proposer // This should include all methods that are called by op-batcher or op-proposer
type ExecutionProxyAPI interface { type ExecutionProxyAPI interface {
BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error) GetBlockByNumber(ctx context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error)
} }
// NodeProxyAPI defines the methods proxied to the node rpc backend // NodeProxyAPI defines the methods proxied to the node rpc backend
// This should include all methods that are called by op-batcher or op-proposer // This should include all methods that are called by op-batcher or op-proposer
type NodeProxyAPI interface { type NodeProxyAPI interface {
OutputAtBlock(ctx context.Context, blockNum uint64) (*eth.OutputResponse, error) OutputAtBlock(ctx context.Context, blockNum uint64) (*eth.OutputResponse, error)
SequencerActive(ctx context.Context) (bool, error)
SyncStatus(ctx context.Context) (*eth.SyncStatus, error) SyncStatus(ctx context.Context) (*eth.SyncStatus, error)
RollupConfig(ctx context.Context) (*rollup.Config, error)
}
// NodeProxyAPI defines the methods proxied to the node rpc backend
// This should include all methods that are called by op-batcher or op-proposer
type NodeAdminProxyAPI interface {
SequencerActive(ctx context.Context) (bool, error)
} }
...@@ -2,11 +2,10 @@ package rpc ...@@ -2,11 +2,10 @@ package rpc
import ( import (
"context" "context"
"math/big"
"github.com/ethereum/go-ethereum/core/types"
"github.com/ethereum/go-ethereum/ethclient" "github.com/ethereum/go-ethereum/ethclient"
"github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/rpc"
) )
var ExecutionRPCNamespace = "eth" var ExecutionRPCNamespace = "eth"
...@@ -28,13 +27,14 @@ func NewExecutionProxyBackend(log log.Logger, con conductor, client *ethclient.C ...@@ -28,13 +27,14 @@ func NewExecutionProxyBackend(log log.Logger, con conductor, client *ethclient.C
} }
} }
func (api *ExecutionProxyBackend) BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error) { func (api *ExecutionProxyBackend) GetBlockByNumber(ctx context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error) {
block, err := api.client.BlockByNumber(ctx, number) var result map[string]interface{}
err := api.client.Client().Call(&result, "eth_getBlockByNumber", number, fullTx)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if !api.con.Leader(ctx) { if !api.con.Leader(ctx) {
return nil, ErrNotLeader return nil, ErrNotLeader
} }
return block, nil return result, nil
} }
package rpc
import (
"context"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum-optimism/optimism/op-service/sources"
)
var NodeAdminRPCNamespace = "admin"
// NodeAdminProxyAPI implements a node admin rpc proxy with a leadership check to make sure only leader returns the result.
type NodeAdminProxyBackend struct {
log log.Logger
con conductor
client *sources.RollupClient
}
var _ NodeAdminProxyAPI = (*NodeAdminProxyBackend)(nil)
// NewNodeAdminProxyBackend creates a new NodeAdminProxyBackend instance.
func NewNodeAdminProxyBackend(log log.Logger, con conductor, client *sources.RollupClient) NodeAdminProxyAPI {
return &NodeAdminProxyBackend{
log: log,
con: con,
client: client,
}
}
func (api *NodeAdminProxyBackend) SequencerActive(ctx context.Context) (bool, error) {
active, err := api.client.SequencerActive(ctx)
if err != nil {
return false, err
}
if !api.con.Leader(ctx) {
return false, ErrNotLeader
}
return active, err
}
...@@ -3,9 +3,11 @@ package rpc ...@@ -3,9 +3,11 @@ package rpc
import ( import (
"context" "context"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum-optimism/optimism/op-node/rollup"
"github.com/ethereum-optimism/optimism/op-service/eth" "github.com/ethereum-optimism/optimism/op-service/eth"
"github.com/ethereum-optimism/optimism/op-service/sources" "github.com/ethereum-optimism/optimism/op-service/sources"
"github.com/ethereum/go-ethereum/log"
) )
var NodeRPCNamespace = "optimism" var NodeRPCNamespace = "optimism"
...@@ -49,13 +51,13 @@ func (api *NodeProxyBackend) OutputAtBlock(ctx context.Context, blockNum uint64) ...@@ -49,13 +51,13 @@ func (api *NodeProxyBackend) OutputAtBlock(ctx context.Context, blockNum uint64)
return output, nil return output, nil
} }
func (api *NodeProxyBackend) SequencerActive(ctx context.Context) (bool, error) { func (api *NodeProxyBackend) RollupConfig(ctx context.Context) (*rollup.Config, error) {
active, err := api.client.SequencerActive(ctx) config, err := api.client.RollupConfig(ctx)
if err != nil { if err != nil {
return false, err return nil, err
} }
if !api.con.Leader(ctx) { if !api.con.Leader(ctx) {
return false, ErrNotLeader return nil, ErrNotLeader
} }
return active, err return config, nil
} }
This diff is collapsed.
...@@ -9,7 +9,6 @@ import ( ...@@ -9,7 +9,6 @@ import (
// [Category: Initial Setup] // [Category: Initial Setup]
// In this test, we test that we can successfully setup a working cluster. // In this test, we test that we can successfully setup a working cluster.
func TestSequencerFailover_SetupCluster(t *testing.T) { func TestSequencerFailover_SetupCluster(t *testing.T) {
t.Skip("temporarily disable due to flakiness for now")
sys, conductors := setupSequencerFailoverTest(t) sys, conductors := setupSequencerFailoverTest(t)
defer sys.Close() defer sys.Close()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment