Commit 5e0dc463 authored by Francis Li's avatar Francis Li Committed by GitHub

[op-conductor] fix flaky e2e tests (#9144)

* op-conductor: fix ha test flakiness

* 3rd try

* 4th try

* 1st try with everything ready

* 2nd try

* 3rd try

* 4th try
parent 73f92f90
......@@ -115,9 +115,10 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*Config, error) {
ExecutionRPC: ctx.String(flags.ExecutionRPC.Name),
Paused: ctx.Bool(flags.Paused.Name),
HealthCheck: HealthCheckConfig{
Interval: ctx.Uint64(flags.HealthCheckInterval.Name),
SafeInterval: ctx.Uint64(flags.HealthCheckSafeInterval.Name),
MinPeerCount: ctx.Uint64(flags.HealthCheckMinPeerCount.Name),
Interval: ctx.Uint64(flags.HealthCheckInterval.Name),
UnsafeInterval: ctx.Uint64(flags.HealthCheckUnsafeInterval.Name),
SafeInterval: ctx.Uint64(flags.HealthCheckSafeInterval.Name),
MinPeerCount: ctx.Uint64(flags.HealthCheckMinPeerCount.Name),
},
RollupCfg: *rollupCfg,
RPCEnableProxy: ctx.Bool(flags.RPCEnableProxy.Name),
......@@ -133,6 +134,9 @@ type HealthCheckConfig struct {
// Interval is the interval (in seconds) to check the health of the sequencer.
Interval uint64
// UnsafeInterval is the interval allowed between unsafe head and now in seconds.
UnsafeInterval uint64
// SafeInterval is the interval between safe head progression measured in seconds.
SafeInterval uint64
......
......@@ -175,6 +175,7 @@ func (c *OpConductor) initHealthMonitor(ctx context.Context) error {
c.hmon = health.NewSequencerHealthMonitor(
c.log,
c.cfg.HealthCheck.Interval,
c.cfg.HealthCheck.UnsafeInterval,
c.cfg.HealthCheck.SafeInterval,
c.cfg.HealthCheck.MinPeerCount,
&c.cfg.RollupCfg,
......@@ -220,6 +221,12 @@ func (oc *OpConductor) initRPCServer(ctx context.Context) error {
Namespace: conductorrpc.NodeRPCNamespace,
Service: nodeProxy,
})
nodeAdminProxy := conductorrpc.NewNodeAdminProxyBackend(oc.log, oc, nodeClient)
server.AddAPI(rpc.API{
Namespace: conductorrpc.NodeAdminRPCNamespace,
Service: nodeAdminProxy,
})
}
oc.rpcServer = server
......@@ -572,7 +579,6 @@ func (oc *OpConductor) startSequencer() error {
return errors.Wrap(err, "failed to get latest unsafe block from EL during startSequencer phase")
}
//if unsafeInCons.BlockHash != unsafeInNode.Hash() {
if unsafeInCons.ExecutionPayload.BlockHash != unsafeInNode.Hash() {
oc.log.Warn(
"latest unsafe block in consensus is not the same as the one in op-node",
......
......@@ -36,9 +36,10 @@ func mockConfig(t *testing.T) Config {
ExecutionRPC: "http://geth:8545",
Paused: false,
HealthCheck: HealthCheckConfig{
Interval: 1,
SafeInterval: 5,
MinPeerCount: 1,
Interval: 1,
UnsafeInterval: 3,
SafeInterval: 5,
MinPeerCount: 1,
},
RollupCfg: rollup.Config{
Genesis: rollup.Genesis{
......
......@@ -53,6 +53,11 @@ var (
Usage: "Interval between health checks",
EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "HEALTHCHECK_INTERVAL"),
}
HealthCheckUnsafeInterval = &cli.Uint64Flag{
Name: "healthcheck.unsafe-interval",
Usage: "Interval allowed between unsafe head and now measured in seconds",
EnvVars: opservice.PrefixEnvVar(EnvVarPrefix, "HEALTHCHECK_UNSAFE_INTERVAL"),
}
HealthCheckSafeInterval = &cli.Uint64Flag{
Name: "healthcheck.safe-interval",
Usage: "Interval between safe head progression measured in seconds",
......
......@@ -28,13 +28,14 @@ type HealthMonitor interface {
// interval is the interval between health checks measured in seconds.
// safeInterval is the interval between safe head progress measured in seconds.
// minPeerCount is the minimum number of peers required for the sequencer to be healthy.
func NewSequencerHealthMonitor(log log.Logger, interval, safeInterval, minPeerCount uint64, rollupCfg *rollup.Config, node dial.RollupClientInterface, p2p p2p.API) HealthMonitor {
func NewSequencerHealthMonitor(log log.Logger, interval, unsafeInterval, safeInterval, minPeerCount uint64, rollupCfg *rollup.Config, node dial.RollupClientInterface, p2p p2p.API) HealthMonitor {
return &SequencerHealthMonitor{
log: log,
done: make(chan struct{}),
interval: interval,
healthUpdateCh: make(chan bool),
rollupCfg: rollupCfg,
unsafeInterval: unsafeInterval,
safeInterval: safeInterval,
minPeerCount: minPeerCount,
node: node,
......@@ -48,11 +49,14 @@ type SequencerHealthMonitor struct {
done chan struct{}
wg sync.WaitGroup
rollupCfg *rollup.Config
safeInterval uint64
minPeerCount uint64
interval uint64
healthUpdateCh chan bool
rollupCfg *rollup.Config
unsafeInterval uint64
safeInterval uint64
minPeerCount uint64
interval uint64
healthUpdateCh chan bool
lastSeenUnsafeNum uint64
lastSeenUnsafeTime uint64
node dial.RollupClientInterface
p2p p2p.API
......@@ -104,8 +108,9 @@ func (hm *SequencerHealthMonitor) loop() {
// healthCheck checks the health of the sequencer by 3 criteria:
// 1. unsafe head is progressing per block time
// 2. safe head is progressing every configured batch submission interval
// 3. peer count is above the configured minimum
// 2. unsafe head is not too far behind now (measured by unsafeInterval)
// 3. safe head is progressing every configured batch submission interval
// 4. peer count is above the configured minimum
func (hm *SequencerHealthMonitor) healthCheck() bool {
ctx := context.Background()
status, err := hm.node.SyncStatus(ctx)
......@@ -115,14 +120,48 @@ func (hm *SequencerHealthMonitor) healthCheck() bool {
}
now := uint64(time.Now().Unix())
// allow at most one block drift for unsafe head
if now-status.UnsafeL2.Time > hm.interval+hm.rollupCfg.BlockTime {
hm.log.Error("unsafe head is not progressing", "lastSeenUnsafeBlock", status.UnsafeL2)
if hm.lastSeenUnsafeNum != 0 {
diff := now - hm.lastSeenUnsafeTime
// how many blocks do we expect to see, minus 1 to account for edge case with respect to time.
// for example, if diff = 2.001s and block time = 2s, expecting to see 1 block could potentially cause sequencer to be considered unhealthy.
blocks := diff/hm.rollupCfg.BlockTime - 1
if diff > hm.rollupCfg.BlockTime && blocks > status.UnsafeL2.Number-hm.lastSeenUnsafeNum {
hm.log.Error(
"unsafe head is not progressing as expected",
"now", now,
"unsafe_head_num", status.UnsafeL2.Number,
"last_seen_unsafe_num", hm.lastSeenUnsafeNum,
"last_seen_unsafe_time", hm.lastSeenUnsafeTime,
"unsafe_interval", hm.unsafeInterval,
)
return false
}
}
if status.UnsafeL2.Number > hm.lastSeenUnsafeNum {
hm.lastSeenUnsafeNum = status.UnsafeL2.Number
hm.lastSeenUnsafeTime = now
}
if now-status.UnsafeL2.Time > hm.unsafeInterval {
hm.log.Error(
"unsafe head is not progressing as expected",
"now", now,
"unsafe_head_num", status.UnsafeL2.Number,
"unsafe_head_time", status.UnsafeL2.Time,
"unsafe_interval", hm.unsafeInterval,
)
return false
}
if now-status.SafeL2.Time > hm.safeInterval {
hm.log.Error("safe head is not progressing", "safe_head_time", status.SafeL2.Time, "now", now)
hm.log.Error(
"safe head is not progressing as expected",
"now", now,
"safe_head_num", status.SafeL2.Number,
"safe_head_time", status.SafeL2.Time,
"safe_interval", hm.safeInterval,
)
return false
}
......
......@@ -26,14 +26,15 @@ const (
type HealthMonitorTestSuite struct {
suite.Suite
log log.Logger
rc *testutils.MockRollupClient
pc *p2pMocks.API
interval uint64
safeInterval uint64
minPeerCount uint64
rollupCfg *rollup.Config
monitor HealthMonitor
log log.Logger
rc *testutils.MockRollupClient
pc *p2pMocks.API
interval uint64
unsafeInterval uint64
safeInterval uint64
minPeerCount uint64
rollupCfg *rollup.Config
monitor HealthMonitor
}
func (s *HealthMonitorTestSuite) SetupSuite() {
......@@ -41,6 +42,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() {
s.rc = &testutils.MockRollupClient{}
s.pc = &p2pMocks.API{}
s.interval = 1
s.unsafeInterval = 3
s.safeInterval = 5
s.minPeerCount = minPeerCount
s.rollupCfg = &rollup.Config{
......@@ -49,7 +51,7 @@ func (s *HealthMonitorTestSuite) SetupSuite() {
}
func (s *HealthMonitorTestSuite) SetupTest() {
s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc)
s.monitor = NewSequencerHealthMonitor(s.log, s.interval, s.unsafeInterval, s.safeInterval, s.minPeerCount, s.rollupCfg, s.rc, s.pc)
err := s.monitor.Start()
s.NoError(err)
}
......@@ -90,10 +92,12 @@ func (s *HealthMonitorTestSuite) TestUnhealthyUnsafeHeadNotProgressing() {
now := uint64(time.Now().Unix())
ss1 := &eth.SyncStatus{
UnsafeL2: eth.L2BlockRef{
Time: now - 1,
Number: 5,
Time: now - 1,
},
SafeL2: eth.L2BlockRef{
Time: now - 2,
Number: 1,
Time: now - 2,
},
}
s.rc.ExpectSyncStatus(ss1, nil)
......
......@@ -3,15 +3,14 @@ package rpc
import (
"context"
"errors"
"math/big"
"github.com/ethereum/go-ethereum/rpc"
"github.com/ethereum-optimism/optimism/op-node/rollup"
"github.com/ethereum-optimism/optimism/op-service/eth"
"github.com/ethereum/go-ethereum/core/types"
)
var (
ErrNotLeader = errors.New("refusing to proxy request to non-leader sequencer")
)
var ErrNotLeader = errors.New("refusing to proxy request to non-leader sequencer")
type ServerInfo struct {
ID string `json:"id"`
......@@ -53,13 +52,19 @@ type API interface {
// ExecutionProxyAPI defines the methods proxied to the execution rpc backend
// This should include all methods that are called by op-batcher or op-proposer
type ExecutionProxyAPI interface {
BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error)
GetBlockByNumber(ctx context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error)
}
// NodeProxyAPI defines the methods proxied to the node rpc backend
// This should include all methods that are called by op-batcher or op-proposer
type NodeProxyAPI interface {
OutputAtBlock(ctx context.Context, blockNum uint64) (*eth.OutputResponse, error)
SequencerActive(ctx context.Context) (bool, error)
SyncStatus(ctx context.Context) (*eth.SyncStatus, error)
RollupConfig(ctx context.Context) (*rollup.Config, error)
}
// NodeProxyAPI defines the methods proxied to the node rpc backend
// This should include all methods that are called by op-batcher or op-proposer
type NodeAdminProxyAPI interface {
SequencerActive(ctx context.Context) (bool, error)
}
......@@ -2,11 +2,10 @@ package rpc
import (
"context"
"math/big"
"github.com/ethereum/go-ethereum/core/types"
"github.com/ethereum/go-ethereum/ethclient"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/rpc"
)
var ExecutionRPCNamespace = "eth"
......@@ -28,13 +27,14 @@ func NewExecutionProxyBackend(log log.Logger, con conductor, client *ethclient.C
}
}
func (api *ExecutionProxyBackend) BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error) {
block, err := api.client.BlockByNumber(ctx, number)
func (api *ExecutionProxyBackend) GetBlockByNumber(ctx context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error) {
var result map[string]interface{}
err := api.client.Client().Call(&result, "eth_getBlockByNumber", number, fullTx)
if err != nil {
return nil, err
}
if !api.con.Leader(ctx) {
return nil, ErrNotLeader
}
return block, nil
return result, nil
}
package rpc
import (
"context"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum-optimism/optimism/op-service/sources"
)
var NodeAdminRPCNamespace = "admin"
// NodeAdminProxyAPI implements a node admin rpc proxy with a leadership check to make sure only leader returns the result.
type NodeAdminProxyBackend struct {
log log.Logger
con conductor
client *sources.RollupClient
}
var _ NodeAdminProxyAPI = (*NodeAdminProxyBackend)(nil)
// NewNodeAdminProxyBackend creates a new NodeAdminProxyBackend instance.
func NewNodeAdminProxyBackend(log log.Logger, con conductor, client *sources.RollupClient) NodeAdminProxyAPI {
return &NodeAdminProxyBackend{
log: log,
con: con,
client: client,
}
}
func (api *NodeAdminProxyBackend) SequencerActive(ctx context.Context) (bool, error) {
active, err := api.client.SequencerActive(ctx)
if err != nil {
return false, err
}
if !api.con.Leader(ctx) {
return false, ErrNotLeader
}
return active, err
}
......@@ -3,9 +3,11 @@ package rpc
import (
"context"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum-optimism/optimism/op-node/rollup"
"github.com/ethereum-optimism/optimism/op-service/eth"
"github.com/ethereum-optimism/optimism/op-service/sources"
"github.com/ethereum/go-ethereum/log"
)
var NodeRPCNamespace = "optimism"
......@@ -49,13 +51,13 @@ func (api *NodeProxyBackend) OutputAtBlock(ctx context.Context, blockNum uint64)
return output, nil
}
func (api *NodeProxyBackend) SequencerActive(ctx context.Context) (bool, error) {
active, err := api.client.SequencerActive(ctx)
func (api *NodeProxyBackend) RollupConfig(ctx context.Context) (*rollup.Config, error) {
config, err := api.client.RollupConfig(ctx)
if err != nil {
return false, err
return nil, err
}
if !api.con.Leader(ctx) {
return false, ErrNotLeader
return nil, ErrNotLeader
}
return active, err
return config, nil
}
This diff is collapsed.
......@@ -9,7 +9,6 @@ import (
// [Category: Initial Setup]
// In this test, we test that we can successfully setup a working cluster.
func TestSequencerFailover_SetupCluster(t *testing.T) {
t.Skip("temporarily disable due to flakiness for now")
sys, conductors := setupSequencerFailoverTest(t)
defer sys.Close()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment