Commit 571bd368 authored by mergify[bot], committed by GitHub

Merge branch 'develop' into felipe/fix-cache-for-eth2-block-tags

parents 9a50468b 6188ff50
......@@ -1504,7 +1504,7 @@ workflows:
type: approval
filters:
tags:
only: /^op-[a-z0-9\-]*\/v.*/
only: /^(proxyd|op-[a-z0-9\-]*)\/v.*/
branches:
ignore: /.*/
- docker-release:
......@@ -1586,7 +1586,7 @@ workflows:
- oplabs-gcr-release
requires:
- hold
- docker-build:
- docker-release:
name: proxyd-docker-release
filters:
tags:
......
......@@ -13,13 +13,13 @@
/packages/core-utils @ethereum-optimism/legacy-reviewers
/packages/data-transport-layer @ethereum-optimism/legacy-reviewers
/packages/chain-mon @smartcontracts
/packages/fault-detector @ethereum-optimism/legacy-reviewers
/packages/fault-detector @ethereum-optimism/devxpod
/packages/hardhat-deploy-config @ethereum-optimism/legacy-reviewers
/packages/message-relayer @ethereum-optimism/legacy-reviewers
/packages/migration-data @ethereum-optimism/legacy-reviewers
/packages/replica-healthcheck @ethereum-optimism/legacy-reviewers
/packages/sdk @ethereum-optimism/ecopod
/packages/atst @ethereum-optimism/ecopod
/packages/sdk @ethereum-optimism/devxpod
/packages/atst @ethereum-optimism/devxpod
# Bedrock codebases
/bedrock-devnet @ethereum-optimism/go-reviewers
......@@ -42,7 +42,7 @@
# Misc
/proxyd @ethereum-optimism/infra-reviewers
/indexer @ethereum-optimism/infra-reviewers
/indexer @ethereum-optimism/devxpod
/infra @ethereum-optimism/infra-reviewers
/specs @ethereum-optimism/contract-reviewers @ethereum-optimism/go-reviewers
/endpoint-monitor @ethereum-optimism/infra-reviewers
......@@ -103,6 +103,8 @@ type Metrics struct {
SequencerInconsistentL1Origin *EventMetrics
SequencerResets *EventMetrics
L1RequestDurationSeconds *prometheus.HistogramVec
SequencerBuildingDiffDurationSeconds prometheus.Histogram
SequencerBuildingDiffTotal prometheus.Counter
......@@ -378,6 +380,14 @@ func NewMetrics(procName string) *Metrics {
Help: "number of unverified execution payloads buffered in quarantine",
}),
L1RequestDurationSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
Namespace: ns,
Name: "l1_request_seconds",
Buckets: []float64{
.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
Help: "Histogram of L1 request time",
}, []string{"request"}),
SequencerBuildingDiffDurationSeconds: factory.NewHistogram(prometheus.HistogramOpts{
Namespace: ns,
Name: "sequencer_building_diff_seconds",
......@@ -589,6 +599,11 @@ func (m *Metrics) RecordBandwidth(ctx context.Context, bwc *libp2pmetrics.Bandwi
}
}
// RecordL1RequestTime tracks the amount of time the derivation pipeline spent waiting for L1 data requests.
func (m *Metrics) RecordL1RequestTime(method string, duration time.Duration) {
m.L1RequestDurationSeconds.WithLabelValues(method).Observe(float64(duration) / float64(time.Second))
}
// RecordSequencerBuildingDiffTime tracks the amount of time the sequencer was allowed from
// start to finish, incl. sealing, minus the block time.
// Ideally this is 0; realistically the sequencer scheduler may sometimes be busy with other jobs like syncing.
......
......@@ -30,6 +30,7 @@ type Metrics interface {
RecordL1ReorgDepth(d uint64)
EngineMetrics
L1FetcherMetrics
SequencerMetrics
}
......@@ -103,6 +104,7 @@ type AltSync interface {
// NewDriver composes an events handler that tracks L1 state, triggers L2 derivation, and optionally sequences new L2 blocks.
func NewDriver(driverCfg *Config, cfg *rollup.Config, l2 L2Chain, l1 L1Chain, altSync AltSync, network Network, log log.Logger, snapshotLog log.Logger, metrics Metrics) *Driver {
l1 = NewMeteredL1Fetcher(l1, metrics)
l1State := NewL1State(log, metrics)
sequencerConfDepth := NewConfDepth(driverCfg.SequencerConfDepth, l1State.L1Head, l1)
findL1Origin := NewL1OriginSelector(log, cfg, sequencerConfDepth)
......
package driver
import (
"context"
"time"
"github.com/ethereum-optimism/optimism/op-node/eth"
"github.com/ethereum-optimism/optimism/op-node/rollup/derive"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/types"
)
type L1FetcherMetrics interface {
RecordL1RequestTime(method string, duration time.Duration)
}
type MeteredL1Fetcher struct {
inner derive.L1Fetcher
metrics L1FetcherMetrics
now func() time.Time
}
func NewMeteredL1Fetcher(inner derive.L1Fetcher, metrics L1FetcherMetrics) *MeteredL1Fetcher {
return &MeteredL1Fetcher{
inner: inner,
metrics: metrics,
now: time.Now,
}
}
func (m *MeteredL1Fetcher) L1BlockRefByLabel(ctx context.Context, label eth.BlockLabel) (eth.L1BlockRef, error) {
defer m.recordTime("L1BlockRefByLabel")()
return m.inner.L1BlockRefByLabel(ctx, label)
}
func (m *MeteredL1Fetcher) L1BlockRefByNumber(ctx context.Context, num uint64) (eth.L1BlockRef, error) {
defer m.recordTime("L1BlockRefByNumber")()
return m.inner.L1BlockRefByNumber(ctx, num)
}
func (m *MeteredL1Fetcher) L1BlockRefByHash(ctx context.Context, hash common.Hash) (eth.L1BlockRef, error) {
defer m.recordTime("L1BlockRefByHash")()
return m.inner.L1BlockRefByHash(ctx, hash)
}
func (m *MeteredL1Fetcher) InfoByHash(ctx context.Context, hash common.Hash) (eth.BlockInfo, error) {
defer m.recordTime("InfoByHash")()
return m.inner.InfoByHash(ctx, hash)
}
func (m *MeteredL1Fetcher) InfoAndTxsByHash(ctx context.Context, hash common.Hash) (eth.BlockInfo, types.Transactions, error) {
defer m.recordTime("InfoAndTxsByHash")()
return m.inner.InfoAndTxsByHash(ctx, hash)
}
func (m *MeteredL1Fetcher) FetchReceipts(ctx context.Context, blockHash common.Hash) (eth.BlockInfo, types.Receipts, error) {
defer m.recordTime("FetchReceipts")()
return m.inner.FetchReceipts(ctx, blockHash)
}
var _ derive.L1Fetcher = (*MeteredL1Fetcher)(nil)
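// recordTime returns a closure that records the elapsed time since this call under the given
// method label; the wrapper methods above invoke it with defer so the duration is observed
// when the wrapped call returns.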
func (m *MeteredL1Fetcher) recordTime(method string) func() {
start := m.now()
return func() {
end := m.now()
m.metrics.RecordL1RequestTime(method, end.Sub(start))
}
}
package driver
import (
"context"
"errors"
"testing"
"time"
"github.com/ethereum-optimism/optimism/op-node/eth"
"github.com/ethereum-optimism/optimism/op-node/testutils"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/types"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)
func TestDurationRecorded(t *testing.T) {
num := uint64(1234)
hash := common.Hash{0xaa}
ref := eth.L1BlockRef{Number: num}
info := &testutils.MockBlockInfo{}
expectedErr := errors.New("test error")
tests := []struct {
method string
expect func(inner *testutils.MockL1Source)
call func(t *testing.T, fetcher *MeteredL1Fetcher, inner *testutils.MockL1Source)
}{
{
method: "L1BlockRefByLabel",
call: func(t *testing.T, fetcher *MeteredL1Fetcher, inner *testutils.MockL1Source) {
inner.ExpectL1BlockRefByLabel(eth.Finalized, ref, expectedErr)
result, err := fetcher.L1BlockRefByLabel(context.Background(), eth.Finalized)
require.Equal(t, ref, result)
require.Equal(t, expectedErr, err)
},
},
{
method: "L1BlockRefByNumber",
call: func(t *testing.T, fetcher *MeteredL1Fetcher, inner *testutils.MockL1Source) {
inner.ExpectL1BlockRefByNumber(num, ref, expectedErr)
result, err := fetcher.L1BlockRefByNumber(context.Background(), num)
require.Equal(t, ref, result)
require.Equal(t, expectedErr, err)
},
},
{
method: "L1BlockRefByHash",
call: func(t *testing.T, fetcher *MeteredL1Fetcher, inner *testutils.MockL1Source) {
inner.ExpectL1BlockRefByHash(hash, ref, expectedErr)
result, err := fetcher.L1BlockRefByHash(context.Background(), hash)
require.Equal(t, ref, result)
require.Equal(t, expectedErr, err)
},
},
{
method: "InfoByHash",
call: func(t *testing.T, fetcher *MeteredL1Fetcher, inner *testutils.MockL1Source) {
inner.ExpectInfoByHash(hash, info, expectedErr)
result, err := fetcher.InfoByHash(context.Background(), hash)
require.Equal(t, info, result)
require.Equal(t, expectedErr, err)
},
},
{
method: "InfoAndTxsByHash",
call: func(t *testing.T, fetcher *MeteredL1Fetcher, inner *testutils.MockL1Source) {
txs := types.Transactions{
&types.Transaction{},
}
inner.ExpectInfoAndTxsByHash(hash, info, txs, expectedErr)
actualInfo, actualTxs, err := fetcher.InfoAndTxsByHash(context.Background(), hash)
require.Equal(t, info, actualInfo)
require.Equal(t, txs, actualTxs)
require.Equal(t, expectedErr, err)
},
},
{
method: "FetchReceipts",
call: func(t *testing.T, fetcher *MeteredL1Fetcher, inner *testutils.MockL1Source) {
rcpts := types.Receipts{
&types.Receipt{},
}
inner.ExpectFetchReceipts(hash, info, rcpts, expectedErr)
actualInfo, actualRcpts, err := fetcher.FetchReceipts(context.Background(), hash)
require.Equal(t, info, actualInfo)
require.Equal(t, rcpts, actualRcpts)
require.Equal(t, expectedErr, err)
},
},
}
for _, test := range tests {
test := test
t.Run(test.method, func(t *testing.T) {
duration := 200 * time.Millisecond
fetcher, inner, metrics := createFetcher(duration)
defer inner.AssertExpectations(t)
defer metrics.AssertExpectations(t)
metrics.ExpectRecordRequestTime(test.method, duration)
test.call(t, fetcher, inner)
})
}
}
// createFetcher creates a MeteredL1Fetcher with a mock inner.
// The clock used to calculate the current time will advance by clockIncrement on each call, making it appear as if
// each request takes that amount of time to execute.
func createFetcher(clockIncrement time.Duration) (*MeteredL1Fetcher, *testutils.MockL1Source, *mockMetrics) {
inner := &testutils.MockL1Source{}
currTime := time.UnixMilli(1294812934000000)
clock := func() time.Time {
currTime = currTime.Add(clockIncrement)
return currTime
}
metrics := &mockMetrics{}
fetcher := MeteredL1Fetcher{
inner: inner,
metrics: metrics,
now: clock,
}
return &fetcher, inner, metrics
}
type mockMetrics struct {
mock.Mock
}
func (m *mockMetrics) RecordL1RequestTime(method string, duration time.Duration) {
m.MethodCalled("RecordL1RequestTime", method, duration)
}
func (m *mockMetrics) ExpectRecordRequestTime(method string, duration time.Duration) {
m.On("RecordL1RequestTime", method, duration).Once()
}
......@@ -6,7 +6,7 @@ DOCKER_REPO=$1
GIT_TAG=$2
GIT_SHA=$3
IMAGE_NAME=$(echo "$GIT_TAG" | grep -Eow '^op-[a-z0-9\-]*' || true)
IMAGE_NAME=$(echo "$GIT_TAG" | grep -Eow '^(proxyd|op-[a-z0-9\-]*)' || true)
if [ -z "$IMAGE_NAME" ]; then
echo "image name could not be parsed from git tag '$GIT_TAG'"
exit 1
......
......@@ -203,23 +203,22 @@ func NewConsensusPoller(bg *BackendGroup, opts ...ConsensusOpt) *ConsensusPoller
// UpdateBackend refreshes the consensus state of a single backend
func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) {
bs := cp.backendState[be]
if time.Now().Before(bs.bannedUntil) {
log.Debug("skipping backend banned", "backend", be.Name, "bannedUntil", bs.bannedUntil)
if cp.IsBanned(be) {
log.Debug("skipping backend banned", "backend", be.Name)
return
}
// if the backend is not online or not in a healthy state we'll only resume checking it after the ban expires
if !be.Online() || !be.IsHealthy() {
log.Warn("backend banned - not online or not healthy", "backend", be.Name, "bannedUntil", bs.bannedUntil)
bs.bannedUntil = time.Now().Add(cp.banPeriod)
log.Warn("backend banned - not online or not healthy", "backend", be.Name)
cp.Ban(be)
}
// if the backend is not in sync we'll check again after the ban expires
inSync, err := cp.isInSync(ctx, be)
if err != nil || !inSync {
log.Warn("backend banned - not in sync", "backend", be.Name, "bannedUntil", bs.bannedUntil)
bs.bannedUntil = time.Now().Add(cp.banPeriod)
log.Warn("backend banned - not in sync", "backend", be.Name)
cp.Ban(be)
}
// if the backend exhausted its rate limit we'll skip it for now
......@@ -246,7 +245,11 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) {
if changed {
RecordBackendLatestBlock(be, latestBlockNumber)
log.Debug("backend state updated", "name", be.Name, "state", bs)
log.Debug("backend state updated",
"name", be.Name,
"peerCount", peerCount,
"latestBlockNumber", latestBlockNumber,
"latestBlockHash", latestBlockHash)
}
}
......@@ -258,7 +261,7 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) {
currentConsensusBlockNumber := cp.GetConsensusBlockNumber()
for _, be := range cp.backendGroup.Backends {
peerCount, backendLatestBlockNumber, backendLatestBlockHash, lastUpdate := cp.getBackendState(be)
peerCount, backendLatestBlockNumber, backendLatestBlockHash, lastUpdate, _ := cp.getBackendState(be)
if !be.skipPeerCountCheck && peerCount < cp.minPeerCount {
continue
......@@ -306,10 +309,10 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) {
- with minimum peer count
- updated recently
*/
bs := cp.backendState[be]
notUpdated := bs.lastUpdate.Add(cp.maxUpdateThreshold).Before(time.Now())
isBanned := time.Now().Before(bs.bannedUntil)
notEnoughPeers := !be.skipPeerCountCheck && bs.peerCount < cp.minPeerCount
peerCount, _, _, lastUpdate, bannedUntil := cp.getBackendState(be)
notUpdated := lastUpdate.Add(cp.maxUpdateThreshold).Before(time.Now())
isBanned := time.Now().Before(bannedUntil)
notEnoughPeers := !be.skipPeerCountCheck && peerCount < cp.minPeerCount
if !be.IsHealthy() || be.IsRateLimited() || !be.Online() || notUpdated || isBanned || notEnoughPeers {
filteredBackendsNames = append(filteredBackendsNames, be.Name)
continue
......@@ -359,6 +362,22 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) {
log.Debug("group state", "proposedBlock", proposedBlock, "consensusBackends", strings.Join(consensusBackendsNames, ", "), "filteredBackends", strings.Join(filteredBackendsNames, ", "))
}
// IsBanned checks if a specific backend is banned
func (cp *ConsensusPoller) IsBanned(be *Backend) bool {
bs := cp.backendState[be]
bs.backendStateMux.Lock()
defer bs.backendStateMux.Unlock()
return time.Now().Before(bs.bannedUntil)
}
// Ban bans a specific backend
func (cp *ConsensusPoller) Ban(be *Backend) {
bs := cp.backendState[be]
bs.backendStateMux.Lock()
defer bs.backendStateMux.Unlock()
bs.bannedUntil = time.Now().Add(cp.banPeriod)
}
// Unban removes any bans from the backends
func (cp *ConsensusPoller) Unban() {
for _, be := range cp.backendGroup.Backends {
......@@ -432,13 +451,14 @@ func (cp *ConsensusPoller) isInSync(ctx context.Context, be *Backend) (result bo
return res, nil
}
func (cp *ConsensusPoller) getBackendState(be *Backend) (peerCount uint64, blockNumber hexutil.Uint64, blockHash string, lastUpdate time.Time) {
func (cp *ConsensusPoller) getBackendState(be *Backend) (peerCount uint64, blockNumber hexutil.Uint64, blockHash string, lastUpdate time.Time, bannedUntil time.Time) {
bs := cp.backendState[be]
bs.backendStateMux.Lock()
peerCount = bs.peerCount
blockNumber = bs.latestBlockNumber
blockHash = bs.latestBlockHash
lastUpdate = bs.lastUpdate
bannedUntil = bs.bannedUntil
bs.backendStateMux.Unlock()
return
}
......
......@@ -576,7 +576,7 @@ func (s *Server) populateContext(w http.ResponseWriter, r *http.Request) context
return nil
}
ctx = context.WithValue(r.Context(), ContextKeyAuth, s.authenticatedPaths[authorization]) // nolint:staticcheck
ctx = context.WithValue(ctx, ContextKeyAuth, s.authenticatedPaths[authorization]) // nolint:staticcheck
}
return context.WithValue(
......
# [Public] 4/26 Transaction Delays Post-Mortem
# Incident Summary
On April 26, 2023 between the hours of 1900 and 2100 UTC, OP Mainnet experienced degraded service following a ~10x increase in the rate of `eth_sendRawTransaction` requests.
While the sequencer remained online and continued to process transactions, users experienced transaction inclusion delays, rate limit errors, problems syncing nodes, and other symptoms of degraded performance.
The issue resolved itself once the rate of `eth_sendRawTransaction` requests subsided. However, we did not communicate the status of the network to our users, nor did we execute quickly enough on mitigations that could have reduced the impact of the degraded service. We recognize that this was a frustrating experience with significant impact on our users, particularly those participating in the DeFi ecosystem. We are sorry for this experience, and we hope this retrospective provides insight into what happened and what we are doing to prevent similar issues from happening again.
# Leadup
OP Mainnet has not yet been upgraded to Bedrock. As a result, all OP Mainnet nodes run two components to sync the L2 chain:
- The `data-transport-layer` (or DTL), which indexes transactions from L1 or another L2 node.
- `l2geth`, which executes the transactions indexed by the DTL and maintains the L2 chain’s state.
The DTL and `l2geth` retrieve new data by polling for it. The DTL polls L1 or L2 depending on how it is configured, and `l2geth` polls the DTL. The higher the transaction throughput, the more transactions must be processed in each “tick” of the polling loop. When throughput is too high, more transactions can arrive between ticks than can be processed in a single tick, and multiple ticks are then needed to catch up to the tip of the chain.
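To make the failure mode concrete, here is a minimal sketch of that dynamic. It is illustrative only: the names (`pollOnce`, `perTick`) and the numbers are hypothetical and are not taken from the DTL or `l2geth` code; it simply shows how a fixed per-tick processing capacity turns a throughput spike into a growing backlog that takes many ticks to drain.

```go
package main

import "fmt"

// pollOnce models a single polling "tick": the syncing node can apply at most
// perTick blocks per tick, so anything beyond that stays in the backlog and is
// only drained over subsequent ticks.
func pollOnce(pending, perTick uint64) (applied, backlog uint64) {
	if pending <= perTick {
		return pending, 0
	}
	return perTick, pending - perTick
}

func main() {
	const perTick = 40 // assumed per-tick processing capacity
	// Assumed arrival pattern: normal load, then a sustained burst.
	arrivals := []uint64{8, 8, 95, 95, 95, 8, 8, 8}

	var backlog uint64
	for tick, n := range arrivals {
		var applied uint64
		applied, backlog = pollOnce(backlog+n, perTick)
		fmt.Printf("tick %d: arrived=%2d applied=%2d backlog=%2d\n", tick, n, applied, backlog)
	}
}
```

Running the sketch shows the backlog climbing during the burst and draining only gradually afterwards, which is the replica lag users experienced as delayed confirmations.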
To protect the sequencer against traffic spikes, we route read requests - specifically `eth_getBlockRange`, which the DTL uses to sync from L2 - to a read-only replica rather than to the sequencer itself.
At 1915 UTC, sequencer throughput jumped from the usual ~8 transactions per second to a peak of 95 transactions per second over the course of 15 minutes.
# Causes
As a result of the increased throughput, our read-only replica started to fall behind. The graph below shows the delay, in seconds, between when the sequencer created new blocks and when the read-only replica indexed them:
![outage.png](2023-04-26-transaction-delays/outage.png)
This meant that while the sequencer was processing transactions normally, users were unable to see their transactions confirmed on chain for several minutes. For DeFi apps relying on an accurate view of on-chain data, this caused transactions to be reverted and positions to be liquidated. It also made it difficult to retry transactions, since the user’s wallet nonce may have increased on the sequencer but not on the replica.
This issue was ecosystem-wide. Infrastructure providers run replicas of their own, which use the same polling-based mechanism to sync data. This likely further increased the delay between when transactions were processed and when they appeared as confirmed. This is not an error on the part of infrastructure providers, but rather a flaw in how the pre-Bedrock system is designed.
# Recovery and Lessons Learned
The issue resolved itself once the transaction volume dropped back down to normal levels. However, we did not communicate with our community for the duration of the outage. This is a significant miss, and for that we apologize. Going forward, we will do the following in an effort to avoid similar issues:
- We will add monitoring for replica lag, so that we can proactively route traffic directly to the sequencer when throughput increases beyond what the sync mechanism can handle.
- Though rate limits were not the direct cause of this incident, we will increase rate limits so that node operators can poll for new blocks more frequently.
Lastly, we will upgrade mainnet to Bedrock later this year. Bedrock fixes these issues from an architectural perspective. Specifically:
- There will be no more DTL or polling-based sync mechanism. Blocks are either derived from L1 or gossiped over a peer-to-peer network.
- Blocks will be created every two seconds rather than on every transaction. This allows data to propagate across the network more efficiently and predictably.
- The sequencer will have a private mempool. This will allow fee-replacement transactions to work properly, and eliminate the need for aggressive rate limiting on the sequencer.
We recognize how frustrating an issue like this can be, and that is compounded when we are not proactively communicating. We’re sorry our users had this experience. We’re committed to applying these learnings moving forward and appreciate our community holding us accountable.