Commit 98206b7e authored by smartcontracts, committed by GitHub

fix(fd): properly handle connection failures (#2832)

Updates the Fault Detector to properly handle connection failures when
trying to query the L2 node. Also increments a new metric whenever this
happens so we can keep track of these failures if necessary.
Co-authored-by: Mark Tyneway <mark.tyneway@gmail.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
parent 87f745b5
---
'@eth-optimism/fault-detector': patch
---
Properly handle connection failures for L2 node
...@@ -2,7 +2,7 @@ import { BaseServiceV2, Gauge, validators } from '@eth-optimism/common-ts' ...@@ -2,7 +2,7 @@ import { BaseServiceV2, Gauge, validators } from '@eth-optimism/common-ts'
import { getChainId, sleep, toRpcHexString } from '@eth-optimism/core-utils' import { getChainId, sleep, toRpcHexString } from '@eth-optimism/core-utils'
import { CrossChainMessenger } from '@eth-optimism/sdk' import { CrossChainMessenger } from '@eth-optimism/sdk'
import { Provider } from '@ethersproject/abstract-provider' import { Provider } from '@ethersproject/abstract-provider'
import { Contract, ethers } from 'ethers' import { Contract, ethers, Transaction } from 'ethers'
import dateformat from 'dateformat' import dateformat from 'dateformat'
import { import {
...@@ -20,10 +20,12 @@ type Metrics = { ...@@ -20,10 +20,12 @@ type Metrics = {
highestCheckedBatchIndex: Gauge highestCheckedBatchIndex: Gauge
highestKnownBatchIndex: Gauge highestKnownBatchIndex: Gauge
isCurrentlyMismatched: Gauge isCurrentlyMismatched: Gauge
inUnexpectedErrorState: Gauge l1NodeConnectionFailures: Gauge
l2NodeConnectionFailures: Gauge
} }
type State = { type State = {
fpw: number
scc: Contract scc: Contract
messenger: CrossChainMessenger messenger: CrossChainMessenger
highestCheckedBatchIndex: number highestCheckedBatchIndex: number
...@@ -68,9 +70,13 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -68,9 +70,13 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
type: Gauge, type: Gauge,
desc: '0 if state is ok, 1 if state is mismatched', desc: '0 if state is ok, 1 if state is mismatched',
}, },
inUnexpectedErrorState: { l1NodeConnectionFailures: {
type: Gauge, type: Gauge,
desc: '0 if service is ok, 1 service is in unexpected error state', desc: 'Number of times L1 node connection has failed',
},
l2NodeConnectionFailures: {
type: Gauge,
desc: 'Number of times L2 node connection has failed',
}, },
}, },
}) })
...@@ -86,6 +92,7 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -86,6 +92,7 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
// We use this a lot, a bit cleaner to pull out to the top level of the state object. // We use this a lot, a bit cleaner to pull out to the top level of the state object.
this.state.scc = this.state.messenger.contracts.l1.StateCommitmentChain this.state.scc = this.state.messenger.contracts.l1.StateCommitmentChain
this.state.fpw = (await this.state.scc.FRAUD_PROOF_WINDOW()).toNumber()
// Figure out where to start syncing from. // Figure out where to start syncing from.
if (this.options.startBatchIndex === -1) { if (this.options.startBatchIndex === -1) {
...@@ -102,17 +109,30 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -102,17 +109,30 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
} }
async main(): Promise<void> { async main(): Promise<void> {
const latestBatchIndex = await this.state.scc.getTotalBatches() let latestBatchIndex: number
if (this.state.highestCheckedBatchIndex >= latestBatchIndex.toNumber()) { try {
latestBatchIndex = (await this.state.scc.getTotalBatches()).toNumber()
} catch (err) {
this.logger.error(`got error when connecting to node`, {
error: err,
node: 'l1',
section: 'getTotalBatches',
})
this.metrics.l1NodeConnectionFailures.inc()
await sleep(15000) await sleep(15000)
return return
} }
this.metrics.highestKnownBatchIndex.set(latestBatchIndex.toNumber()) if (this.state.highestCheckedBatchIndex >= latestBatchIndex) {
await sleep(15000)
return
} else {
this.metrics.highestKnownBatchIndex.set(latestBatchIndex)
}
this.logger.info(`checking batch`, { this.logger.info(`checking batch`, {
batchIndex: this.state.highestCheckedBatchIndex, batchIndex: this.state.highestCheckedBatchIndex,
latestIndex: latestBatchIndex.toNumber(), latestIndex: latestBatchIndex,
}) })
let event: ethers.Event let event: ethers.Event
...@@ -122,13 +142,30 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -122,13 +142,30 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
this.state.highestCheckedBatchIndex this.state.highestCheckedBatchIndex
) )
} catch (err) { } catch (err) {
this.logger.error(`got unexpected error while searching for batch`, { this.logger.error(`got error when connecting to node`, {
batchIndex: this.state.highestCheckedBatchIndex,
error: err, error: err,
node: 'l1',
section: 'findEventForStateBatch',
}) })
this.metrics.l1NodeConnectionFailures.inc()
await sleep(15000)
return
}
let batchTransaction: Transaction
try {
batchTransaction = await event.getTransaction()
} catch (err) {
this.logger.error(`got error when connecting to node`, {
error: err,
node: 'l1',
section: 'getTransaction',
})
this.metrics.l1NodeConnectionFailures.inc()
await sleep(15000)
return
} }
const batchTransaction = await event.getTransaction()
const [stateRoots] = this.state.scc.interface.decodeFunctionData( const [stateRoots] = this.state.scc.interface.decodeFunctionData(
'appendStateBatch', 'appendStateBatch',
batchTransaction.data batchTransaction.data
...@@ -138,7 +175,20 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -138,7 +175,20 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
const batchSize = event.args._batchSize.toNumber() const batchSize = event.args._batchSize.toNumber()
const batchEnd = batchStart + batchSize const batchEnd = batchStart + batchSize
const latestBlock = await this.options.l2RpcProvider.getBlockNumber() let latestBlock: number
try {
latestBlock = await this.options.l2RpcProvider.getBlockNumber()
} catch (err) {
this.logger.error(`got error when connecting to node`, {
error: err,
node: 'l2',
section: 'getBlockNumber',
})
this.metrics.l2NodeConnectionFailures.inc()
await sleep(15000)
return
}
if (latestBlock < batchEnd) { if (latestBlock < batchEnd) {
this.logger.info(`node is behind, waiting for sync`, { this.logger.info(`node is behind, waiting for sync`, {
batchEnd, batchEnd,
...@@ -151,21 +201,32 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -151,21 +201,32 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
// multiple requests of maximum 1000 blocks in the case that batchSize > 1000. // multiple requests of maximum 1000 blocks in the case that batchSize > 1000.
let blocks: any[] = [] let blocks: any[] = []
for (let i = 0; i < batchSize; i += 1000) { for (let i = 0; i < batchSize; i += 1000) {
const provider = this.options let newBlocks: any[]
.l2RpcProvider as ethers.providers.JsonRpcProvider try {
blocks = blocks.concat( newBlocks = await (
await provider.send('eth_getBlockRange', [ this.options.l2RpcProvider as ethers.providers.JsonRpcProvider
).send('eth_getBlockRange', [
toRpcHexString(batchStart + i), toRpcHexString(batchStart + i),
toRpcHexString(batchStart + i + Math.min(batchSize - i, 1000) - 1), toRpcHexString(batchStart + i + Math.min(batchSize - i, 1000) - 1),
false, false,
]) ])
) } catch (err) {
this.logger.error(`got error when connecting to node`, {
error: err,
node: 'l2',
section: 'getBlockRange',
})
this.metrics.l2NodeConnectionFailures.inc()
await sleep(15000)
return
}
blocks = blocks.concat(newBlocks)
} }
for (const [i, stateRoot] of stateRoots.entries()) { for (const [i, stateRoot] of stateRoots.entries()) {
if (blocks[i].stateRoot !== stateRoot) { if (blocks[i].stateRoot !== stateRoot) {
this.metrics.isCurrentlyMismatched.set(1) this.metrics.isCurrentlyMismatched.set(1)
const fpw = await this.state.scc.FRAUD_PROOF_WINDOW()
this.logger.error(`state root mismatch`, { this.logger.error(`state root mismatch`, {
blockNumber: blocks[i].number, blockNumber: blocks[i].number,
expectedStateRoot: blocks[i].stateRoot, expectedStateRoot: blocks[i].stateRoot,
...@@ -173,7 +234,7 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -173,7 +234,7 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
finalizationTime: dateformat( finalizationTime: dateformat(
new Date( new Date(
(ethers.BigNumber.from(blocks[i].timestamp).toNumber() + (ethers.BigNumber.from(blocks[i].timestamp).toNumber() +
fpw.toNumber()) * this.state.fpw) *
1000 1000
), ),
'mmmm dS, yyyy, h:MM:ss TT' 'mmmm dS, yyyy, h:MM:ss TT'
...@@ -190,7 +251,6 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> { ...@@ -190,7 +251,6 @@ export class FaultDetector extends BaseServiceV2<Options, Metrics, State> {
// If we got through the above without throwing an error, we should be fine to reset. // If we got through the above without throwing an error, we should be fine to reset.
this.metrics.isCurrentlyMismatched.set(0) this.metrics.isCurrentlyMismatched.set(0)
this.metrics.inUnexpectedErrorState.set(0)
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment