Commit 032731b5 authored by Kelvin Fichter

feat(rhc): cleanly catch connection failures

Updates the replica-healthcheck service to cleanly catch connection
failures in the target and reference nodes. When a failure is detected,
a metric will be incremented and the failure will be logged.
parent f981b8da
---
'@eth-optimism/replica-healthcheck': patch
---
Add checks and metrics for dead networks
......@@ -208,11 +208,9 @@ services:
dockerfile: ./ops/docker/Dockerfile.packages
target: replica-healthcheck
image: ethereumoptimism/replica-healthcheck:${DOCKER_TAG_REPLICA_HEALTHCHECK:-latest}
entrypoint: ./healthcheck.sh
environment:
HEALTHCHECK__REFERENCE_RPC_PROVIDER: http://l2geth:8545
HEALTHCHECK__TARGET_RPC_PROVIDER: http://replica:8545
RETRIES: 60
ports:
- ${HEALTHCHECK_HTTP_PORT:-7300}:7300
......
......@@ -62,5 +62,4 @@ CMD ["npm", "run", "start"]
FROM base as replica-healthcheck
WORKDIR /opt/optimism/packages/replica-healthcheck
COPY ./ops/scripts/healthcheck.sh .
ENTRYPOINT ["npm", "run", "start"]
#!/bin/bash
# Waits for the reference and target RPC endpoints to answer over HTTP, then
# replaces this shell with `yarn start`.
#
# Environment:
#   HEALTHCHECK__REFERENCE_RPC_PROVIDER  URL of the reference RPC (required)
#   HEALTHCHECK__TARGET_RPC_PROVIDER     URL of the target RPC (required)
#   RETRIES                              curl --retry count per endpoint (default: 60)
#
# Exits non-zero (via `set -e`) if either variable is unset/empty or if curl
# gives up on either endpoint.
set -e

RETRIES=${RETRIES:-60}

# Polls a single URL with curl until it responds or the retry budget is spent.
#   $1 - URL to poll
# Returns curl's exit status; under `set -e` a failure aborts the script.
wait_for_rpc() {
  # Quoting "$RETRIES" and "$1" prevents word splitting and glob expansion of
  # the environment-supplied values.
  curl \
    --fail \
    --show-error \
    --silent \
    --output /dev/null \
    --retry-connrefused \
    --retry "$RETRIES" \
    --retry-delay 1 \
    "$1"
}

# wait for reference RPC to be up
# `${VAR:?msg}` aborts with a clear message when the variable is unset or
# empty, instead of letting curl fail with a generic "no URL" error.
wait_for_rpc "${HEALTHCHECK__REFERENCE_RPC_PROVIDER:?HEALTHCHECK__REFERENCE_RPC_PROVIDER must be set}"

# wait for target RPC to be up
wait_for_rpc "${HEALTHCHECK__TARGET_RPC_PROVIDER:?HEALTHCHECK__TARGET_RPC_PROVIDER must be set}"

# go
exec yarn start
import { Provider } from '@ethersproject/abstract-provider'
import { BaseServiceV2, Gauge, validators } from '@eth-optimism/common-ts'
import { Provider, Block } from '@ethersproject/abstract-provider'
import {
BaseServiceV2,
Counter,
Gauge,
validators,
} from '@eth-optimism/common-ts'
import { sleep } from '@eth-optimism/core-utils'
type HealthcheckOptions = {
......@@ -13,6 +18,8 @@ type HealthcheckMetrics = {
isCurrentlyDiverged: Gauge
referenceHeight: Gauge
targetHeight: Gauge
targetConnectionFailures: Counter
referenceConnectionFailures: Counter
}
// Empty object type: declares no state fields.
// NOTE(review): presumably supplied as the state type parameter of
// BaseServiceV2 alongside the options and metrics types — confirm.
type HealthcheckState = {}
......@@ -59,15 +66,48 @@ export class HealthcheckService extends BaseServiceV2<
type: Gauge,
desc: 'Block height of the target client',
},
targetConnectionFailures: {
type: Counter,
desc: 'Number of connection failures to the target client',
},
referenceConnectionFailures: {
type: Counter,
desc: 'Number of connection failures to the reference client',
},
},
})
}
async main() {
const targetLatest = await this.options.targetRpcProvider.getBlock('latest')
const referenceLatest = await this.options.referenceRpcProvider.getBlock(
// Get the latest block from the target client and check for connection failures.
let targetLatest: Block
try {
targetLatest = await this.options.targetRpcProvider.getBlock('latest')
} catch (err) {
if (err.message.includes('could not detect network')) {
this.logger.error('target client not connected')
this.metrics.targetConnectionFailures.inc()
return
} else {
throw err
}
}
// Get the latest block from the reference client and check for connection failures.
let referenceLatest: Block
try {
referenceLatest = await this.options.referenceRpcProvider.getBlock(
'latest'
)
} catch (err) {
if (err.message.includes('could not detect network')) {
this.logger.error('reference client not connected')
this.metrics.referenceConnectionFailures.inc()
return
} else {
throw err
}
}
// Update these metrics first so they'll refresh no matter what.
this.metrics.targetHeight.set(targetLatest.number)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment