Commit 48e3956a authored by smartcontracts's avatar smartcontracts Committed by GitHub

Merge pull request #2352 from ethereum-optimism/sc/fix-rhc-docker

fix(rhc): fix bug in healthcheck dockerfile
parents a3b12065 c50cafa0
---
'@eth-optimism/replica-healthcheck': patch
---
Fixes a bug in the replica-healthcheck docker file
---
'@eth-optimism/common-ts': patch
---
Properly exposes metrics as part of a metrics server at port 7300
---
'@eth-optimism/integration-tests': patch
---
Add integration test for healthcheck server
---
'@eth-optimism/replica-healthcheck': patch
---
Add checks and metrics for dead networks
......@@ -154,6 +154,7 @@ jobs:
--env L2_GAS_PRICE=onchain \
--env RUN_DEBUG_TRACE_TESTS=false \
--env RUN_REPLICA_TESTS=false \
--env RUN_HEALTHCHECK_TESTS=false \
--env RUN_STRESS_TESTS=false \
--env OVMCONTEXT_SPEC_NUM_TXS=1 \
--env DTL_ENQUEUE_CONFIRMATIONS=12 \
......@@ -252,7 +253,7 @@ workflows:
- develop
jobs:
- build-dtl:
context:
context:
- optimism
- slack
<<: *slack-nightly-build-fail-post-step
......@@ -331,4 +332,4 @@ workflows:
}
]
}
event: always
\ No newline at end of file
event: always
......@@ -50,7 +50,7 @@ jobs:
working-directory: ./ops
run: |
./scripts/stats.sh &
docker-compose -f docker-compose.yml up -d
docker-compose -f docker-compose.yml up -d --scale replica-healthcheck=1
- name: Wait for the Sequencer node
working-directory: ./ops
......@@ -64,7 +64,6 @@ jobs:
if: failure()
uses: jwalton/gh-docker-logs@v1
with:
images: 'ethereumoptimism/hardhat,ops_deployer,ops_dtl,ops_l2geth,ethereumoptimism/message-relayer,ops_batch_submitter,ops_replica,ops_integration_tests'
dest: '/home/runner/logs'
- name: Tar logs
......
......@@ -10,8 +10,9 @@ OVMCONTEXT_SPEC_NUM_TXS=1
RUN_WITHDRAWAL_TESTS=false
RUN_DEBUG_TRACE_TESTS=false
RUN_REPLICA_TESTS=false
RUN_HEALTHCHECK_TESTS=false
RUN_STRESS_TESTS=false
# Can be configured up or down as necessary
MOCHA_TIMEOUT=300000
# Set to true to make Mocha stop after the first failed test.
MOCHA_BAIL=false
\ No newline at end of file
MOCHA_BAIL=false
......@@ -67,6 +67,7 @@
"hardhat-gas-reporter": "^1.0.4",
"lint-staged": "11.0.0",
"mocha": "^8.4.0",
"node-fetch": "^2.6.7",
"prom-client": "^14.0.1",
"rimraf": "^3.0.2",
"typescript": "^4.3.5",
......
import fetch from 'node-fetch'
import { expect } from './shared/setup'
import { envConfig } from './shared/utils'
describe('Healthcheck Tests', () => {
before(async function () {
if (!envConfig.RUN_HEALTHCHECK_TESTS) {
this.skip()
}
})
// Super simple test, is the metric server up?
it('should have metrics exposed', async () => {
const response = await fetch(envConfig.HEALTHCHECK_URL)
expect(response.status).to.equal(200)
})
})
......@@ -56,6 +56,8 @@ const procEnv = cleanEnv(process.env, {
VERIFIER_URL: str({ default: 'http://localhost:8547' }),
HEALTHCHECK_URL: str({ default: 'http://localhost:7300/metrics' }),
PRIVATE_KEY: str({
default:
'0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80',
......@@ -78,6 +80,9 @@ const procEnv = cleanEnv(process.env, {
RUN_REPLICA_TESTS: bool({
default: true,
}),
RUN_HEALTHCHECK_TESTS: bool({
default: true,
}),
RUN_DEBUG_TRACE_TESTS: bool({
default: true,
}),
......
......@@ -197,6 +197,23 @@ services:
- ${REPLICA_HTTP_PORT:-8549}:8545
- ${REPLICA_WS_PORT:-8550}:8546
replica-healthcheck:
depends_on:
- l2geth
- replica
deploy:
replicas: 0
build:
context: ..
dockerfile: ./ops/docker/Dockerfile.packages
target: replica-healthcheck
image: ethereumoptimism/replica-healthcheck:${DOCKER_TAG_REPLICA_HEALTHCHECK:-latest}
environment:
HEALTHCHECK__REFERENCE_RPC_PROVIDER: http://l2geth:8545
HEALTHCHECK__TARGET_RPC_PROVIDER: http://replica:8545
ports:
- ${HEALTHCHECK_HTTP_PORT:-7300}:7300
integration_tests:
deploy:
replicas: 0
......@@ -209,6 +226,7 @@ services:
environment:
L1_URL: http://l1_chain:8545
L2_URL: http://l2geth:8545
HEALTHCHECK_URL: http://replica-healthcheck:7300/metrics
REPLICA_URL: http://replica:8545
VERIFIER_URL: http://verifier:8545
URL: http://deployer:8081/addresses.json
......
......@@ -61,5 +61,5 @@ CMD ["npm", "run", "start"]
FROM base as replica-healthcheck
WORKDIR /opts/optimism/packages/replica-healthcheck
WORKDIR /opt/optimism/packages/replica-healthcheck
ENTRYPOINT ["npm", "run", "start"]
/* Imports: External */
import { Server } from 'net'
import Config from 'bcfg'
import * as dotenv from 'dotenv'
import { Command, Option } from 'commander'
import { ValidatorSpec, Spec, cleanEnv } from 'envalid'
import { sleep } from '@eth-optimism/core-utils'
import snakeCase from 'lodash/snakeCase'
import express from 'express'
import prometheus, { Registry } from 'prom-client'
/* Imports: Internal */
import { Logger } from '../common/logger'
import { Metric } from './metrics'
......@@ -82,6 +84,26 @@ export abstract class BaseServiceV2<
*/
protected readonly metrics: TMetrics
/**
* Registry for prometheus metrics.
*/
protected readonly metricsRegistry: Registry
/**
* Metrics server.
*/
protected metricsServer: Server
/**
* Port for the metrics server.
*/
protected readonly metricsServerPort: number
/**
* Hostname for the metrics server.
*/
protected readonly metricsServerHostname: string
/**
* @param params Options for the construction of the service.
* @param params.name Name for the service. This name will determine the prefix used for logging,
......@@ -93,6 +115,8 @@ export abstract class BaseServiceV2<
* @param params.options Options to pass to the service.
* @param params.loops Whether or not the service should loop. Defaults to true.
* @param params.loopIntervalMs Loop interval in milliseconds. Defaults to zero.
* @param params.metricsServerPort Port for the metrics server. Defaults to 7300.
* @param params.metricsServerHostname Hostname for the metrics server. Defaults to 0.0.0.0.
*/
constructor(params: {
name: string
......@@ -101,6 +125,8 @@ export abstract class BaseServiceV2<
options?: Partial<TOptions>
loop?: boolean
loopIntervalMs?: number
metricsServerPort?: number
metricsServerHostname?: string
}) {
this.loop = params.loop !== undefined ? params.loop : true
this.loopIntervalMs =
......@@ -203,6 +229,11 @@ export abstract class BaseServiceV2<
return acc
}, {}) as TMetrics
// Create the metrics server.
this.metricsRegistry = prometheus.register
this.metricsServerPort = params.metricsServerPort || 7300
this.metricsServerHostname = params.metricsServerHostname || '0.0.0.0'
this.logger = new Logger({ name: params.name })
// Gracefully handle stop signals.
......@@ -222,6 +253,33 @@ export abstract class BaseServiceV2<
public async run(): Promise<void> {
this.done = false
// Start the metrics server if not yet running.
if (!this.metricsServer) {
this.logger.info('starting metrics server')
await new Promise((resolve) => {
const app = express()
app.get('/metrics', async (_, res) => {
res.status(200).send(await this.metricsRegistry.metrics())
})
this.metricsServer = app.listen(
this.metricsServerPort,
this.metricsServerHostname,
() => {
resolve(null)
}
)
})
this.logger.info(`metrics started`, {
port: this.metricsServerPort,
hostname: this.metricsServerHostname,
route: '/metrics',
})
}
if (this.init) {
this.logger.info('initializing service')
await this.init()
......@@ -267,7 +325,18 @@ export abstract class BaseServiceV2<
while (!this.done) {
await sleep(1000)
}
this.logger.info('main loop finished, goodbye!')
// Shut down the metrics server if it's running.
if (this.metricsServer) {
this.logger.info('stopping metrics server')
await new Promise((resolve) => {
this.metricsServer.close(() => {
resolve(null)
})
})
this.logger.info('metrics server stopped')
this.metricsServer = undefined
}
}
/**
......
import { Provider } from '@ethersproject/abstract-provider'
import { BaseServiceV2, Gauge, validators } from '@eth-optimism/common-ts'
import { Provider, Block } from '@ethersproject/abstract-provider'
import {
BaseServiceV2,
Counter,
Gauge,
validators,
} from '@eth-optimism/common-ts'
import { sleep } from '@eth-optimism/core-utils'
type HealthcheckOptions = {
......@@ -13,6 +18,8 @@ type HealthcheckMetrics = {
isCurrentlyDiverged: Gauge
referenceHeight: Gauge
targetHeight: Gauge
targetConnectionFailures: Counter
referenceConnectionFailures: Counter
}
type HealthcheckState = {}
......@@ -59,15 +66,48 @@ export class HealthcheckService extends BaseServiceV2<
type: Gauge,
desc: 'Block height of the target client',
},
targetConnectionFailures: {
type: Counter,
desc: 'Number of connection failures to the target client',
},
referenceConnectionFailures: {
type: Counter,
desc: 'Number of connection failures to the reference client',
},
},
})
}
async main() {
const targetLatest = await this.options.targetRpcProvider.getBlock('latest')
const referenceLatest = await this.options.referenceRpcProvider.getBlock(
'latest'
)
// Get the latest block from the target client and check for connection failures.
let targetLatest: Block
try {
targetLatest = await this.options.targetRpcProvider.getBlock('latest')
} catch (err) {
if (err.message.includes('could not detect network')) {
this.logger.error('target client not connected')
this.metrics.targetConnectionFailures.inc()
return
} else {
throw err
}
}
// Get the latest block from the reference client and check for connection failures.
let referenceLatest: Block
try {
referenceLatest = await this.options.referenceRpcProvider.getBlock(
'latest'
)
} catch (err) {
if (err.message.includes('could not detect network')) {
this.logger.error('reference client not connected')
this.metrics.referenceConnectionFailures.inc()
return
} else {
throw err
}
}
// Update these metrics first so they'll refresh no matter what.
this.metrics.targetHeight.set(targetLatest.number)
......
......@@ -11520,7 +11520,7 @@ node-fetch@2.6.1:
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
node-fetch@^2.6.0, node-fetch@^2.6.1:
node-fetch@^2.6.0, node-fetch@^2.6.1, node-fetch@^2.6.7:
version "2.6.7"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.7.tgz#24de9fba827e3b4ae44dc8b20256a379160052ad"
integrity sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment