// metrics.go declares the Prometheus collectors exported by proxyd and the helpers that record them.

package proxyd

import (
	"context"
	"errors"
	"strconv"
	"strings"
	"time"

	"github.com/ethereum/go-ethereum/common/hexutil"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// String constants shared by the proxyd metrics code.
const (
	// MetricsNamespace is the Namespace set on every collector declared in
	// this file, so all exported metric names carry it as a prefix.
	MetricsNamespace = "proxyd"

	// RPCRequestSourceHTTP and RPCRequestSourceWS name the two request
	// transports. NOTE(review): presumably used as request_source label
	// values; the call sites are outside this file.
	RPCRequestSourceHTTP = "http"
	RPCRequestSourceWS   = "ws"

	// NOTE(review): the constants below look like label values (backend
	// name, message source, fallback method name); confirm against callers.
	BackendProxyd = "proxyd"
	SourceClient  = "client"
	SourceBackend = "backend"
	MethodUnknown = "unknown"
)

27
// PayloadSizeBuckets holds the histogram bucket upper bounds used by the
// request and response payload size histograms.
var PayloadSizeBuckets = []float64{10, 50, 100, 500, 1000, 5000, 10000, 100000, 1000000}

// MillisecondDurationBuckets holds the histogram bucket upper bounds used by
// the Redis command duration histogram, whose help text states milliseconds.
var MillisecondDurationBuckets = []float64{1, 10, 50, 100, 500, 1000, 5000, 10000, 100000}
29

30 31 32 33 34 35 36 37 38
var (
	rpcRequestsTotal = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_requests_total",
		Help:      "Count of total client RPC requests.",
	})

	rpcForwardsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
39
		Name:      "rpc_forwards_total",
40 41
		Help:      "Count of total RPC requests forwarded to each backend.",
	}, []string{
42
		"auth",
43 44 45 46 47
		"backend_name",
		"method_name",
		"source",
	})

48 49 50 51 52 53 54 55 56
	rpcBackendHTTPResponseCodesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_backend_http_response_codes_total",
		Help:      "Count of total backend responses by HTTP status code.",
	}, []string{
		"auth",
		"backend_name",
		"method_name",
		"status_code",
57
		"batched",
58 59
	})

60 61 62 63 64
	rpcErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_errors_total",
		Help:      "Count of total RPC errors.",
	}, []string{
65
		"auth",
66 67
		"backend_name",
		"method_name",
68 69 70
		"error_code",
	})

71 72 73 74 75 76 77 78 79 80 81
	rpcSpecialErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_special_errors_total",
		Help:      "Count of total special RPC errors.",
	}, []string{
		"auth",
		"backend_name",
		"method_name",
		"error_type",
	})

82 83 84 85 86 87 88 89
	rpcBackendRequestDurationSumm = promauto.NewSummaryVec(prometheus.SummaryOpts{
		Namespace:  MetricsNamespace,
		Name:       "rpc_backend_request_duration_seconds",
		Help:       "Summary of backend response times broken down by backend and method name.",
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	}, []string{
		"backend_name",
		"method_name",
90
		"batched",
91 92
	})

93
	activeClientWsConnsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
94 95 96
		Namespace: MetricsNamespace,
		Name:      "active_client_ws_conns",
		Help:      "Gauge of active client WS connections.",
97 98
	}, []string{
		"auth",
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
	})

	activeBackendWsConnsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "active_backend_ws_conns",
		Help:      "Gauge of active backend WS connections.",
	}, []string{
		"backend_name",
	})

	unserviceableRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "unserviceable_requests_total",
		Help:      "Count of total requests that were rejected due to no backends being available.",
	}, []string{
114
		"auth",
115
		"request_source",
116 117
	})

118 119 120 121 122 123 124 125
	httpResponseCodesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "http_response_codes_total",
		Help:      "Count of total HTTP response codes.",
	}, []string{
		"status_code",
	})

126 127 128 129 130 131 132 133 134 135 136 137
	httpRequestDurationSumm = promauto.NewSummary(prometheus.SummaryOpts{
		Namespace:  MetricsNamespace,
		Name:       "http_request_duration_seconds",
		Help:       "Summary of HTTP request durations, in seconds.",
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	})

	wsMessagesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "ws_messages_total",
		Help:      "Count of total websocket messages including protocol control.",
	}, []string{
138
		"auth",
139 140 141 142 143 144 145 146 147 148 149
		"backend_name",
		"source",
	})

	redisErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "redis_errors_total",
		Help:      "Count of total Redis errors.",
	}, []string{
		"source",
	})
150

151 152 153
	requestPayloadSizesGauge = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "request_payload_sizes",
154
		Help:      "Histogram of client request payload sizes.",
155 156 157 158 159 160 161 162
		Buckets:   PayloadSizeBuckets,
	}, []string{
		"auth",
	})

	responsePayloadSizesGauge = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "response_payload_sizes",
163
		Help:      "Histogram of client response payload sizes.",
164 165 166 167 168
		Buckets:   PayloadSizeBuckets,
	}, []string{
		"auth",
	})

169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
	cacheHitsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "cache_hits_total",
		Help:      "Number of cache hits.",
	}, []string{
		"method",
	})

	cacheMissesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "cache_misses_total",
		Help:      "Number of cache misses.",
	}, []string{
		"method",
	})

185 186 187 188 189 190 191 192
	cacheErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "cache_errors_total",
		Help:      "Number of cache errors.",
	}, []string{
		"method",
	})

193 194 195 196 197 198
	batchRPCShortCircuitsTotal = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "batch_rpc_short_circuits_total",
		Help:      "Count of total batch RPC short-circuits.",
	})

199 200 201 202 203 204
	rpcSpecialErrors = []string{
		"nonce too low",
		"gas price too high",
		"gas price too low",
		"invalid parameters",
	}
205 206 207 208 209 210 211

	redisCacheDurationSumm = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "redis_cache_duration_milliseconds",
		Help:      "Histogram of Redis command durations, in milliseconds.",
		Buckets:   MillisecondDurationBuckets,
	}, []string{"command"})
212 213 214 215 216 217 218 219

	tooManyRequestErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "too_many_request_errors_total",
		Help:      "Count of request timeouts due to too many concurrent RPCs.",
	}, []string{
		"backend_name",
	})
220 221 222 223 224 225 226 227 228 229 230 231 232 233

	batchSizeHistogram = promauto.NewHistogram(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "batch_size_summary",
		Help:      "Summary of batch sizes",
		Buckets: []float64{
			1,
			5,
			10,
			25,
			50,
			100,
		},
	})
234 235 236 237 238 239

	frontendRateLimitTakeErrors = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rate_limit_take_errors",
		Help:      "Count of errors taking frontend rate limits",
	})
240

241
	consensusLatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
242
		Namespace: MetricsNamespace,
243
		Name:      "group_consensus_latest_block",
244
		Help:      "Consensus latest block",
245 246
	}, []string{
		"backend_group_name",
247 248
	})

Felipe Andrade's avatar
Felipe Andrade committed
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
	consensusSafeBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_safe_block",
		Help:      "Consensus safe block",
	}, []string{
		"backend_group_name",
	})

	consensusFinalizedBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_finalized_block",
		Help:      "Consensus finalized block",
	}, []string{
		"backend_group_name",
	})

265 266 267 268 269 270 271
	backendLatestBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_latest_block",
		Help:      "Current latest block observed per backend",
	}, []string{
		"backend_name",
	})
272

Felipe Andrade's avatar
Felipe Andrade committed
273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
	backendSafeBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_safe_block",
		Help:      "Current safe block observed per backend",
	}, []string{
		"backend_name",
	})

	backendFinalizedBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_finalized_block",
		Help:      "Current finalized block observed per backend",
	}, []string{
		"backend_name",
	})

	backendUnexpectedBlockTagsBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_unexpected_block_tags",
		Help:      "Bool gauge for unexpected block tags",
	}, []string{
		"backend_name",
	})

297 298 299
	consensusGroupCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_count",
300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
		Help:      "Consensus group serving traffic count",
	}, []string{
		"backend_group_name",
	})

	consensusGroupFilteredCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_filtered_count",
		Help:      "Consensus group filtered out from serving traffic count",
	}, []string{
		"backend_group_name",
	})

	consensusGroupTotalCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_total_count",
		Help:      "Total count of candidates to be part of consensus group",
317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351
	}, []string{
		"backend_group_name",
	})

	consensusBannedBackends = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_banned",
		Help:      "Bool gauge for banned backends",
	}, []string{
		"backend_name",
	})

	consensusPeerCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_peer_count",
		Help:      "Peer count",
	}, []string{
		"backend_name",
	})

	consensusInSyncBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_in_sync",
		Help:      "Bool gauge for backends in sync",
	}, []string{
		"backend_name",
	})

	consensusUpdateDelayBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_update_delay",
		Help:      "Delay (ms) for backend update",
	}, []string{
		"backend_name",
	})
352 353 354 355 356 357 358 359 360

	avgLatencyBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_avg_latency",
		Help:      "Average latency per backend",
	}, []string{
		"backend_name",
	})

361 362 363 364 365 366 367 368
	degradedBackends = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_degraded",
		Help:      "Bool gauge for degraded backends",
	}, []string{
		"backend_name",
	})

Felipe Andrade's avatar
Felipe Andrade committed
369
	networkErrorRateBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
370
		Namespace: MetricsNamespace,
Felipe Andrade's avatar
Felipe Andrade committed
371 372
		Name:      "backend_error_rate",
		Help:      "Request error rate per backend",
373 374 375
	}, []string{
		"backend_name",
	})
376 377
)

378 379 380 381
// RecordRedisError increments redis_errors_total for the given source label.
func RecordRedisError(source string) {
	counter := redisErrorsTotal.WithLabelValues(source)
	counter.Inc()
}

382
func RecordRPCError(ctx context.Context, backendName, method string, err error) {
383 384 385
	rpcErr, ok := err.(*RPCErr)
	var code int
	if ok {
386
		MaybeRecordSpecialRPCError(ctx, backendName, method, rpcErr)
387 388 389 390 391
		code = rpcErr.Code
	} else {
		code = -1
	}

392
	rpcErrorsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, strconv.Itoa(code)).Inc()
393 394
}

395 396
func RecordWSMessage(ctx context.Context, backendName, source string) {
	wsMessagesTotal.WithLabelValues(GetAuthCtx(ctx), backendName, source).Inc()
397 398
}

399 400
func RecordUnserviceableRequest(ctx context.Context, source string) {
	unserviceableRequestsTotal.WithLabelValues(GetAuthCtx(ctx), source).Inc()
401 402
}

403 404
// RecordRPCForward increments rpc_forwards_total for the auth value that
// GetAuthCtx derives from ctx, the backend name, the method name and the source.
func RecordRPCForward(ctx context.Context, backendName, method, source string) {
	rpcForwardsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, source).Inc()
}
406 407 408 409 410 411 412 413 414 415

// MaybeRecordSpecialRPCError increments rpc_special_errors_total when the
// lower-cased rpcErr.Message contains one of the rpcSpecialErrors substrings.
// Only the first matching substring is counted, and that substring is used as
// the error_type label. A nil rpcErr is ignored.
func MaybeRecordSpecialRPCError(ctx context.Context, backendName, method string, rpcErr *RPCErr) {
	// A metrics helper must not panic its caller on a nil error pointer.
	if rpcErr == nil {
		return
	}

	errMsg := strings.ToLower(rpcErr.Message)
	for _, errStr := range rpcSpecialErrors {
		if strings.Contains(errMsg, errStr) {
			rpcSpecialErrorsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, errStr).Inc()
			return
		}
	}
}
416

417 418
func RecordRequestPayloadSize(ctx context.Context, payloadSize int) {
	requestPayloadSizesGauge.WithLabelValues(GetAuthCtx(ctx)).Observe(float64(payloadSize))
419 420 421 422 423
}

func RecordResponsePayloadSize(ctx context.Context, payloadSize int) {
	responsePayloadSizesGauge.WithLabelValues(GetAuthCtx(ctx)).Observe(float64(payloadSize))
}
424 425 426 427 428 429 430 431

// RecordCacheHit increments cache_hits_total for the given method label.
func RecordCacheHit(method string) {
	hits := cacheHitsTotal.WithLabelValues(method)
	hits.Inc()
}

// RecordCacheMiss increments cache_misses_total for the given method label.
func RecordCacheMiss(method string) {
	misses := cacheMissesTotal.WithLabelValues(method)
	misses.Inc()
}
432

433
func RecordCacheError(method string) {
Felipe Andrade's avatar
Felipe Andrade committed
434
	cacheErrorsTotal.WithLabelValues(method).Inc()
435 436
}

437 438 439
// RecordBatchSize observes size in the batch size histogram.
func RecordBatchSize(size int) {
	observed := float64(size)
	batchSizeHistogram.Observe(observed)
}
440

441 442 443 444
// RecordGroupConsensusLatestBlock sets the group_consensus_latest_block gauge
// labelled with group.Name to blockNumber.
func RecordGroupConsensusLatestBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusLatestBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}

Felipe Andrade's avatar
Felipe Andrade committed
445 446 447 448 449 450 451 452
// RecordGroupConsensusSafeBlock sets the group_consensus_safe_block gauge
// labelled with group.Name to blockNumber.
func RecordGroupConsensusSafeBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusSafeBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}

// RecordGroupConsensusFinalizedBlock sets the group_consensus_finalized_block
// gauge labelled with group.Name to blockNumber.
func RecordGroupConsensusFinalizedBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusFinalizedBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}

453 454 455 456
// RecordGroupConsensusCount sets the group_consensus_count gauge labelled
// with group.Name to count.
func RecordGroupConsensusCount(group *BackendGroup, count int) {
	gauge := consensusGroupCount.WithLabelValues(group.Name)
	gauge.Set(float64(count))
}

457 458 459 460 461 462 463 464
// RecordGroupConsensusFilteredCount sets the group_consensus_filtered_count
// gauge labelled with group.Name to count.
func RecordGroupConsensusFilteredCount(group *BackendGroup, count int) {
	gauge := consensusGroupFilteredCount.WithLabelValues(group.Name)
	gauge.Set(float64(count))
}

// RecordGroupTotalCount sets the group_consensus_total_count gauge labelled
// with group.Name to count.
func RecordGroupTotalCount(group *BackendGroup, count int) {
	gauge := consensusGroupTotalCount.WithLabelValues(group.Name)
	gauge.Set(float64(count))
}

465 466
func RecordBackendLatestBlock(b *Backend, blockNumber hexutil.Uint64) {
	backendLatestBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber))
467 468
}

Felipe Andrade's avatar
Felipe Andrade committed
469 470 471 472 473 474 475 476 477
// RecordBackendSafeBlock sets the backend_safe_block gauge labelled with
// b.Name to blockNumber.
func RecordBackendSafeBlock(b *Backend, blockNumber hexutil.Uint64) {
	gauge := backendSafeBlockBackend.WithLabelValues(b.Name)
	gauge.Set(float64(blockNumber))
}

// RecordBackendFinalizedBlock sets the backend_finalized_block gauge labelled
// with b.Name to blockNumber.
func RecordBackendFinalizedBlock(b *Backend, blockNumber hexutil.Uint64) {
	gauge := backendFinalizedBlockBackend.WithLabelValues(b.Name)
	gauge.Set(float64(blockNumber))
}

func RecordBackendUnexpectedBlockTags(b *Backend, unexpected bool) {
Felipe Andrade's avatar
Felipe Andrade committed
478
	backendUnexpectedBlockTagsBackend.WithLabelValues(b.Name).Set(boolToFloat64(unexpected))
Felipe Andrade's avatar
Felipe Andrade committed
479 480
}

481
func RecordConsensusBackendBanned(b *Backend, banned bool) {
Felipe Andrade's avatar
Felipe Andrade committed
482
	consensusBannedBackends.WithLabelValues(b.Name).Set(boolToFloat64(banned))
483 484
}

485 486
func RecordConsensusBackendPeerCount(b *Backend, peerCount uint64) {
	consensusPeerCountBackend.WithLabelValues(b.Name).Set(float64(peerCount))
487 488
}

489
func RecordConsensusBackendInSync(b *Backend, inSync bool) {
Felipe Andrade's avatar
Felipe Andrade committed
490
	consensusInSyncBackend.WithLabelValues(b.Name).Set(boolToFloat64(inSync))
491 492
}

Felipe Andrade's avatar
Felipe Andrade committed
493 494 495 496 497 498
func RecordConsensusBackendUpdateDelay(b *Backend, lastUpdate time.Time) {
	// avoid recording the delay for the first update
	if lastUpdate.IsZero() {
		return
	}
	delay := time.Since(lastUpdate)
499 500 501
	consensusUpdateDelayBackend.WithLabelValues(b.Name).Set(float64(delay.Milliseconds()))
}

502 503
func RecordBackendNetworkLatencyAverageSlidingWindow(b *Backend, avgLatency time.Duration) {
	avgLatencyBackend.WithLabelValues(b.Name).Set(float64(avgLatency.Milliseconds()))
504
	degradedBackends.WithLabelValues(b.Name).Set(boolToFloat64(b.IsDegraded()))
505 506
}

Felipe Andrade's avatar
Felipe Andrade committed
507 508
func RecordBackendNetworkErrorRateSlidingWindow(b *Backend, rate float64) {
	networkErrorRateBackend.WithLabelValues(b.Name).Set(rate)
509 510
}

Felipe Andrade's avatar
Felipe Andrade committed
511 512 513 514 515
// boolToFloat64 converts a bool to the value stored in the 0/1 gauges:
// 1 for true, 0 for false.
func boolToFloat64(b bool) float64 {
	if b {
		return 1
	}
	return 0
}