metrics.go 16.1 KB
Newer Older
1 2 3
package proxyd

import (
4
	"context"
5
	"strconv"
6
	"strings"
7
	"time"
8

9 10
	"github.com/ethereum/go-ethereum/common/hexutil"

11 12
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
13 14 15 16 17 18 19 20
)

// Shared string constants used as metric namespace and label values.
const (
	// MetricsNamespace is the Prometheus namespace applied to every metric
	// declared in this file.
	MetricsNamespace = "proxyd"

	// Request-source label values.
	RPCRequestSourceHTTP = "http"
	RPCRequestSourceWS   = "ws"

	// BackendProxyd is the backend name value "proxyd".
	BackendProxyd = "proxyd"

	// Message-source label values.
	SourceClient  = "client"
	SourceBackend = "backend"

	// MethodUnknown is the placeholder method name "unknown".
	MethodUnknown = "unknown"
)

27
// Histogram bucket boundaries shared by the histogram metrics in this file.
var (
	// PayloadSizeBuckets holds the bucket upper bounds used by the request
	// and response payload-size histograms.
	PayloadSizeBuckets = []float64{
		10, 50, 100, 500, 1000, 5000, 10000, 100000, 1000000,
	}

	// MillisecondDurationBuckets holds the bucket upper bounds used by
	// histograms that record durations in milliseconds.
	MillisecondDurationBuckets = []float64{
		1, 10, 50, 100, 500, 1000, 5000, 10000, 100000,
	}
)
29

30 31 32 33 34 35 36 37 38
var (
	rpcRequestsTotal = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_requests_total",
		Help:      "Count of total client RPC requests.",
	})

	rpcForwardsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
39
		Name:      "rpc_forwards_total",
40 41
		Help:      "Count of total RPC requests forwarded to each backend.",
	}, []string{
42
		"auth",
43 44 45 46 47
		"backend_name",
		"method_name",
		"source",
	})

48 49 50 51 52 53 54 55 56
	rpcBackendHTTPResponseCodesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_backend_http_response_codes_total",
		Help:      "Count of total backend responses by HTTP status code.",
	}, []string{
		"auth",
		"backend_name",
		"method_name",
		"status_code",
57
		"batched",
58 59
	})

60 61 62 63 64
	rpcErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_errors_total",
		Help:      "Count of total RPC errors.",
	}, []string{
65
		"auth",
66 67
		"backend_name",
		"method_name",
68 69 70
		"error_code",
	})

71 72 73 74 75 76 77 78 79 80 81
	rpcSpecialErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rpc_special_errors_total",
		Help:      "Count of total special RPC errors.",
	}, []string{
		"auth",
		"backend_name",
		"method_name",
		"error_type",
	})

82 83 84 85 86 87 88 89
	rpcBackendRequestDurationSumm = promauto.NewSummaryVec(prometheus.SummaryOpts{
		Namespace:  MetricsNamespace,
		Name:       "rpc_backend_request_duration_seconds",
		Help:       "Summary of backend response times broken down by backend and method name.",
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	}, []string{
		"backend_name",
		"method_name",
90
		"batched",
91 92
	})

93
	activeClientWsConnsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
94 95 96
		Namespace: MetricsNamespace,
		Name:      "active_client_ws_conns",
		Help:      "Gauge of active client WS connections.",
97 98
	}, []string{
		"auth",
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
	})

	activeBackendWsConnsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "active_backend_ws_conns",
		Help:      "Gauge of active backend WS connections.",
	}, []string{
		"backend_name",
	})

	unserviceableRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "unserviceable_requests_total",
		Help:      "Count of total requests that were rejected due to no backends being available.",
	}, []string{
114
		"auth",
115
		"request_source",
116 117
	})

118 119 120 121 122 123 124 125
	httpResponseCodesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "http_response_codes_total",
		Help:      "Count of total HTTP response codes.",
	}, []string{
		"status_code",
	})

126 127 128 129 130 131 132 133 134 135 136 137
	httpRequestDurationSumm = promauto.NewSummary(prometheus.SummaryOpts{
		Namespace:  MetricsNamespace,
		Name:       "http_request_duration_seconds",
		Help:       "Summary of HTTP request durations, in seconds.",
		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
	})

	wsMessagesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "ws_messages_total",
		Help:      "Count of total websocket messages including protocol control.",
	}, []string{
138
		"auth",
139 140 141 142 143 144 145 146 147 148 149
		"backend_name",
		"source",
	})

	redisErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "redis_errors_total",
		Help:      "Count of total Redis errors.",
	}, []string{
		"source",
	})
150

151 152 153
	requestPayloadSizesGauge = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "request_payload_sizes",
154
		Help:      "Histogram of client request payload sizes.",
155 156 157 158 159 160 161 162
		Buckets:   PayloadSizeBuckets,
	}, []string{
		"auth",
	})

	responsePayloadSizesGauge = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "response_payload_sizes",
163
		Help:      "Histogram of client response payload sizes.",
164 165 166 167 168
		Buckets:   PayloadSizeBuckets,
	}, []string{
		"auth",
	})

169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
	cacheHitsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "cache_hits_total",
		Help:      "Number of cache hits.",
	}, []string{
		"method",
	})

	cacheMissesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "cache_misses_total",
		Help:      "Number of cache misses.",
	}, []string{
		"method",
	})

185 186 187 188 189 190 191 192
	cacheErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "cache_errors_total",
		Help:      "Number of cache errors.",
	}, []string{
		"method",
	})

193 194 195 196 197 198
	batchRPCShortCircuitsTotal = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "batch_rpc_short_circuits_total",
		Help:      "Count of total batch RPC short-circuits.",
	})

199 200 201 202 203 204
	rpcSpecialErrors = []string{
		"nonce too low",
		"gas price too high",
		"gas price too low",
		"invalid parameters",
	}
205 206 207 208 209 210 211

	redisCacheDurationSumm = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "redis_cache_duration_milliseconds",
		Help:      "Histogram of Redis command durations, in milliseconds.",
		Buckets:   MillisecondDurationBuckets,
	}, []string{"command"})
212 213 214 215 216 217 218 219

	tooManyRequestErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "too_many_request_errors_total",
		Help:      "Count of request timeouts due to too many concurrent RPCs.",
	}, []string{
		"backend_name",
	})
220 221 222 223 224 225 226 227 228 229 230 231 232 233

	batchSizeHistogram = promauto.NewHistogram(prometheus.HistogramOpts{
		Namespace: MetricsNamespace,
		Name:      "batch_size_summary",
		Help:      "Summary of batch sizes",
		Buckets: []float64{
			1,
			5,
			10,
			25,
			50,
			100,
		},
	})
234 235 236 237 238 239

	frontendRateLimitTakeErrors = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: MetricsNamespace,
		Name:      "rate_limit_take_errors",
		Help:      "Count of errors taking frontend rate limits",
	})
240

241
	consensusLatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
242
		Namespace: MetricsNamespace,
243
		Name:      "group_consensus_latest_block",
244
		Help:      "Consensus latest block",
245 246
	}, []string{
		"backend_group_name",
247 248
	})

Felipe Andrade's avatar
Felipe Andrade committed
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
	consensusSafeBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_safe_block",
		Help:      "Consensus safe block",
	}, []string{
		"backend_group_name",
	})

	consensusFinalizedBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_finalized_block",
		Help:      "Consensus finalized block",
	}, []string{
		"backend_group_name",
	})

265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
	consensusHALatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_ha_latest_block",
		Help:      "Consensus HA latest block",
	}, []string{
		"backend_group_name",
		"leader",
	})

	consensusHASafeBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_ha_safe_block",
		Help:      "Consensus HA safe block",
	}, []string{
		"backend_group_name",
		"leader",
	})

	consensusHAFinalizedBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_ha_finalized_block",
		Help:      "Consensus HA finalized block",
	}, []string{
		"backend_group_name",
		"leader",
	})

292 293 294 295 296 297 298
	backendLatestBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_latest_block",
		Help:      "Current latest block observed per backend",
	}, []string{
		"backend_name",
	})
299

Felipe Andrade's avatar
Felipe Andrade committed
300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
	backendSafeBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_safe_block",
		Help:      "Current safe block observed per backend",
	}, []string{
		"backend_name",
	})

	backendFinalizedBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_finalized_block",
		Help:      "Current finalized block observed per backend",
	}, []string{
		"backend_name",
	})

	backendUnexpectedBlockTagsBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_unexpected_block_tags",
		Help:      "Bool gauge for unexpected block tags",
	}, []string{
		"backend_name",
	})

324 325 326
	consensusGroupCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_count",
327 328 329 330 331 332 333 334
		Help:      "Consensus group serving traffic count",
	}, []string{
		"backend_group_name",
	})

	consensusGroupFilteredCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_filtered_count",
335
		Help:      "Consensus group filtered out from serving traffic count",
336 337 338 339 340 341 342 343
	}, []string{
		"backend_group_name",
	})

	consensusGroupTotalCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "group_consensus_total_count",
		Help:      "Total count of candidates to be part of consensus group",
344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
	}, []string{
		"backend_group_name",
	})

	consensusBannedBackends = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_banned",
		Help:      "Bool gauge for banned backends",
	}, []string{
		"backend_name",
	})

	consensusPeerCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_peer_count",
		Help:      "Peer count",
	}, []string{
		"backend_name",
	})

	consensusInSyncBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_in_sync",
		Help:      "Bool gauge for backends in sync",
	}, []string{
		"backend_name",
	})

	consensusUpdateDelayBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "consensus_backend_update_delay",
		Help:      "Delay (ms) for backend update",
	}, []string{
		"backend_name",
	})
379 380 381 382 383 384 385 386 387

	avgLatencyBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_avg_latency",
		Help:      "Average latency per backend",
	}, []string{
		"backend_name",
	})

388 389 390 391 392 393 394 395
	degradedBackends = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: MetricsNamespace,
		Name:      "backend_degraded",
		Help:      "Bool gauge for degraded backends",
	}, []string{
		"backend_name",
	})

Felipe Andrade's avatar
Felipe Andrade committed
396
	networkErrorRateBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
397
		Namespace: MetricsNamespace,
Felipe Andrade's avatar
Felipe Andrade committed
398 399
		Name:      "backend_error_rate",
		Help:      "Request error rate per backend",
400 401 402
	}, []string{
		"backend_name",
	})
403 404
)

405 406 407 408
// RecordRedisError increments the Redis error counter for the given source
// label value.
func RecordRedisError(source string) {
	counter := redisErrorsTotal.WithLabelValues(source)
	counter.Inc()
}

409
func RecordRPCError(ctx context.Context, backendName, method string, err error) {
410 411 412
	rpcErr, ok := err.(*RPCErr)
	var code int
	if ok {
413
		MaybeRecordSpecialRPCError(ctx, backendName, method, rpcErr)
414 415 416 417 418
		code = rpcErr.Code
	} else {
		code = -1
	}

419
	rpcErrorsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, strconv.Itoa(code)).Inc()
420 421
}

422 423
func RecordWSMessage(ctx context.Context, backendName, source string) {
	wsMessagesTotal.WithLabelValues(GetAuthCtx(ctx), backendName, source).Inc()
424 425
}

426 427
func RecordUnserviceableRequest(ctx context.Context, source string) {
	unserviceableRequestsTotal.WithLabelValues(GetAuthCtx(ctx), source).Inc()
428 429
}

430 431
func RecordRPCForward(ctx context.Context, backendName, method, source string) {
	rpcForwardsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, source).Inc()
432
}
433 434 435 436 437 438 439 440 441 442

// MaybeRecordSpecialRPCError checks whether the lower-cased message of rpcErr
// contains one of the rpcSpecialErrors substrings. On the first match it
// increments the special-error counter with that substring as the error_type
// label and stops; if nothing matches, nothing is recorded.
func MaybeRecordSpecialRPCError(ctx context.Context, backendName, method string, rpcErr *RPCErr) {
	lowered := strings.ToLower(rpcErr.Message)
	for i := range rpcSpecialErrors {
		needle := rpcSpecialErrors[i]
		if !strings.Contains(lowered, needle) {
			continue
		}
		// Only the first matching entry is counted.
		rpcSpecialErrorsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, needle).Inc()
		return
	}
}
443

444 445
func RecordRequestPayloadSize(ctx context.Context, payloadSize int) {
	requestPayloadSizesGauge.WithLabelValues(GetAuthCtx(ctx)).Observe(float64(payloadSize))
446 447 448 449 450
}

// RecordResponsePayloadSize observes payloadSize in the response payload-size
// histogram, labelled with the value of GetAuthCtx(ctx).
func RecordResponsePayloadSize(ctx context.Context, payloadSize int) {
	size := float64(payloadSize)
	observer := responsePayloadSizesGauge.WithLabelValues(GetAuthCtx(ctx))
	observer.Observe(size)
}
451 452 453 454 455 456 457 458

// RecordCacheHit increments the cache-hit counter for the given method.
func RecordCacheHit(method string) {
	hits := cacheHitsTotal.WithLabelValues(method)
	hits.Inc()
}

// RecordCacheMiss increments the cache-miss counter for the given method.
func RecordCacheMiss(method string) {
	misses := cacheMissesTotal.WithLabelValues(method)
	misses.Inc()
}
459

460
func RecordCacheError(method string) {
Felipe Andrade's avatar
Felipe Andrade committed
461
	cacheErrorsTotal.WithLabelValues(method).Inc()
462 463
}

464 465 466
// RecordBatchSize observes size in the batch-size histogram.
func RecordBatchSize(size int) {
	observed := float64(size)
	batchSizeHistogram.Observe(observed)
}
467

468 469 470 471 472 473 474 475 476 477 478 479
// RecordGroupConsensusHALatestBlock sets the consensus HA latest-block gauge
// for the given group name and leader to blockNumber.
func RecordGroupConsensusHALatestBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
	gauge := consensusHALatestBlock.WithLabelValues(group.Name, leader)
	gauge.Set(float64(blockNumber))
}

// RecordGroupConsensusHASafeBlock sets the consensus HA safe-block gauge for
// the given group name and leader to blockNumber.
func RecordGroupConsensusHASafeBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
	gauge := consensusHASafeBlock.WithLabelValues(group.Name, leader)
	gauge.Set(float64(blockNumber))
}

// RecordGroupConsensusHAFinalizedBlock sets the consensus HA finalized-block
// gauge for the given group name and leader to blockNumber.
func RecordGroupConsensusHAFinalizedBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
	gauge := consensusHAFinalizedBlock.WithLabelValues(group.Name, leader)
	gauge.Set(float64(blockNumber))
}

480 481 482 483
// RecordGroupConsensusLatestBlock sets the consensus latest-block gauge for
// the given group name to blockNumber.
func RecordGroupConsensusLatestBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusLatestBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}

Felipe Andrade's avatar
Felipe Andrade committed
484 485 486 487 488 489 490 491
// RecordGroupConsensusSafeBlock sets the consensus safe-block gauge for the
// given group name to blockNumber.
func RecordGroupConsensusSafeBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusSafeBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}

// RecordGroupConsensusFinalizedBlock sets the consensus finalized-block gauge
// for the given group name to blockNumber.
func RecordGroupConsensusFinalizedBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusFinalizedBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}

492 493 494 495
// RecordGroupConsensusCount sets the "group_consensus_count" gauge for the
// given group name to count.
func RecordGroupConsensusCount(group *BackendGroup, count int) {
	gauge := consensusGroupCount.WithLabelValues(group.Name)
	gauge.Set(float64(count))
}

496 497 498 499 500 501 502 503
// RecordGroupConsensusFilteredCount sets the "group_consensus_filtered_count"
// gauge for the given group name to count.
func RecordGroupConsensusFilteredCount(group *BackendGroup, count int) {
	gauge := consensusGroupFilteredCount.WithLabelValues(group.Name)
	gauge.Set(float64(count))
}

// RecordGroupTotalCount sets the "group_consensus_total_count" gauge for the
// given group name to count.
func RecordGroupTotalCount(group *BackendGroup, count int) {
	gauge := consensusGroupTotalCount.WithLabelValues(group.Name)
	gauge.Set(float64(count))
}

504 505
func RecordBackendLatestBlock(b *Backend, blockNumber hexutil.Uint64) {
	backendLatestBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber))
506 507
}

Felipe Andrade's avatar
Felipe Andrade committed
508 509 510 511 512 513 514 515 516
// RecordBackendSafeBlock sets the safe-block gauge for backend b (keyed by
// b.Name) to blockNumber.
func RecordBackendSafeBlock(b *Backend, blockNumber hexutil.Uint64) {
	gauge := backendSafeBlockBackend.WithLabelValues(b.Name)
	gauge.Set(float64(blockNumber))
}

// RecordBackendFinalizedBlock sets the finalized-block gauge for backend b
// (keyed by b.Name) to blockNumber.
func RecordBackendFinalizedBlock(b *Backend, blockNumber hexutil.Uint64) {
	gauge := backendFinalizedBlockBackend.WithLabelValues(b.Name)
	gauge.Set(float64(blockNumber))
}

func RecordBackendUnexpectedBlockTags(b *Backend, unexpected bool) {
Felipe Andrade's avatar
Felipe Andrade committed
517
	backendUnexpectedBlockTagsBackend.WithLabelValues(b.Name).Set(boolToFloat64(unexpected))
Felipe Andrade's avatar
Felipe Andrade committed
518 519
}

520
func RecordConsensusBackendBanned(b *Backend, banned bool) {
Felipe Andrade's avatar
Felipe Andrade committed
521
	consensusBannedBackends.WithLabelValues(b.Name).Set(boolToFloat64(banned))
522 523
}

524 525
func RecordConsensusBackendPeerCount(b *Backend, peerCount uint64) {
	consensusPeerCountBackend.WithLabelValues(b.Name).Set(float64(peerCount))
526 527
}

528
func RecordConsensusBackendInSync(b *Backend, inSync bool) {
Felipe Andrade's avatar
Felipe Andrade committed
529
	consensusInSyncBackend.WithLabelValues(b.Name).Set(boolToFloat64(inSync))
530 531
}

Felipe Andrade's avatar
Felipe Andrade committed
532 533 534 535 536 537
func RecordConsensusBackendUpdateDelay(b *Backend, lastUpdate time.Time) {
	// avoid recording the delay for the first update
	if lastUpdate.IsZero() {
		return
	}
	delay := time.Since(lastUpdate)
538 539 540
	consensusUpdateDelayBackend.WithLabelValues(b.Name).Set(float64(delay.Milliseconds()))
}

541 542
func RecordBackendNetworkLatencyAverageSlidingWindow(b *Backend, avgLatency time.Duration) {
	avgLatencyBackend.WithLabelValues(b.Name).Set(float64(avgLatency.Milliseconds()))
543
	degradedBackends.WithLabelValues(b.Name).Set(boolToFloat64(b.IsDegraded()))
544 545
}

Felipe Andrade's avatar
Felipe Andrade committed
546 547
func RecordBackendNetworkErrorRateSlidingWindow(b *Backend, rate float64) {
	networkErrorRateBackend.WithLabelValues(b.Name).Set(rate)
548 549
}

Felipe Andrade's avatar
Felipe Andrade committed
550 551 552 553 554
// boolToFloat64 converts a bool into a gauge value: 1 for true, 0 for false.
func boolToFloat64(b bool) float64 {
	var v float64 // zero value covers the false case
	if b {
		v = 1
	}
	return v
}