From a3174e7ea3f2dfa7796b07fc4b362aba59063ee5 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Fri, 13 Mar 2026 01:06:48 -0700 Subject: [PATCH 01/11] publish two basic metrics from the RPC client into Beholder: RPC latency, RPC error rate --- docs/rpc_observability.md | 47 +++++++++++ metrics/rpc_client.go | 125 ++++++++++++++++++++++++++++++ metrics/rpc_client_test.go | 32 ++++++++ multinode/go.mod | 2 + multinode/rpc_client_base.go | 38 +++++++-- multinode/rpc_client_base_test.go | 2 +- 6 files changed, 237 insertions(+), 9 deletions(-) create mode 100644 docs/rpc_observability.md create mode 100644 metrics/rpc_client.go create mode 100644 metrics/rpc_client_test.go diff --git a/docs/rpc_observability.md b/docs/rpc_observability.md new file mode 100644 index 0000000..351537d --- /dev/null +++ b/docs/rpc_observability.md @@ -0,0 +1,47 @@ +# RPC Observability (Beholder) + +RPC client metrics are published to Beholder and surface in Prometheus/Grafana when `RPCClientBase` is constructed with a non-nil `metrics.RPCClientMetrics`. + +## Metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `rpc_request_latency_ms` | Histogram | RPC call latency in milliseconds (per call) | +| `rpc_request_errors_total` | Counter | Total number of failed RPC requests | + +Labels: `env`, `network`, `chain_id`, `rpc_provider`, `call` (e.g. `latest_block`, `latest_finalized_block`). + +## Example Prometheus / Grafana Queries + +### Latency over time + +- **p99 latency by env and chain:** + ```promql + histogram_quantile(0.99, sum(rate(rpc_request_latency_ms_bucket[5m])) by (le, env, network, chain_id)) + ``` +- **p50 latency for a given environment:** + ```promql + histogram_quantile(0.5, sum(rate(rpc_request_latency_ms_bucket{env="staging"}[5m])) by (le, network, chain_id)) + ``` + +### Error rate over time + +- **Errors per second by env and chain:** + ```promql + sum(rate(rpc_request_errors_total[5m])) by (env, network, chain_id, rpc_provider) + ``` +- **Error rate for a specific RPC provider:** + ```promql + sum(rate(rpc_request_errors_total{rpc_provider="primary"}[5m])) by (env, network, chain_id) + ``` + +### Request rate + +- **Requests per second by call type:** + ```promql + sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) + ``` + +## Enabling metrics + +Create `RPCClientMetrics` with `metrics.NewRPCClientMetrics(metrics.RPCClientMetricsConfig{...})` and pass it as the last argument to `multinode.NewRPCClientBase(...)`. The follow-up interface refactor will make it easier for multinode/chain integrations to supply `env`, `network`, `chain_id`, and `rpc_provider`. diff --git a/metrics/rpc_client.go b/metrics/rpc_client.go new file mode 100644 index 0000000..02f13d7 --- /dev/null +++ b/metrics/rpc_client.go @@ -0,0 +1,125 @@ +// RPC client observability using Beholder. +// +// This file defines rpc_request_latency_ms and rpc_request_errors_total, emitted +// from the RPC client when RPCClientBase is constructed with a non-nil RPCClientMetrics. +// Metrics are queryable in Prometheus/Grafana by env, network, chain_id, and rpc_provider. +// +// Example Prometheus/Grafana queries: +// +// - Latency over time (e.g. p99 by env and chain): +// histogram_quantile(0.99, sum(rate(rpc_request_latency_ms_bucket[5m])) by (le, env, network, chain_id)) +// +// - Error rate over time (errors per second by env and chain): +// sum(rate(rpc_request_errors_total[5m])) by (env, network, chain_id, rpc_provider) +// +// - Request rate by call type: +// sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) +package metrics + +import ( + "context" + "fmt" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + "github.com/smartcontractkit/chainlink-common/pkg/beholder" +) + +const ( + // RPCRequestLatencyMs is the Beholder/Prometheus metric name for RPC call latency in milliseconds. + RPCRequestLatencyMs = "rpc_request_latency_ms" + // RPCRequestErrorsTotal is the Beholder/Prometheus metric name for total RPC call errors. + RPCRequestErrorsTotal = "rpc_request_errors_total" +) + +var ( + rpcRequestLatencyBuckets = []float64{ + 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, + } + promRPCRequestLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: RPCRequestLatencyMs, + Help: "RPC request latency in milliseconds (per call)", + Buckets: rpcRequestLatencyBuckets, + }, []string{"env", "network", "chain_id", "rpc_provider", "call"}) + promRPCRequestErrors = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: RPCRequestErrorsTotal, + Help: "Total number of failed RPC requests", + }, []string{"env", "network", "chain_id", "rpc_provider", "call"}) +) + +// RPCClientMetrics records RPC latency and error metrics for observability via Beholder/Prometheus. +// Metrics are queryable by environment, network, chain_id, and rpc_provider in Grafana. +type RPCClientMetrics interface { + // RecordRequest records latency for an RPC call. If err is non-nil, also increments the error counter. + // callName identifies the operation (e.g. "latest_block", "latest_finalized_block"). + RecordRequest(ctx context.Context, callName string, latencyMs float64, err error) +} + +var _ RPCClientMetrics = (*rpcClientMetrics)(nil) + +type rpcClientMetrics struct { + env string + network string + chainID string + rpcProvider string + latency metric.Float64Histogram + errorsTotal metric.Int64Counter +} + +// RPCClientMetricsConfig holds labels for RPC client metrics. +// Empty strings are allowed; they will still be emitted as labels for filtering. +type RPCClientMetricsConfig struct { + Env string // e.g. "staging", "production" + Network string // chain/network name + ChainID string // chain ID + RPCProvider string // RPC provider or node name (optional) +} + +// NewRPCClientMetrics creates RPC client metrics that publish to Beholder and Prometheus. +// Callers (e.g. chain-specific RPC clients or multinode) should pass env, network, chainID, and optionally rpcProvider +// so metrics can be queried in Grafana by environment, chain/network, and RPC provider. +func NewRPCClientMetrics(cfg RPCClientMetricsConfig) (RPCClientMetrics, error) { + latency, err := beholder.GetMeter().Float64Histogram(RPCRequestLatencyMs) + if err != nil { + return nil, fmt.Errorf("failed to register RPC request latency metric: %w", err) + } + errorsTotal, err := beholder.GetMeter().Int64Counter(RPCRequestErrorsTotal) + if err != nil { + return nil, fmt.Errorf("failed to register RPC request errors metric: %w", err) + } + return &rpcClientMetrics{ + env: cfg.Env, + network: cfg.Network, + chainID: cfg.ChainID, + rpcProvider: cfg.RPCProvider, + latency: latency, + errorsTotal: errorsTotal, + }, nil +} + +func (m *rpcClientMetrics) RecordRequest(ctx context.Context, callName string, latencyMs float64, err error) { + attrs := metric.WithAttributes( + attribute.String("env", m.env), + attribute.String("network", m.network), + attribute.String("chain_id", m.chainID), + attribute.String("rpc_provider", m.rpcProvider), + attribute.String("call", callName), + ) + promRPCRequestLatency.WithLabelValues(m.env, m.network, m.chainID, m.rpcProvider, callName).Observe(latencyMs) + m.latency.Record(ctx, latencyMs, attrs) + if err != nil { + promRPCRequestErrors.WithLabelValues(m.env, m.network, m.chainID, m.rpcProvider, callName).Inc() + m.errorsTotal.Add(ctx, 1, attrs) + } +} + +// NoopRPCClientMetrics is a no-op implementation for when metrics are disabled. +type NoopRPCClientMetrics struct{} + +func (NoopRPCClientMetrics) RecordRequest(context.Context, string, float64, error) {} + +// Ensure NoopRPCClientMetrics implements RPCClientMetrics. +var _ RPCClientMetrics = NoopRPCClientMetrics{} diff --git a/metrics/rpc_client_test.go b/metrics/rpc_client_test.go new file mode 100644 index 0000000..df16e59 --- /dev/null +++ b/metrics/rpc_client_test.go @@ -0,0 +1,32 @@ +package metrics + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNewRPCClientMetrics(t *testing.T) { + m, err := NewRPCClientMetrics(RPCClientMetricsConfig{ + Env: "staging", + Network: "ethereum", + ChainID: "1", + RPCProvider: "primary", + }) + require.NoError(t, err) + require.NotNil(t, m) + + ctx := context.Background() + m.RecordRequest(ctx, "latest_block", 100.0, nil) + m.RecordRequest(ctx, "latest_block", 50.0, errors.New("rpc error")) +} + +func TestNoopRPCClientMetrics_RecordRequest(t *testing.T) { + var m NoopRPCClientMetrics + ctx := context.Background() + m.RecordRequest(ctx, "latest_block", 100.0, nil) + m.RecordRequest(ctx, "latest_block", 50.0, errors.New("rpc error")) + // Noop should not panic +} diff --git a/multinode/go.mod b/multinode/go.mod index c18efc1..ab79704 100644 --- a/multinode/go.mod +++ b/multinode/go.mod @@ -2,6 +2,8 @@ module github.com/smartcontractkit/chainlink-framework/multinode go 1.25.3 +replace github.com/smartcontractkit/chainlink-framework/metrics => ../metrics + require ( github.com/jpillora/backoff v1.0.0 github.com/pkg/errors v0.9.1 diff --git a/multinode/rpc_client_base.go b/multinode/rpc_client_base.go index b4a886c..2e490aa 100644 --- a/multinode/rpc_client_base.go +++ b/multinode/rpc_client_base.go @@ -9,6 +9,8 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/logger" "github.com/smartcontractkit/chainlink-common/pkg/services" + + "github.com/smartcontractkit/chainlink-framework/metrics" ) type RPCClientBaseConfig interface { @@ -46,12 +48,18 @@ type RPCClientBase[HEAD Head] struct { highestUserObservations ChainInfo // most recent chain info observed during current lifecycle latestChainInfo ChainInfo + + // rpcMetrics is optional; when set, RPC latency and errors are reported to Beholder/Prometheus. + rpcMetrics metrics.RPCClientMetrics } +// NewRPCClientBase creates an RPC client base. rpcMetrics is optional; when non-nil, +// latency and error metrics are emitted to Beholder for observability in Prometheus/Grafana. func NewRPCClientBase[HEAD Head]( cfg RPCClientBaseConfig, ctxTimeout time.Duration, log logger.Logger, latestBlock func(ctx context.Context) (HEAD, error), latestFinalizedBlock func(ctx context.Context) (HEAD, error), + rpcMetrics metrics.RPCClientMetrics, ) *RPCClientBase[HEAD] { return &RPCClientBase[HEAD]{ cfg: cfg, @@ -61,6 +69,7 @@ func NewRPCClientBase[HEAD Head]( latestFinalizedBlock: latestFinalizedBlock, subs: make(map[Subscription]struct{}), lifeCycleCh: make(chan struct{}), + rpcMetrics: rpcMetrics, } } @@ -151,20 +160,29 @@ func (m *RPCClientBase[HEAD]) SubscribeToFinalizedHeads(ctx context.Context) (<- return channel, sub, nil } +const ( + callLatestBlock = "latest_block" + callLatestFinalizedBlock = "latest_finalized_block" +) + func (m *RPCClientBase[HEAD]) LatestBlock(ctx context.Context) (HEAD, error) { // capture lifeCycleCh to ensure we are not updating chainInfo with observations related to previous life cycle ctx, cancel, lifeCycleCh := m.AcquireQueryCtx(ctx, m.ctxTimeout) defer cancel() + start := time.Now() head, err := m.latestBlock(ctx) + latencyMs := float64(time.Since(start).Milliseconds()) + if err == nil && !head.IsValid() { + err = errors.New("invalid head") + } + if m.rpcMetrics != nil { + m.rpcMetrics.RecordRequest(ctx, callLatestBlock, latencyMs, err) + } if err != nil { return head, err } - if !head.IsValid() { - return head, errors.New("invalid head") - } - m.OnNewHead(ctx, lifeCycleCh, head) return head, nil } @@ -173,15 +191,19 @@ func (m *RPCClientBase[HEAD]) LatestFinalizedBlock(ctx context.Context) (HEAD, e ctx, cancel, lifeCycleCh := m.AcquireQueryCtx(ctx, m.ctxTimeout) defer cancel() + start := time.Now() head, err := m.latestFinalizedBlock(ctx) + latencyMs := float64(time.Since(start).Milliseconds()) + if err == nil && !head.IsValid() { + err = errors.New("invalid head") + } + if m.rpcMetrics != nil { + m.rpcMetrics.RecordRequest(ctx, callLatestFinalizedBlock, latencyMs, err) + } if err != nil { return head, err } - if !head.IsValid() { - return head, errors.New("invalid head") - } - m.OnNewFinalizedHead(ctx, lifeCycleCh, head) return head, nil } diff --git a/multinode/rpc_client_base_test.go b/multinode/rpc_client_base_test.go index 25afd4d..033791e 100644 --- a/multinode/rpc_client_base_test.go +++ b/multinode/rpc_client_base_test.go @@ -67,7 +67,7 @@ func newTestRPC(t *testing.T) *testRPC { } rpc := &testRPC{} - rpc.RPCClientBase = NewRPCClientBase[*testHead](cfg, requestTimeout, lggr, rpc.latestBlock, rpc.latestBlock) + rpc.RPCClientBase = NewRPCClientBase[*testHead](cfg, requestTimeout, lggr, rpc.latestBlock, rpc.latestBlock, nil) t.Cleanup(rpc.Close) return rpc } From 24a83d364b9f427d76180042226fac73ce7cf8e1 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Fri, 13 Mar 2026 01:19:46 -0700 Subject: [PATCH 02/11] update --- metrics/multinode.go | 48 +++++++++++++++++++++---------------------- metrics/rpc_client.go | 12 +++++------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/metrics/multinode.go b/metrics/multinode.go index c278913..5bc1ff5 100644 --- a/metrics/multinode.go +++ b/metrics/multinode.go @@ -135,29 +135,29 @@ type GenericMultiNodeMetrics interface { var _ GenericMultiNodeMetrics = &multiNodeMetrics{} type multiNodeMetrics struct { - network string - chainID string - nodeStates metric.Int64Gauge - nodeClientVersion metric.Int64Gauge - nodeVerifies metric.Int64Counter - nodeVerifiesFailed metric.Int64Counter - nodeVerifiesSuccess metric.Int64Counter - nodeTransitionsToAlive metric.Int64Counter - nodeTransitionsToInSync metric.Int64Counter - nodeTransitionsToOutOfSync metric.Int64Counter - nodeTransitionsToUnreachable metric.Int64Counter - nodeTransitionsToInvalidChainID metric.Int64Counter - nodeTransitionsToUnusable metric.Int64Counter - nodeTransitionsToSyncing metric.Int64Counter - highestSeenBlock metric.Int64Gauge - highestFinalizedBlock metric.Int64Gauge - seenBlocks metric.Int64Counter - polls metric.Int64Counter - pollsFailed metric.Int64Counter - pollsSuccess metric.Int64Counter - finalizedStateFailed metric.Int64Counter - nodeTransitionsToFinalizedStateNotAvailable metric.Int64Counter - invariantViolations metric.Int64Counter + network string + chainID string + nodeStates metric.Int64Gauge + nodeClientVersion metric.Int64Gauge + nodeVerifies metric.Int64Counter + nodeVerifiesFailed metric.Int64Counter + nodeVerifiesSuccess metric.Int64Counter + nodeTransitionsToAlive metric.Int64Counter + nodeTransitionsToInSync metric.Int64Counter + nodeTransitionsToOutOfSync metric.Int64Counter + nodeTransitionsToUnreachable metric.Int64Counter + nodeTransitionsToInvalidChainID metric.Int64Counter + nodeTransitionsToUnusable metric.Int64Counter + nodeTransitionsToSyncing metric.Int64Counter + highestSeenBlock metric.Int64Gauge + highestFinalizedBlock metric.Int64Gauge + seenBlocks metric.Int64Counter + polls metric.Int64Counter + pollsFailed metric.Int64Counter + pollsSuccess metric.Int64Counter + finalizedStateFailed metric.Int64Counter + nodeTransitionsToFinalizedStateNotAvailable metric.Int64Counter + invariantViolations metric.Int64Counter } func NewGenericMultiNodeMetrics(network string, chainID string) (GenericMultiNodeMetrics, error) { @@ -289,7 +289,7 @@ func NewGenericMultiNodeMetrics(network string, chainID string) (GenericMultiNod pollsSuccess: pollsSuccess, finalizedStateFailed: finalizedStateFailed, nodeTransitionsToFinalizedStateNotAvailable: nodeTransitionsToFinalizedStateNotAvailable, - invariantViolations: invariantViolations, + invariantViolations: invariantViolations, }, nil } diff --git a/metrics/rpc_client.go b/metrics/rpc_client.go index 02f13d7..b803ad3 100644 --- a/metrics/rpc_client.go +++ b/metrics/rpc_client.go @@ -61,12 +61,12 @@ type RPCClientMetrics interface { var _ RPCClientMetrics = (*rpcClientMetrics)(nil) type rpcClientMetrics struct { - env string - network string - chainID string - rpcProvider string - latency metric.Float64Histogram - errorsTotal metric.Int64Counter + env string + network string + chainID string + rpcProvider string + latency metric.Float64Histogram + errorsTotal metric.Int64Counter } // RPCClientMetricsConfig holds labels for RPC client metrics. From 2f3fab6d9719e8857d0b8ca892456337d50a95b8 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 16 Mar 2026 14:38:51 -0700 Subject: [PATCH 03/11] update --- multinode/go.mod | 1 - 1 file changed, 1 deletion(-) diff --git a/multinode/go.mod b/multinode/go.mod index ab79704..2f7c9d0 100644 --- a/multinode/go.mod +++ b/multinode/go.mod @@ -2,7 +2,6 @@ module github.com/smartcontractkit/chainlink-framework/multinode go 1.25.3 -replace github.com/smartcontractkit/chainlink-framework/metrics => ../metrics require ( github.com/jpillora/backoff v1.0.0 From b892ff41d40c3937f9e1896552a1315a237002d2 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 16 Mar 2026 15:10:05 -0700 Subject: [PATCH 04/11] update --- multinode/go.mod | 1 + multinode/go.sum | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/multinode/go.mod b/multinode/go.mod index 2f7c9d0..ab79704 100644 --- a/multinode/go.mod +++ b/multinode/go.mod @@ -2,6 +2,7 @@ module github.com/smartcontractkit/chainlink-framework/multinode go 1.25.3 +replace github.com/smartcontractkit/chainlink-framework/metrics => ../metrics require ( github.com/jpillora/backoff v1.0.0 diff --git a/multinode/go.sum b/multinode/go.sum index 459485f..9ec295e 100644 --- a/multinode/go.sum +++ b/multinode/go.sum @@ -80,8 +80,6 @@ github.com/smartcontractkit/chainlink-common v0.10.1-0.20260305114348-b8bbac30bf github.com/smartcontractkit/chainlink-common v0.10.1-0.20260305114348-b8bbac30bfc7/go.mod h1:0ghbAr7tRO0tT5ZqBXhOyzgUO37tNNe33Yn0hskauVM= github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10 h1:FJAFgXS9oqASnkS03RE1HQwYQQxrO4l46O5JSzxqLgg= github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10/go.mod h1:oiDa54M0FwxevWwyAX773lwdWvFYYlYHHQV1LQ5HpWY= -github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20250717121125-2350c82883e2 h1:ysZjKH+BpWlQhF93kr/Lc668UlCvT9NjfcsGdZT19I8= -github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20250717121125-2350c82883e2/go.mod h1:jo+cUqNcHwN8IF7SInQNXDZ8qzBsyMpnLdYbDswviFc= github.com/smartcontractkit/freeport v0.1.3-0.20250716200817-cb5dfd0e369e h1:Hv9Mww35LrufCdM9wtS9yVi/rEWGI1UnjHbcKKU0nVY= github.com/smartcontractkit/freeport v0.1.3-0.20250716200817-cb5dfd0e369e/go.mod h1:T4zH9R8R8lVWKfU7tUvYz2o2jMv1OpGCdpY2j2QZXzU= github.com/smartcontractkit/libocr v0.0.0-20250912173940-f3ab0246e23d h1:LokA9PoCNb8mm8mDT52c3RECPMRsGz1eCQORq+J3n74= From b5471e0f30662547304f9bb2a250e4744370f2d7 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 16 Mar 2026 21:25:43 -0700 Subject: [PATCH 05/11] revert changes --- docs/rpc_observability.md | 7 +++++++ multinode/go.mod | 2 -- multinode/rpc_client_base.go | 25 +------------------------ multinode/rpc_client_base_test.go | 2 +- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/docs/rpc_observability.md b/docs/rpc_observability.md index 351537d..ab981c5 100644 --- a/docs/rpc_observability.md +++ b/docs/rpc_observability.md @@ -45,3 +45,10 @@ Labels: `env`, `network`, `chain_id`, `rpc_provider`, `call` (e.g. `latest_block ## Enabling metrics Create `RPCClientMetrics` with `metrics.NewRPCClientMetrics(metrics.RPCClientMetricsConfig{...})` and pass it as the last argument to `multinode.NewRPCClientBase(...)`. The follow-up interface refactor will make it easier for multinode/chain integrations to supply `env`, `network`, `chain_id`, and `rpc_provider`. + +## Follow-up: multinode integration (PR 2) + +After the metrics module changes are merged, a second PR will: + +1. Update `multinode/go.mod`: bump `github.com/smartcontractkit/chainlink-framework/metrics` to the new version that includes `RPCClientMetrics`. +2. Add RPC metrics support in multinode: add optional `rpcMetrics metrics.RPCClientMetrics` to `RPCClientBase` and `NewRPCClientBase`, and call `RecordRequest` in `LatestBlock` and `LatestFinalizedBlock` (with latency and error recording). diff --git a/multinode/go.mod b/multinode/go.mod index ab79704..c18efc1 100644 --- a/multinode/go.mod +++ b/multinode/go.mod @@ -2,8 +2,6 @@ module github.com/smartcontractkit/chainlink-framework/multinode go 1.25.3 -replace github.com/smartcontractkit/chainlink-framework/metrics => ../metrics - require ( github.com/jpillora/backoff v1.0.0 github.com/pkg/errors v0.9.1 diff --git a/multinode/rpc_client_base.go b/multinode/rpc_client_base.go index 2e490aa..fd6e622 100644 --- a/multinode/rpc_client_base.go +++ b/multinode/rpc_client_base.go @@ -9,8 +9,6 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/logger" "github.com/smartcontractkit/chainlink-common/pkg/services" - - "github.com/smartcontractkit/chainlink-framework/metrics" ) type RPCClientBaseConfig interface { @@ -48,18 +46,13 @@ type RPCClientBase[HEAD Head] struct { highestUserObservations ChainInfo // most recent chain info observed during current lifecycle latestChainInfo ChainInfo - - // rpcMetrics is optional; when set, RPC latency and errors are reported to Beholder/Prometheus. - rpcMetrics metrics.RPCClientMetrics } -// NewRPCClientBase creates an RPC client base. rpcMetrics is optional; when non-nil, -// latency and error metrics are emitted to Beholder for observability in Prometheus/Grafana. +// NewRPCClientBase creates an RPC client base. func NewRPCClientBase[HEAD Head]( cfg RPCClientBaseConfig, ctxTimeout time.Duration, log logger.Logger, latestBlock func(ctx context.Context) (HEAD, error), latestFinalizedBlock func(ctx context.Context) (HEAD, error), - rpcMetrics metrics.RPCClientMetrics, ) *RPCClientBase[HEAD] { return &RPCClientBase[HEAD]{ cfg: cfg, @@ -69,7 +62,6 @@ func NewRPCClientBase[HEAD Head]( latestFinalizedBlock: latestFinalizedBlock, subs: make(map[Subscription]struct{}), lifeCycleCh: make(chan struct{}), - rpcMetrics: rpcMetrics, } } @@ -160,25 +152,15 @@ func (m *RPCClientBase[HEAD]) SubscribeToFinalizedHeads(ctx context.Context) (<- return channel, sub, nil } -const ( - callLatestBlock = "latest_block" - callLatestFinalizedBlock = "latest_finalized_block" -) - func (m *RPCClientBase[HEAD]) LatestBlock(ctx context.Context) (HEAD, error) { // capture lifeCycleCh to ensure we are not updating chainInfo with observations related to previous life cycle ctx, cancel, lifeCycleCh := m.AcquireQueryCtx(ctx, m.ctxTimeout) defer cancel() - start := time.Now() head, err := m.latestBlock(ctx) - latencyMs := float64(time.Since(start).Milliseconds()) if err == nil && !head.IsValid() { err = errors.New("invalid head") } - if m.rpcMetrics != nil { - m.rpcMetrics.RecordRequest(ctx, callLatestBlock, latencyMs, err) - } if err != nil { return head, err } @@ -191,15 +173,10 @@ func (m *RPCClientBase[HEAD]) LatestFinalizedBlock(ctx context.Context) (HEAD, e ctx, cancel, lifeCycleCh := m.AcquireQueryCtx(ctx, m.ctxTimeout) defer cancel() - start := time.Now() head, err := m.latestFinalizedBlock(ctx) - latencyMs := float64(time.Since(start).Milliseconds()) if err == nil && !head.IsValid() { err = errors.New("invalid head") } - if m.rpcMetrics != nil { - m.rpcMetrics.RecordRequest(ctx, callLatestFinalizedBlock, latencyMs, err) - } if err != nil { return head, err } diff --git a/multinode/rpc_client_base_test.go b/multinode/rpc_client_base_test.go index 033791e..25afd4d 100644 --- a/multinode/rpc_client_base_test.go +++ b/multinode/rpc_client_base_test.go @@ -67,7 +67,7 @@ func newTestRPC(t *testing.T) *testRPC { } rpc := &testRPC{} - rpc.RPCClientBase = NewRPCClientBase[*testHead](cfg, requestTimeout, lggr, rpc.latestBlock, rpc.latestBlock, nil) + rpc.RPCClientBase = NewRPCClientBase[*testHead](cfg, requestTimeout, lggr, rpc.latestBlock, rpc.latestBlock) t.Cleanup(rpc.Close) return rpc } From 0b3bc02848800569cfbb3e801708d521b7943dd4 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 16 Mar 2026 21:40:04 -0700 Subject: [PATCH 06/11] update --- multinode/go.sum | 2 ++ multinode/rpc_client_base.go | 15 ++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/multinode/go.sum b/multinode/go.sum index 9ec295e..459485f 100644 --- a/multinode/go.sum +++ b/multinode/go.sum @@ -80,6 +80,8 @@ github.com/smartcontractkit/chainlink-common v0.10.1-0.20260305114348-b8bbac30bf github.com/smartcontractkit/chainlink-common v0.10.1-0.20260305114348-b8bbac30bfc7/go.mod h1:0ghbAr7tRO0tT5ZqBXhOyzgUO37tNNe33Yn0hskauVM= github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10 h1:FJAFgXS9oqASnkS03RE1HQwYQQxrO4l46O5JSzxqLgg= github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10/go.mod h1:oiDa54M0FwxevWwyAX773lwdWvFYYlYHHQV1LQ5HpWY= +github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20250717121125-2350c82883e2 h1:ysZjKH+BpWlQhF93kr/Lc668UlCvT9NjfcsGdZT19I8= +github.com/smartcontractkit/chainlink-framework/metrics v0.0.0-20250717121125-2350c82883e2/go.mod h1:jo+cUqNcHwN8IF7SInQNXDZ8qzBsyMpnLdYbDswviFc= github.com/smartcontractkit/freeport v0.1.3-0.20250716200817-cb5dfd0e369e h1:Hv9Mww35LrufCdM9wtS9yVi/rEWGI1UnjHbcKKU0nVY= github.com/smartcontractkit/freeport v0.1.3-0.20250716200817-cb5dfd0e369e/go.mod h1:T4zH9R8R8lVWKfU7tUvYz2o2jMv1OpGCdpY2j2QZXzU= github.com/smartcontractkit/libocr v0.0.0-20250912173940-f3ab0246e23d h1:LokA9PoCNb8mm8mDT52c3RECPMRsGz1eCQORq+J3n74= diff --git a/multinode/rpc_client_base.go b/multinode/rpc_client_base.go index fd6e622..b4a886c 100644 --- a/multinode/rpc_client_base.go +++ b/multinode/rpc_client_base.go @@ -48,7 +48,6 @@ type RPCClientBase[HEAD Head] struct { latestChainInfo ChainInfo } -// NewRPCClientBase creates an RPC client base. func NewRPCClientBase[HEAD Head]( cfg RPCClientBaseConfig, ctxTimeout time.Duration, log logger.Logger, latestBlock func(ctx context.Context) (HEAD, error), @@ -158,13 +157,14 @@ func (m *RPCClientBase[HEAD]) LatestBlock(ctx context.Context) (HEAD, error) { defer cancel() head, err := m.latestBlock(ctx) - if err == nil && !head.IsValid() { - err = errors.New("invalid head") - } if err != nil { return head, err } + if !head.IsValid() { + return head, errors.New("invalid head") + } + m.OnNewHead(ctx, lifeCycleCh, head) return head, nil } @@ -174,13 +174,14 @@ func (m *RPCClientBase[HEAD]) LatestFinalizedBlock(ctx context.Context) (HEAD, e defer cancel() head, err := m.latestFinalizedBlock(ctx) - if err == nil && !head.IsValid() { - err = errors.New("invalid head") - } if err != nil { return head, err } + if !head.IsValid() { + return head, errors.New("invalid head") + } + m.OnNewFinalizedHead(ctx, lifeCycleCh, head) return head, nil } From 65f6952d70654f139a3da333f41b67f1e23afd84 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 16 Mar 2026 21:59:56 -0700 Subject: [PATCH 07/11] update --- metrics/rpc_client.go | 2 ++ metrics/rpc_client_test.go | 1 + 2 files changed, 3 insertions(+) diff --git a/metrics/rpc_client.go b/metrics/rpc_client.go index b803ad3..a16f1df 100644 --- a/metrics/rpc_client.go +++ b/metrics/rpc_client.go @@ -14,6 +14,8 @@ // // - Request rate by call type: // sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) +// +//nolint:revive package metrics import ( diff --git a/metrics/rpc_client_test.go b/metrics/rpc_client_test.go index df16e59..c085257 100644 --- a/metrics/rpc_client_test.go +++ b/metrics/rpc_client_test.go @@ -1,3 +1,4 @@ +//nolint:revive package metrics import ( From 0e4e90297f39b1c86e830ce6b2ff908d4a5ccdd8 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 16 Mar 2026 22:19:19 -0700 Subject: [PATCH 08/11] update --- metrics/rpc_client.go | 1 - metrics/rpc_client_test.go | 1 - 2 files changed, 2 deletions(-) diff --git a/metrics/rpc_client.go b/metrics/rpc_client.go index a16f1df..a17a2ef 100644 --- a/metrics/rpc_client.go +++ b/metrics/rpc_client.go @@ -15,7 +15,6 @@ // - Request rate by call type: // sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) // -//nolint:revive package metrics import ( diff --git a/metrics/rpc_client_test.go b/metrics/rpc_client_test.go index c085257..df16e59 100644 --- a/metrics/rpc_client_test.go +++ b/metrics/rpc_client_test.go @@ -1,4 +1,3 @@ -//nolint:revive package metrics import ( From cd5fc9440df8a19a08d93c4239d2a2c033ea95d0 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 16 Mar 2026 23:08:34 -0700 Subject: [PATCH 09/11] update --- metrics/rpc_client.go | 1 - 1 file changed, 1 deletion(-) diff --git a/metrics/rpc_client.go b/metrics/rpc_client.go index a17a2ef..b803ad3 100644 --- a/metrics/rpc_client.go +++ b/metrics/rpc_client.go @@ -14,7 +14,6 @@ // // - Request rate by call type: // sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) -// package metrics import ( From 7a913aab8e1b733e4bf8b04b7882560c1941f6c2 Mon Sep 17 00:00:00 2001 From: Larry Li Date: Thu, 19 Mar 2026 20:56:59 -0700 Subject: [PATCH 10/11] update --- docs/rpc_observability.md | 54 -------- metrics/client.go | 105 ++++++++++++++- .../{rpc_client_test.go => client_test.go} | 16 +-- metrics/rpc_client.go | 125 ------------------ 4 files changed, 112 insertions(+), 188 deletions(-) delete mode 100644 docs/rpc_observability.md rename metrics/{rpc_client_test.go => client_test.go} (52%) delete mode 100644 metrics/rpc_client.go diff --git a/docs/rpc_observability.md b/docs/rpc_observability.md deleted file mode 100644 index ab981c5..0000000 --- a/docs/rpc_observability.md +++ /dev/null @@ -1,54 +0,0 @@ -# RPC Observability (Beholder) - -RPC client metrics are published to Beholder and surface in Prometheus/Grafana when `RPCClientBase` is constructed with a non-nil `metrics.RPCClientMetrics`. - -## Metrics - -| Metric | Type | Description | -|--------|------|-------------| -| `rpc_request_latency_ms` | Histogram | RPC call latency in milliseconds (per call) | -| `rpc_request_errors_total` | Counter | Total number of failed RPC requests | - -Labels: `env`, `network`, `chain_id`, `rpc_provider`, `call` (e.g. `latest_block`, `latest_finalized_block`). - -## Example Prometheus / Grafana Queries - -### Latency over time - -- **p99 latency by env and chain:** - ```promql - histogram_quantile(0.99, sum(rate(rpc_request_latency_ms_bucket[5m])) by (le, env, network, chain_id)) - ``` -- **p50 latency for a given environment:** - ```promql - histogram_quantile(0.5, sum(rate(rpc_request_latency_ms_bucket{env="staging"}[5m])) by (le, network, chain_id)) - ``` - -### Error rate over time - -- **Errors per second by env and chain:** - ```promql - sum(rate(rpc_request_errors_total[5m])) by (env, network, chain_id, rpc_provider) - ``` -- **Error rate for a specific RPC provider:** - ```promql - sum(rate(rpc_request_errors_total{rpc_provider="primary"}[5m])) by (env, network, chain_id) - ``` - -### Request rate - -- **Requests per second by call type:** - ```promql - sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) - ``` - -## Enabling metrics - -Create `RPCClientMetrics` with `metrics.NewRPCClientMetrics(metrics.RPCClientMetricsConfig{...})` and pass it as the last argument to `multinode.NewRPCClientBase(...)`. The follow-up interface refactor will make it easier for multinode/chain integrations to supply `env`, `network`, `chain_id`, and `rpc_provider`. - -## Follow-up: multinode integration (PR 2) - -After the metrics module changes are merged, a second PR will: - -1. Update `multinode/go.mod`: bump `github.com/smartcontractkit/chainlink-framework/metrics` to the new version that includes `RPCClientMetrics`. -2. Add RPC metrics support in multinode: add optional `rpcMetrics metrics.RPCClientMetrics` to `RPCClientBase` and `NewRPCClientBase`, and call `RecordRequest` in `LatestBlock` and `LatestFinalizedBlock` (with latency and error recording). diff --git a/metrics/client.go b/metrics/client.go index cdca9a7..cf2f2ab 100644 --- a/metrics/client.go +++ b/metrics/client.go @@ -1,16 +1,23 @@ package metrics import ( + "context" + "fmt" + "strconv" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + "github.com/smartcontractkit/chainlink-common/pkg/beholder" ) var ( RPCCallLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "rpc_call_latency", - Help: "The duration of an RPC call in milliseconds", + Help: "The duration of an RPC call in seconds", Buckets: []float64{ float64(50 * time.Millisecond), float64(100 * time.Millisecond), @@ -22,4 +29,100 @@ var ( float64(8 * time.Second), }, }, []string{"chainFamily", "chainID", "rpcUrl", "isSendOnly", "success", "rpcCallName"}) + + RPCCallErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "rpc_call_errors_total", + Help: "The total number of failed RPC calls", + }, []string{"chainFamily", "chainID", "rpcUrl", "isSendOnly", "rpcCallName"}) +) + +const ( + rpcCallLatencyBeholder = "rpc_call_latency" + rpcCallErrorsTotalBeholder = "rpc_call_errors_total" ) + +// RPCClientMetrics records RPC latency and errors to Prometheus and Beholder (same pattern as multinode metrics). +type RPCClientMetrics interface { + // RecordRequest records latency for an RPC call (observed in seconds for Prometheus). + // If err is non-nil, increments rpc_call_errors_total. + RecordRequest(ctx context.Context, callName string, latency time.Duration, err error) +} + +var _ RPCClientMetrics = (*rpcClientMetrics)(nil) + +type rpcClientMetrics struct { + chainFamily string + chainID string + rpcURL string + isSendOnly bool + latency metric.Float64Histogram + errorsTotal metric.Int64Counter +} + +// RPCClientMetricsConfig holds fixed labels for an RPC client instance. +type RPCClientMetricsConfig struct { + ChainFamily string + ChainID string + RPCURL string + IsSendOnly bool +} + +// NewRPCClientMetrics creates RPC client metrics that publish to Prometheus and Beholder. +func NewRPCClientMetrics(cfg RPCClientMetricsConfig) (RPCClientMetrics, error) { + latency, err := beholder.GetMeter().Float64Histogram(rpcCallLatencyBeholder) + if err != nil { + return nil, fmt.Errorf("failed to register RPC call latency metric: %w", err) + } + errorsTotal, err := beholder.GetMeter().Int64Counter(rpcCallErrorsTotalBeholder) + if err != nil { + return nil, fmt.Errorf("failed to register RPC call errors metric: %w", err) + } + return &rpcClientMetrics{ + chainFamily: cfg.ChainFamily, + chainID: cfg.ChainID, + rpcURL: cfg.RPCURL, + isSendOnly: cfg.IsSendOnly, + latency: latency, + errorsTotal: errorsTotal, + }, nil +} + +func (m *rpcClientMetrics) RecordRequest(ctx context.Context, callName string, latency time.Duration, err error) { + successStr := "true" + if err != nil { + successStr = "false" + } + sendStr := strconv.FormatBool(m.isSendOnly) + sec := latency.Seconds() + + RPCCallLatency.WithLabelValues(m.chainFamily, m.chainID, m.rpcURL, sendStr, successStr, callName).Observe(sec) + + latAttrs := metric.WithAttributes( + attribute.String("chainFamily", m.chainFamily), + attribute.String("chainID", m.chainID), + attribute.String("rpcUrl", m.rpcURL), + attribute.String("isSendOnly", sendStr), + attribute.String("success", successStr), + attribute.String("rpcCallName", callName), + ) + m.latency.Record(ctx, sec, latAttrs) + + if err != nil { + RPCCallErrorsTotal.WithLabelValues(m.chainFamily, m.chainID, m.rpcURL, sendStr, callName).Inc() + errAttrs := metric.WithAttributes( + attribute.String("chainFamily", m.chainFamily), + attribute.String("chainID", m.chainID), + attribute.String("rpcUrl", m.rpcURL), + attribute.String("isSendOnly", sendStr), + attribute.String("rpcCallName", callName), + ) + m.errorsTotal.Add(ctx, 1, errAttrs) + } +} + +// NoopRPCClientMetrics is a no-op implementation for when metrics are disabled. +type NoopRPCClientMetrics struct{} + +func (NoopRPCClientMetrics) RecordRequest(context.Context, string, time.Duration, error) {} + +var _ RPCClientMetrics = NoopRPCClientMetrics{} diff --git a/metrics/rpc_client_test.go b/metrics/client_test.go similarity index 52% rename from metrics/rpc_client_test.go rename to metrics/client_test.go index df16e59..5eb5427 100644 --- a/metrics/rpc_client_test.go +++ b/metrics/client_test.go @@ -4,29 +4,29 @@ import ( "context" "errors" "testing" + "time" "github.com/stretchr/testify/require" ) func TestNewRPCClientMetrics(t *testing.T) { m, err := NewRPCClientMetrics(RPCClientMetricsConfig{ - Env: "staging", - Network: "ethereum", + ChainFamily: "evm", ChainID: "1", - RPCProvider: "primary", + RPCURL: "http://localhost:8545", + IsSendOnly: false, }) require.NoError(t, err) require.NotNil(t, m) ctx := context.Background() - m.RecordRequest(ctx, "latest_block", 100.0, nil) - m.RecordRequest(ctx, "latest_block", 50.0, errors.New("rpc error")) + m.RecordRequest(ctx, "latest_block", 100*time.Millisecond, nil) + m.RecordRequest(ctx, "latest_block", 50*time.Millisecond, errors.New("rpc error")) } func TestNoopRPCClientMetrics_RecordRequest(t *testing.T) { var m NoopRPCClientMetrics ctx := context.Background() - m.RecordRequest(ctx, "latest_block", 100.0, nil) - m.RecordRequest(ctx, "latest_block", 50.0, errors.New("rpc error")) - // Noop should not panic + m.RecordRequest(ctx, "latest_block", 100*time.Millisecond, nil) + m.RecordRequest(ctx, "latest_block", 50*time.Millisecond, errors.New("rpc error")) } diff --git a/metrics/rpc_client.go b/metrics/rpc_client.go deleted file mode 100644 index b803ad3..0000000 --- a/metrics/rpc_client.go +++ /dev/null @@ -1,125 +0,0 @@ -// RPC client observability using Beholder. -// -// This file defines rpc_request_latency_ms and rpc_request_errors_total, emitted -// from the RPC client when RPCClientBase is constructed with a non-nil RPCClientMetrics. -// Metrics are queryable in Prometheus/Grafana by env, network, chain_id, and rpc_provider. -// -// Example Prometheus/Grafana queries: -// -// - Latency over time (e.g. p99 by env and chain): -// histogram_quantile(0.99, sum(rate(rpc_request_latency_ms_bucket[5m])) by (le, env, network, chain_id)) -// -// - Error rate over time (errors per second by env and chain): -// sum(rate(rpc_request_errors_total[5m])) by (env, network, chain_id, rpc_provider) -// -// - Request rate by call type: -// sum(rate(rpc_request_latency_ms_count[5m])) by (call, env, network) -package metrics - -import ( - "context" - "fmt" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" - - "github.com/smartcontractkit/chainlink-common/pkg/beholder" -) - -const ( - // RPCRequestLatencyMs is the Beholder/Prometheus metric name for RPC call latency in milliseconds. - RPCRequestLatencyMs = "rpc_request_latency_ms" - // RPCRequestErrorsTotal is the Beholder/Prometheus metric name for total RPC call errors. - RPCRequestErrorsTotal = "rpc_request_errors_total" -) - -var ( - rpcRequestLatencyBuckets = []float64{ - 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, - } - promRPCRequestLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: RPCRequestLatencyMs, - Help: "RPC request latency in milliseconds (per call)", - Buckets: rpcRequestLatencyBuckets, - }, []string{"env", "network", "chain_id", "rpc_provider", "call"}) - promRPCRequestErrors = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: RPCRequestErrorsTotal, - Help: "Total number of failed RPC requests", - }, []string{"env", "network", "chain_id", "rpc_provider", "call"}) -) - -// RPCClientMetrics records RPC latency and error metrics for observability via Beholder/Prometheus. -// Metrics are queryable by environment, network, chain_id, and rpc_provider in Grafana. -type RPCClientMetrics interface { - // RecordRequest records latency for an RPC call. If err is non-nil, also increments the error counter. - // callName identifies the operation (e.g. "latest_block", "latest_finalized_block"). - RecordRequest(ctx context.Context, callName string, latencyMs float64, err error) -} - -var _ RPCClientMetrics = (*rpcClientMetrics)(nil) - -type rpcClientMetrics struct { - env string - network string - chainID string - rpcProvider string - latency metric.Float64Histogram - errorsTotal metric.Int64Counter -} - -// RPCClientMetricsConfig holds labels for RPC client metrics. -// Empty strings are allowed; they will still be emitted as labels for filtering. -type RPCClientMetricsConfig struct { - Env string // e.g. "staging", "production" - Network string // chain/network name - ChainID string // chain ID - RPCProvider string // RPC provider or node name (optional) -} - -// NewRPCClientMetrics creates RPC client metrics that publish to Beholder and Prometheus. -// Callers (e.g. chain-specific RPC clients or multinode) should pass env, network, chainID, and optionally rpcProvider -// so metrics can be queried in Grafana by environment, chain/network, and RPC provider. -func NewRPCClientMetrics(cfg RPCClientMetricsConfig) (RPCClientMetrics, error) { - latency, err := beholder.GetMeter().Float64Histogram(RPCRequestLatencyMs) - if err != nil { - return nil, fmt.Errorf("failed to register RPC request latency metric: %w", err) - } - errorsTotal, err := beholder.GetMeter().Int64Counter(RPCRequestErrorsTotal) - if err != nil { - return nil, fmt.Errorf("failed to register RPC request errors metric: %w", err) - } - return &rpcClientMetrics{ - env: cfg.Env, - network: cfg.Network, - chainID: cfg.ChainID, - rpcProvider: cfg.RPCProvider, - latency: latency, - errorsTotal: errorsTotal, - }, nil -} - -func (m *rpcClientMetrics) RecordRequest(ctx context.Context, callName string, latencyMs float64, err error) { - attrs := metric.WithAttributes( - attribute.String("env", m.env), - attribute.String("network", m.network), - attribute.String("chain_id", m.chainID), - attribute.String("rpc_provider", m.rpcProvider), - attribute.String("call", callName), - ) - promRPCRequestLatency.WithLabelValues(m.env, m.network, m.chainID, m.rpcProvider, callName).Observe(latencyMs) - m.latency.Record(ctx, latencyMs, attrs) - if err != nil { - promRPCRequestErrors.WithLabelValues(m.env, m.network, m.chainID, m.rpcProvider, callName).Inc() - m.errorsTotal.Add(ctx, 1, attrs) - } -} - -// NoopRPCClientMetrics is a no-op implementation for when metrics are disabled. -type NoopRPCClientMetrics struct{} - -func (NoopRPCClientMetrics) RecordRequest(context.Context, string, float64, error) {} - -// Ensure NoopRPCClientMetrics implements RPCClientMetrics. -var _ RPCClientMetrics = NoopRPCClientMetrics{} From 4ec82db36a26d0cf12f695a93ee1f4a4b1d4970d Mon Sep 17 00:00:00 2001 From: Larry Li Date: Mon, 23 Mar 2026 09:30:57 -0700 Subject: [PATCH 11/11] update --- metrics/client.go | 67 +++++++++++++++++++----------------------- metrics/client_test.go | 11 ++++--- 2 files changed, 35 insertions(+), 43 deletions(-) diff --git a/metrics/client.go b/metrics/client.go index cf2f2ab..248dfaa 100644 --- a/metrics/client.go +++ b/metrics/client.go @@ -15,18 +15,14 @@ import ( ) var ( + // RPCCallLatency measures RPC duration in milliseconds (bucket upper bounds from 50 ms to 8 s). + // Values are latency.Seconds()*1000, not float64(duration) — the latter is nanoseconds and will skew quantiles. RPCCallLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "rpc_call_latency", - Help: "The duration of an RPC call in seconds", + Name: rpcCallLatencyBeholder, + Help: "The duration of an RPC call in milliseconds", Buckets: []float64{ - float64(50 * time.Millisecond), - float64(100 * time.Millisecond), - float64(200 * time.Millisecond), - float64(500 * time.Millisecond), - float64(1 * time.Second), - float64(2 * time.Second), - float64(4 * time.Second), - float64(8 * time.Second), + 50, 100, 200, 500, + 1000, 2000, 4000, 8000, }, }, []string{"chainFamily", "chainID", "rpcUrl", "isSendOnly", "success", "rpcCallName"}) @@ -42,29 +38,27 @@ const ( ) // RPCClientMetrics records RPC latency and errors to Prometheus and Beholder (same pattern as multinode metrics). +// Construct once per chain (or process) with ChainFamily and ChainID; pass rpcUrl and isSendOnly on each call +// when they vary by node or request. type RPCClientMetrics interface { - // RecordRequest records latency for an RPC call (observed in seconds for Prometheus). + // RecordRequest records latency for an RPC call (observed in milliseconds for Prometheus and Beholder). // If err is non-nil, increments rpc_call_errors_total. - RecordRequest(ctx context.Context, callName string, latency time.Duration, err error) + RecordRequest(ctx context.Context, rpcURL string, isSendOnly bool, callName string, latency time.Duration, err error) } var _ RPCClientMetrics = (*rpcClientMetrics)(nil) type rpcClientMetrics struct { - chainFamily string - chainID string - rpcURL string - isSendOnly bool - latency metric.Float64Histogram - errorsTotal metric.Int64Counter + chainFamily string + chainID string + latencyHis metric.Float64Histogram + errorsCounter metric.Int64Counter } -// RPCClientMetricsConfig holds fixed labels for an RPC client instance. +// RPCClientMetricsConfig holds labels that are fixed for the lifetime of the metrics handle (e.g. one per chain). type RPCClientMetricsConfig struct { ChainFamily string ChainID string - RPCURL string - IsSendOnly bool } // NewRPCClientMetrics creates RPC client metrics that publish to Prometheus and Beholder. @@ -78,51 +72,50 @@ func NewRPCClientMetrics(cfg RPCClientMetricsConfig) (RPCClientMetrics, error) { return nil, fmt.Errorf("failed to register RPC call errors metric: %w", err) } return &rpcClientMetrics{ - chainFamily: cfg.ChainFamily, - chainID: cfg.ChainID, - rpcURL: cfg.RPCURL, - isSendOnly: cfg.IsSendOnly, - latency: latency, - errorsTotal: errorsTotal, + chainFamily: cfg.ChainFamily, + chainID: cfg.ChainID, + latencyHis: latency, + errorsCounter: errorsTotal, }, nil } -func (m *rpcClientMetrics) RecordRequest(ctx context.Context, callName string, latency time.Duration, err error) { +func (m *rpcClientMetrics) RecordRequest(ctx context.Context, rpcURL string, isSendOnly bool, callName string, latency time.Duration, err error) { successStr := "true" if err != nil { successStr = "false" } - sendStr := strconv.FormatBool(m.isSendOnly) - sec := latency.Seconds() + sendStr := strconv.FormatBool(isSendOnly) + ms := latency.Seconds() * 1000 - RPCCallLatency.WithLabelValues(m.chainFamily, m.chainID, m.rpcURL, sendStr, successStr, callName).Observe(sec) + RPCCallLatency.WithLabelValues(m.chainFamily, m.chainID, rpcURL, sendStr, successStr, callName).Observe(ms) latAttrs := metric.WithAttributes( attribute.String("chainFamily", m.chainFamily), attribute.String("chainID", m.chainID), - attribute.String("rpcUrl", m.rpcURL), + attribute.String("rpcUrl", rpcURL), attribute.String("isSendOnly", sendStr), attribute.String("success", successStr), attribute.String("rpcCallName", callName), ) - m.latency.Record(ctx, sec, latAttrs) + m.latencyHis.Record(ctx, ms, latAttrs) if err != nil { - RPCCallErrorsTotal.WithLabelValues(m.chainFamily, m.chainID, m.rpcURL, sendStr, callName).Inc() + RPCCallErrorsTotal.WithLabelValues(m.chainFamily, m.chainID, rpcURL, sendStr, callName).Inc() errAttrs := metric.WithAttributes( attribute.String("chainFamily", m.chainFamily), attribute.String("chainID", m.chainID), - attribute.String("rpcUrl", m.rpcURL), + attribute.String("rpcUrl", rpcURL), attribute.String("isSendOnly", sendStr), attribute.String("rpcCallName", callName), ) - m.errorsTotal.Add(ctx, 1, errAttrs) + m.errorsCounter.Add(ctx, 1, errAttrs) } } // NoopRPCClientMetrics is a no-op implementation for when metrics are disabled. type NoopRPCClientMetrics struct{} -func (NoopRPCClientMetrics) RecordRequest(context.Context, string, time.Duration, error) {} +func (NoopRPCClientMetrics) RecordRequest(context.Context, string, bool, string, time.Duration, error) { +} var _ RPCClientMetrics = NoopRPCClientMetrics{} diff --git a/metrics/client_test.go b/metrics/client_test.go index 5eb5427..ba407a0 100644 --- a/metrics/client_test.go +++ b/metrics/client_test.go @@ -13,20 +13,19 @@ func TestNewRPCClientMetrics(t *testing.T) { m, err := NewRPCClientMetrics(RPCClientMetricsConfig{ ChainFamily: "evm", ChainID: "1", - RPCURL: "http://localhost:8545", - IsSendOnly: false, }) require.NoError(t, err) require.NotNil(t, m) ctx := context.Background() - m.RecordRequest(ctx, "latest_block", 100*time.Millisecond, nil) - m.RecordRequest(ctx, "latest_block", 50*time.Millisecond, errors.New("rpc error")) + const url = "http://localhost:8545" + m.RecordRequest(ctx, url, false, "latest_block", 100*time.Millisecond, nil) + m.RecordRequest(ctx, url, true, "latest_block", 50*time.Millisecond, errors.New("rpc error")) } func TestNoopRPCClientMetrics_RecordRequest(t *testing.T) { var m NoopRPCClientMetrics ctx := context.Background() - m.RecordRequest(ctx, "latest_block", 100*time.Millisecond, nil) - m.RecordRequest(ctx, "latest_block", 50*time.Millisecond, errors.New("rpc error")) + m.RecordRequest(ctx, "http://localhost:8545", false, "latest_block", 100*time.Millisecond, nil) + m.RecordRequest(ctx, "http://localhost:8545", false, "latest_block", 50*time.Millisecond, errors.New("rpc error")) }