diff --git a/metrics/client.go b/metrics/client.go index cdca9a7..248dfaa 100644 --- a/metrics/client.go +++ b/metrics/client.go @@ -1,25 +1,121 @@ package metrics import ( + "context" + "fmt" + "strconv" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + "github.com/smartcontractkit/chainlink-common/pkg/beholder" ) var ( + // RPCCallLatency measures RPC duration in milliseconds (bucket upper bounds from 50 ms to 8 s). + // Values are latency.Seconds()*1000, not float64(duration) — the latter is nanoseconds and will skew quantiles. RPCCallLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "rpc_call_latency", + Name: rpcCallLatencyBeholder, Help: "The duration of an RPC call in milliseconds", Buckets: []float64{ - float64(50 * time.Millisecond), - float64(100 * time.Millisecond), - float64(200 * time.Millisecond), - float64(500 * time.Millisecond), - float64(1 * time.Second), - float64(2 * time.Second), - float64(4 * time.Second), - float64(8 * time.Second), + 50, 100, 200, 500, + 1000, 2000, 4000, 8000, }, }, []string{"chainFamily", "chainID", "rpcUrl", "isSendOnly", "success", "rpcCallName"}) + + RPCCallErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "rpc_call_errors_total", + Help: "The total number of failed RPC calls", + }, []string{"chainFamily", "chainID", "rpcUrl", "isSendOnly", "rpcCallName"}) +) + +const ( + rpcCallLatencyBeholder = "rpc_call_latency" + rpcCallErrorsTotalBeholder = "rpc_call_errors_total" ) + +// RPCClientMetrics records RPC latency and errors to Prometheus and Beholder (same pattern as multinode metrics). +// Construct once per chain (or process) with ChainFamily and ChainID; pass rpcUrl and isSendOnly on each call +// when they vary by node or request. +type RPCClientMetrics interface { + // RecordRequest records latency for an RPC call (observed in milliseconds for Prometheus and Beholder). + // If err is non-nil, increments rpc_call_errors_total. + RecordRequest(ctx context.Context, rpcURL string, isSendOnly bool, callName string, latency time.Duration, err error) +} + +var _ RPCClientMetrics = (*rpcClientMetrics)(nil) + +type rpcClientMetrics struct { + chainFamily string + chainID string + latencyHis metric.Float64Histogram + errorsCounter metric.Int64Counter +} + +// RPCClientMetricsConfig holds labels that are fixed for the lifetime of the metrics handle (e.g. one per chain). +type RPCClientMetricsConfig struct { + ChainFamily string + ChainID string +} + +// NewRPCClientMetrics creates RPC client metrics that publish to Prometheus and Beholder. +func NewRPCClientMetrics(cfg RPCClientMetricsConfig) (RPCClientMetrics, error) { + latency, err := beholder.GetMeter().Float64Histogram(rpcCallLatencyBeholder) + if err != nil { + return nil, fmt.Errorf("failed to register RPC call latency metric: %w", err) + } + errorsTotal, err := beholder.GetMeter().Int64Counter(rpcCallErrorsTotalBeholder) + if err != nil { + return nil, fmt.Errorf("failed to register RPC call errors metric: %w", err) + } + return &rpcClientMetrics{ + chainFamily: cfg.ChainFamily, + chainID: cfg.ChainID, + latencyHis: latency, + errorsCounter: errorsTotal, + }, nil +} + +func (m *rpcClientMetrics) RecordRequest(ctx context.Context, rpcURL string, isSendOnly bool, callName string, latency time.Duration, err error) { + successStr := "true" + if err != nil { + successStr = "false" + } + sendStr := strconv.FormatBool(isSendOnly) + ms := latency.Seconds() * 1000 + + RPCCallLatency.WithLabelValues(m.chainFamily, m.chainID, rpcURL, sendStr, successStr, callName).Observe(ms) + + latAttrs := metric.WithAttributes( + attribute.String("chainFamily", m.chainFamily), + attribute.String("chainID", m.chainID), + attribute.String("rpcUrl", rpcURL), + attribute.String("isSendOnly", sendStr), + attribute.String("success", successStr), + attribute.String("rpcCallName", callName), + ) + m.latencyHis.Record(ctx, ms, latAttrs) + + if err != nil { + RPCCallErrorsTotal.WithLabelValues(m.chainFamily, m.chainID, rpcURL, sendStr, callName).Inc() + errAttrs := metric.WithAttributes( + attribute.String("chainFamily", m.chainFamily), + attribute.String("chainID", m.chainID), + attribute.String("rpcUrl", rpcURL), + attribute.String("isSendOnly", sendStr), + attribute.String("rpcCallName", callName), + ) + m.errorsCounter.Add(ctx, 1, errAttrs) + } +} + +// NoopRPCClientMetrics is a no-op implementation for when metrics are disabled. +type NoopRPCClientMetrics struct{} + +func (NoopRPCClientMetrics) RecordRequest(context.Context, string, bool, string, time.Duration, error) { +} + +var _ RPCClientMetrics = NoopRPCClientMetrics{} diff --git a/metrics/client_test.go b/metrics/client_test.go new file mode 100644 index 0000000..ba407a0 --- /dev/null +++ b/metrics/client_test.go @@ -0,0 +1,31 @@ +package metrics + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestNewRPCClientMetrics(t *testing.T) { + m, err := NewRPCClientMetrics(RPCClientMetricsConfig{ + ChainFamily: "evm", + ChainID: "1", + }) + require.NoError(t, err) + require.NotNil(t, m) + + ctx := context.Background() + const url = "http://localhost:8545" + m.RecordRequest(ctx, url, false, "latest_block", 100*time.Millisecond, nil) + m.RecordRequest(ctx, url, true, "latest_block", 50*time.Millisecond, errors.New("rpc error")) +} + +func TestNoopRPCClientMetrics_RecordRequest(t *testing.T) { + var m NoopRPCClientMetrics + ctx := context.Background() + m.RecordRequest(ctx, "http://localhost:8545", false, "latest_block", 100*time.Millisecond, nil) + m.RecordRequest(ctx, "http://localhost:8545", false, "latest_block", 50*time.Millisecond, errors.New("rpc error")) +} diff --git a/metrics/multinode.go b/metrics/multinode.go index c278913..5bc1ff5 100644 --- a/metrics/multinode.go +++ b/metrics/multinode.go @@ -135,29 +135,29 @@ type GenericMultiNodeMetrics interface { var _ GenericMultiNodeMetrics = &multiNodeMetrics{} type multiNodeMetrics struct { - network string - chainID string - nodeStates metric.Int64Gauge - nodeClientVersion metric.Int64Gauge - nodeVerifies metric.Int64Counter - nodeVerifiesFailed metric.Int64Counter - nodeVerifiesSuccess metric.Int64Counter - nodeTransitionsToAlive metric.Int64Counter - nodeTransitionsToInSync metric.Int64Counter - nodeTransitionsToOutOfSync metric.Int64Counter - nodeTransitionsToUnreachable metric.Int64Counter - nodeTransitionsToInvalidChainID metric.Int64Counter - nodeTransitionsToUnusable metric.Int64Counter - nodeTransitionsToSyncing metric.Int64Counter - highestSeenBlock metric.Int64Gauge - highestFinalizedBlock metric.Int64Gauge - seenBlocks metric.Int64Counter - polls metric.Int64Counter - pollsFailed metric.Int64Counter - pollsSuccess metric.Int64Counter - finalizedStateFailed metric.Int64Counter - nodeTransitionsToFinalizedStateNotAvailable metric.Int64Counter - invariantViolations metric.Int64Counter + network string + chainID string + nodeStates metric.Int64Gauge + nodeClientVersion metric.Int64Gauge + nodeVerifies metric.Int64Counter + nodeVerifiesFailed metric.Int64Counter + nodeVerifiesSuccess metric.Int64Counter + nodeTransitionsToAlive metric.Int64Counter + nodeTransitionsToInSync metric.Int64Counter + nodeTransitionsToOutOfSync metric.Int64Counter + nodeTransitionsToUnreachable metric.Int64Counter + nodeTransitionsToInvalidChainID metric.Int64Counter + nodeTransitionsToUnusable metric.Int64Counter + nodeTransitionsToSyncing metric.Int64Counter + highestSeenBlock metric.Int64Gauge + highestFinalizedBlock metric.Int64Gauge + seenBlocks metric.Int64Counter + polls metric.Int64Counter + pollsFailed metric.Int64Counter + pollsSuccess metric.Int64Counter + finalizedStateFailed metric.Int64Counter + nodeTransitionsToFinalizedStateNotAvailable metric.Int64Counter + invariantViolations metric.Int64Counter } func NewGenericMultiNodeMetrics(network string, chainID string) (GenericMultiNodeMetrics, error) { @@ -289,7 +289,7 @@ func NewGenericMultiNodeMetrics(network string, chainID string) (GenericMultiNod pollsSuccess: pollsSuccess, finalizedStateFailed: finalizedStateFailed, nodeTransitionsToFinalizedStateNotAvailable: nodeTransitionsToFinalizedStateNotAvailable, - invariantViolations: invariantViolations, + invariantViolations: invariantViolations, }, nil }