From ae7a6799e7e51b7fcd326f0d36b7f5f97c348adc Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Wed, 29 Apr 2026 18:36:54 -0700 Subject: [PATCH] feat: bound default metric cardinality --- README.md | 60 ++++++---- docs/usage.md | 8 ++ helm/promgithub/templates/deployment.yaml | 2 + helm/promgithub/values.yaml | 3 + src/async_test.go | 12 +- src/github.go | 126 +++++++++++++++++---- src/metrics.go | 122 ++++++++++++++++++-- src/metrics_test.go | 131 +++++++++++++++------- 8 files changed, 368 insertions(+), 96 deletions(-) diff --git a/README.md b/README.md index 99ec82c..affca83 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,53 @@ # Github Prometheus Exporter (promgithub) -`promgithub` is a service that receives GitHub webhook events and exposes Prometheus metrics for repository activity, workflow runs, workflow jobs, commits, and pull requests. +`promgithub` receives GitHub webhook events and exposes Prometheus metrics for repository activity, workflow runs, workflow jobs, commits, and pull requests. 
-It is designed to be simple to deploy and can run either: +It can run either: - as a single instance - as multiple instances with Redis for shared deduplication and state ## Metrics exported -`promgithub` exports the following metrics: - -| Name | Type | Labels | Description | -|------------------------------------|-----------|-------------------------------------------------------------------------|-------------------------------------------| -| `promgithub_workflow_status` | Counter | `repository`, `branch`, `workflow_name`, `workflow_status`, `conclusion` | Total number of workflow runs with status | -| `promgithub_workflow_duration` | Histogram | `repository`, `branch`, `workflow_name`, `workflow_status`, `conclusion` | Duration of workflow runs | -| `promgithub_workflow_queued` | Gauge | `repository`, `branch`, `workflow_name` | Number of workflow runs queued | -| `promgithub_workflow_in_progress` | Gauge | `repository`, `branch`, `workflow_name` | Number of workflow runs in progress | -| `promgithub_workflow_completed` | Gauge | `repository`, `branch`, `workflow_conclusion`, `workflow_name` | Number of workflow runs completed | -| `promgithub_job_status` | Counter | `repository`, `branch`, `workflow_name`, `job_status`, `job_conclusion` | Total number of jobs with status | -| `promgithub_job_duration` | Histogram | `repository`, `branch`, `workflow_name`, `job_status`, `job_conclusion` | Duration of jobs runs in seconds | -| `promgithub_job_queued` | Gauge | `repository`, `branch`, `workflow_name` | Number of jobs queued | -| `promgithub_job_in_progress` | Gauge | `repository`, `branch`, `workflow_name` | Number of jobs in progress | -| `promgithub_job_completed` | Gauge | `repository`, `branch`, `job_conclusion`, `workflow_name` | Number of jobs completed | -| `promgithub_commit_pushed` | Counter | `repository` | Total number of commits pushed | -| `promgithub_pull_request` | Counter | `repository`, `base_branch`, `pull_request_status` | Total number of 
pull requests | +### Default metrics -## Metric model +The default metric set is bounded-cardinality and production-safe for larger repository sets. + +| Name | Type | Labels | Description | +|-----------------------------------|-----------|------------------------------------------------|-------------------------------------------| +| `promgithub_workflow_status` | Counter | `repository`, `workflow_status`, `conclusion` | Total number of workflow runs with status | +| `promgithub_workflow_duration` | Histogram | `repository`, `workflow_status`, `conclusion` | Duration of workflow runs | +| `promgithub_workflow_queued` | Gauge | `repository` | Number of workflow runs queued | +| `promgithub_workflow_in_progress` | Gauge | `repository` | Number of workflow runs in progress | +| `promgithub_workflow_completed` | Gauge | `repository`, `workflow_conclusion` | Number of workflow runs completed | +| `promgithub_job_status` | Counter | `repository`, `job_status`, `job_conclusion` | Total number of jobs with status | +| `promgithub_job_duration` | Histogram | `repository`, `job_status`, `job_conclusion` | Duration of jobs runs in seconds | +| `promgithub_job_queued` | Gauge | `repository` | Number of jobs queued | +| `promgithub_job_in_progress` | Gauge | `repository` | Number of jobs in progress | +| `promgithub_job_completed` | Gauge | `repository`, `job_conclusion` | Number of jobs completed | +| `promgithub_commit_pushed` | Counter | `repository` | Total number of commits pushed | +| `promgithub_pull_request` | Counter | `repository`, `pull_request_status` | Total number of pull requests | + +### Optional detailed metrics + +Set `PROMGITHUB_ENABLE_DETAILED_METRICS=true` to also emit opt-in detailed metric families with higher-cardinality labels: -The exporter focuses on repository and workflow health signals while avoiding noisy per-entity labels such as runner names, job names, commit author identities, and pull request authors. 
+- `promgithub_workflow_status_detailed` +- `promgithub_workflow_duration_detailed` +- `promgithub_workflow_queued_detailed` +- `promgithub_workflow_in_progress_detailed` +- `promgithub_workflow_completed_detailed` +- `promgithub_job_status_detailed` +- `promgithub_job_duration_detailed` +- `promgithub_job_queued_detailed` +- `promgithub_job_in_progress_detailed` +- `promgithub_job_completed_detailed` +- `promgithub_pull_request_detailed` + +These detailed metrics preserve labels such as `branch`, `workflow_name`, and `base_branch`. They are disabled by default because they can grow quickly in larger GitHub environments. + +## Metric model -This keeps the default metric set compact and practical for Prometheus while still preserving the `branch` label for branch-specific workflow and job visibility. +The exporter now defaults to repository-level operational metrics and keeps higher-cardinality dimensions as an explicit opt-in. This avoids unbounded series growth from branch churn, workflow-name sprawl, and pull-request base-branch fragmentation while still allowing teams to enable richer labels when they understand the cost. ## Redis-backed multi-instance mode diff --git a/docs/usage.md b/docs/usage.md index f82e426..a2699b4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -21,6 +21,7 @@ The service supports the following environment variables: - `PROMGITHUB_REDIS_DB` (optional): Redis database number, default `0`. - `PROMGITHUB_REDIS_KEY_PREFIX` (optional): Prefix used for Redis keys, default `promgithub`. - `PROMGITHUB_REDIS_DELIVERY_TTL` (optional): TTL for webhook delivery dedupe keys, default `24h`. +- `PROMGITHUB_ENABLE_DETAILED_METRICS` (optional): When `true` (case-insensitive; `1`, `yes`, and `on` are also accepted), also emits higher-cardinality `*_detailed` metric families with labels such as `branch`, `workflow_name`, and `base_branch`. Any other value is treated as disabled. Default `false`. If Redis is configured, the service stores delivery and run state in Redis. 
@@ -43,6 +44,7 @@ PROMGITHUB_REDIS_PASSWORD="" \ PROMGITHUB_REDIS_DB="0" \ PROMGITHUB_REDIS_KEY_PREFIX="promgithub" \ PROMGITHUB_REDIS_DELIVERY_TTL="24h" \ +PROMGITHUB_ENABLE_DETAILED_METRICS="true" \ PROMGITHUB_SERVICE_PORT="8080" \ /path/to/binary/promgithub ``` @@ -67,6 +69,7 @@ docker run \ -e PROMGITHUB_REDIS_DB=0 \ -e PROMGITHUB_REDIS_KEY_PREFIX=promgithub \ -e PROMGITHUB_REDIS_DELIVERY_TTL=24h \ + -e PROMGITHUB_ENABLE_DETAILED_METRICS=true \ -e PROMGITHUB_SERVICE_PORT=8080 \ -p 8080:8080 \ ghcr.io/darthfork/promgithub: @@ -91,6 +94,7 @@ services: PROMGITHUB_REDIS_DB: 0 PROMGITHUB_REDIS_KEY_PREFIX: promgithub PROMGITHUB_REDIS_DELIVERY_TTL: 24h + PROMGITHUB_ENABLE_DETAILED_METRICS: "true" PROMGITHUB_SERVICE_PORT: 8080 ports: - "8080:8080" @@ -129,6 +133,8 @@ promgithub: db: 0 keyPrefix: promgithub deliveryTTL: 24h + metrics: + enableDetailed: false ``` ### Values for a bundled Redis deployment @@ -146,6 +152,8 @@ promgithub: db: 0 keyPrefix: promgithub deliveryTTL: 24h + metrics: + enableDetailed: true ``` When `redis.enabled=true`, the chart deploys Redis as a dependency and configures `promgithub` to connect to it automatically. diff --git a/helm/promgithub/templates/deployment.yaml b/helm/promgithub/templates/deployment.yaml index d9e8f00..0e69650 100644 --- a/helm/promgithub/templates/deployment.yaml +++ b/helm/promgithub/templates/deployment.yaml @@ -54,6 +54,8 @@ spec: - name: PROMGITHUB_REDIS_DELIVERY_TTL value: "{{ .Values.redisConfig.deliveryTTL | default "24h" }}" {{- end }} + - name: PROMGITHUB_ENABLE_DETAILED_METRICS + value: "{{ .Values.metrics.enableDetailed | default false }}" envFrom: - secretRef: name: "{{ include "promgithub.fullname" . 
}}" diff --git a/helm/promgithub/values.yaml b/helm/promgithub/values.yaml index f69931a..4545794 100644 --- a/helm/promgithub/values.yaml +++ b/helm/promgithub/values.yaml @@ -46,6 +46,9 @@ redisConfig: keyPrefix: promgithub deliveryTTL: 24h +metrics: + enableDetailed: false + # This is for setting up the promgithub service service: # This sets the service type diff --git a/src/async_test.go b/src/async_test.go index bfb77b2..67e0799 100644 --- a/src/async_test.go +++ b/src/async_test.go @@ -35,8 +35,16 @@ func TestAsyncProcessorEnqueueAndProcess(t *testing.T) { t.Fatal("timed out waiting for async processing") } - if got := testutil.ToFloat64(asyncProcessedEventsCounter.WithLabelValues("workflow_run")); got != 1 { - t.Fatalf("expected processed counter to be 1, got %v", got) + deadline := time.Now().Add(2 * time.Second) + for { + if got := testutil.ToFloat64(asyncProcessedEventsCounter.WithLabelValues("workflow_run")); got == 1 { + break + } + if time.Now().After(deadline) { + got := testutil.ToFloat64(asyncProcessedEventsCounter.WithLabelValues("workflow_run")) + t.Fatalf("expected processed counter to be 1, got %v", got) + } + time.Sleep(10 * time.Millisecond) } } diff --git a/src/github.go b/src/github.go index 8f62ced..0e6474d 100644 --- a/src/github.go +++ b/src/github.go @@ -101,6 +101,11 @@ type runMetricSet struct { durationHistogram *prometheus.HistogramVec } +type runMetricSets struct { + core runMetricSet + detailed runMetricSet +} + type runStoreMethods struct { get func(context.Context, int) (RunState, bool, error) update func(context.Context, int, RunState) error @@ -255,7 +260,18 @@ func shouldApplyStateTransition(previous, next RunState) bool { return true } -func applyGaugeDelta(details RunState, delta float64, queuedGauge, inProgressGauge, completedGauge *prometheus.GaugeVec) { +func applyCoreGaugeDelta(details RunState, delta float64, queuedGauge, inProgressGauge, completedGauge *prometheus.GaugeVec) { + switch normalizeStatus(details.Status) { 
+ case statusQueued: + queuedGauge.WithLabelValues(details.Repository).Add(delta) + case statusInProgress: + inProgressGauge.WithLabelValues(details.Repository).Add(delta) + case statusCompleted: + completedGauge.WithLabelValues(details.Repository, details.Conclusion).Add(delta) + } +} + +func applyDetailedGaugeDelta(details RunState, delta float64, queuedGauge, inProgressGauge, completedGauge *prometheus.GaugeVec) { switch normalizeStatus(details.Status) { case statusQueued: queuedGauge.WithLabelValues(details.Repository, details.Branch, details.Name).Add(delta) @@ -266,7 +282,25 @@ func applyGaugeDelta(details RunState, delta float64, queuedGauge, inProgressGau } } -func observeDuration(details RunState, durationHistogram *prometheus.HistogramVec) { +func observeCoreDuration(details RunState, durationHistogram *prometheus.HistogramVec) { + if normalizeStatus(details.Status) != statusCompleted { + return + } + + startedAt, startedOK := parseMetricTime(details.StartedAt) + endedAt, endedOK := parseMetricTime(details.EndedAt) + if !startedOK || !endedOK || endedAt.Before(startedAt) { + return + } + + durationHistogram.WithLabelValues( + details.Repository, + details.Status, + details.Conclusion, + ).Observe(endedAt.Sub(startedAt).Seconds()) +} + +func observeDetailedDuration(details RunState, durationHistogram *prometheus.HistogramVec) { if normalizeStatus(details.Status) != statusCompleted { return } @@ -286,7 +320,24 @@ func observeDuration(details RunState, durationHistogram *prometheus.HistogramVe ).Observe(endedAt.Sub(startedAt).Seconds()) } -func applyStatefulMetrics(details RunState, previous *RunState, metrics runMetricSet) { +func applyCoreStatefulMetrics(details RunState, previous *RunState, metrics runMetricSet) { + metrics.statusCounter.WithLabelValues( + details.Repository, + details.Status, + details.Conclusion, + ).Inc() + + if previous != nil { + applyCoreGaugeDelta(*previous, -1, metrics.queuedGauge, metrics.inProgressGauge, metrics.completedGauge) 
+ } + applyCoreGaugeDelta(details, 1, metrics.queuedGauge, metrics.inProgressGauge, metrics.completedGauge) + + if previous == nil || normalizeStatus(previous.Status) != statusCompleted { + observeCoreDuration(details, metrics.durationHistogram) + } +} + +func applyDetailedStatefulMetrics(details RunState, previous *RunState, metrics runMetricSet) { metrics.statusCounter.WithLabelValues( details.Repository, details.Branch, @@ -296,12 +347,12 @@ func applyStatefulMetrics(details RunState, previous *RunState, metrics runMetri ).Inc() if previous != nil { - applyGaugeDelta(*previous, -1, metrics.queuedGauge, metrics.inProgressGauge, metrics.completedGauge) + applyDetailedGaugeDelta(*previous, -1, metrics.queuedGauge, metrics.inProgressGauge, metrics.completedGauge) } - applyGaugeDelta(details, 1, metrics.queuedGauge, metrics.inProgressGauge, metrics.completedGauge) + applyDetailedGaugeDelta(details, 1, metrics.queuedGauge, metrics.inProgressGauge, metrics.completedGauge) if previous == nil || normalizeStatus(previous.Status) != statusCompleted { - observeDuration(details, metrics.durationHistogram) + observeDetailedDuration(details, metrics.durationHistogram) } } @@ -341,12 +392,15 @@ func updateTrackedRunMetrics( details runMetricDetails, store runStoreMethods, entityName string, - metrics runMetricSet, + metrics runMetricSets, ) { nextState := normalizeRunState(details) if stateStore == nil { - applyStatefulMetrics(nextState, nil, metrics) + applyCoreStatefulMetrics(nextState, nil, metrics.core) + if enableDetailedMetrics { + applyDetailedStatefulMetrics(nextState, nil, metrics.detailed) + } return } @@ -362,7 +416,10 @@ func updateTrackedRunMetrics( return } - applyStatefulMetrics(nextState, previousState, metrics) + applyCoreStatefulMetrics(nextState, previousState, metrics.core) + if enableDetailedMetrics { + applyDetailedStatefulMetrics(nextState, previousState, metrics.detailed) + } } func workflowRunStoreMethods() runStoreMethods { @@ -409,12 +466,21 @@ func 
updateWorkflowMetrics(ctx context.Context, body []byte) { }, workflowRunStoreMethods(), "workflow_run", - runMetricSet{ - statusCounter: workflowStatusCounter, - queuedGauge: workflowQueuedGauge, - inProgressGauge: workflowInProgressGauge, - completedGauge: workflowCompletedGauge, - durationHistogram: workflowDurationHistogram, + runMetricSets{ + core: runMetricSet{ + statusCounter: workflowStatusCounter, + queuedGauge: workflowQueuedGauge, + inProgressGauge: workflowInProgressGauge, + completedGauge: workflowCompletedGauge, + durationHistogram: workflowDurationHistogram, + }, + detailed: runMetricSet{ + statusCounter: workflowStatusDetailedCounter, + queuedGauge: workflowQueuedDetailedGauge, + inProgressGauge: workflowInProgressDetailedGauge, + completedGauge: workflowCompletedDetailedGauge, + durationHistogram: workflowDurationDetailedHistogram, + }, }, ) } @@ -441,12 +507,21 @@ func updateJobMetrics(ctx context.Context, body []byte) { }, workflowJobStoreMethods(), "workflow_job", - runMetricSet{ - statusCounter: jobStatusCounter, - queuedGauge: jobQueuedGauge, - inProgressGauge: jobInProgressGauge, - completedGauge: jobCompletedGauge, - durationHistogram: jobDurationHistogram, + runMetricSets{ + core: runMetricSet{ + statusCounter: jobStatusCounter, + queuedGauge: jobQueuedGauge, + inProgressGauge: jobInProgressGauge, + completedGauge: jobCompletedGauge, + durationHistogram: jobDurationHistogram, + }, + detailed: runMetricSet{ + statusCounter: jobStatusDetailedCounter, + queuedGauge: jobQueuedDetailedGauge, + inProgressGauge: jobInProgressDetailedGauge, + completedGauge: jobCompletedDetailedGauge, + durationHistogram: jobDurationDetailedHistogram, + }, }, ) } @@ -474,7 +549,14 @@ func updatePullRequestMetrics(body []byte) { pullRequestCounter.WithLabelValues( payload.Repository.FullName, - payload.PullRequest.Base.Ref, payload.Action, ).Inc() + + if enableDetailedMetrics { + pullRequestDetailedCounter.WithLabelValues( + payload.Repository.FullName, + 
payload.PullRequest.Base.Ref, + payload.Action, + ).Inc() + } } diff --git a/src/metrics.go b/src/metrics.go index e0d3444..c65e954 100644 --- a/src/metrics.go +++ b/src/metrics.go @@ -1,18 +1,25 @@ package main import ( + "os" + "strings" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) +const detailedMetricsEnvVar = "PROMGITHUB_ENABLE_DETAILED_METRICS" + var ( + enableDetailedMetrics = parseBoolEnv(os.Getenv(detailedMetricsEnvVar)) + // Workflow metrics. workflowStatusCounter = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "promgithub_workflow_status", Help: "Total number of workflow runs with status", }, - []string{"repository", "branch", "workflow_name", "workflow_status", "conclusion"}, + []string{"repository", "workflow_status", "conclusion"}, ) workflowDurationHistogram = promauto.NewHistogramVec( @@ -21,7 +28,7 @@ var ( Help: "Duration of workflow runs", Buckets: prometheus.DefBuckets, }, - []string{"repository", "branch", "workflow_name", "workflow_status", "conclusion"}, + []string{"repository", "workflow_status", "conclusion"}, ) workflowQueuedGauge = promauto.NewGaugeVec( @@ -29,7 +36,7 @@ var ( Name: "promgithub_workflow_queued", Help: "Number of workflow runs queued", }, - []string{"repository", "branch", "workflow_name"}, + []string{"repository"}, ) workflowInProgressGauge = promauto.NewGaugeVec( @@ -37,7 +44,7 @@ var ( Name: "promgithub_workflow_in_progress", Help: "Number of workflow runs in progress", }, - []string{"repository", "branch", "workflow_name"}, + []string{"repository"}, ) workflowCompletedGauge = promauto.NewGaugeVec( @@ -45,6 +52,47 @@ var ( Name: "promgithub_workflow_completed", Help: "Number of workflow runs completed", }, + []string{"repository", "workflow_conclusion"}, + ) + + workflowStatusDetailedCounter = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "promgithub_workflow_status_detailed", + Help: "Total number of workflow runs with status and 
optional high-cardinality labels", + }, + []string{"repository", "branch", "workflow_name", "workflow_status", "conclusion"}, + ) + + workflowDurationDetailedHistogram = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "promgithub_workflow_duration_detailed", + Help: "Duration of workflow runs with optional high-cardinality labels", + Buckets: prometheus.DefBuckets, + }, + []string{"repository", "branch", "workflow_name", "workflow_status", "conclusion"}, + ) + + workflowQueuedDetailedGauge = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "promgithub_workflow_queued_detailed", + Help: "Number of workflow runs queued with optional high-cardinality labels", + }, + []string{"repository", "branch", "workflow_name"}, + ) + + workflowInProgressDetailedGauge = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "promgithub_workflow_in_progress_detailed", + Help: "Number of workflow runs in progress with optional high-cardinality labels", + }, + []string{"repository", "branch", "workflow_name"}, + ) + + workflowCompletedDetailedGauge = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "promgithub_workflow_completed_detailed", + Help: "Number of workflow runs completed with optional high-cardinality labels", + }, []string{"repository", "branch", "workflow_conclusion", "workflow_name"}, ) @@ -54,7 +102,7 @@ var ( Name: "promgithub_job_status", Help: "Total number of jobs with status", }, - []string{"repository", "branch", "workflow_name", "job_status", "job_conclusion"}, + []string{"repository", "job_status", "job_conclusion"}, ) jobDurationHistogram = promauto.NewHistogramVec( @@ -63,7 +111,7 @@ var ( Help: "Duration of jobs runs in seconds", Buckets: prometheus.DefBuckets, }, - []string{"repository", "branch", "workflow_name", "job_status", "job_conclusion"}, + []string{"repository", "job_status", "job_conclusion"}, ) jobQueuedGauge = promauto.NewGaugeVec( @@ -71,7 +119,7 @@ var ( Name: "promgithub_job_queued", Help: "Number of jobs queued", }, - 
[]string{"repository", "branch", "workflow_name"}, + []string{"repository"}, ) jobInProgressGauge = promauto.NewGaugeVec( @@ -79,7 +127,7 @@ var ( Name: "promgithub_job_in_progress", Help: "Number of jobs in progress", }, - []string{"repository", "branch", "workflow_name"}, + []string{"repository"}, ) jobCompletedGauge = promauto.NewGaugeVec( @@ -87,6 +135,47 @@ var ( Name: "promgithub_job_completed", Help: "Number of jobs completed", }, + []string{"repository", "job_conclusion"}, + ) + + jobStatusDetailedCounter = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "promgithub_job_status_detailed", + Help: "Total number of jobs with status and optional high-cardinality labels", + }, + []string{"repository", "branch", "workflow_name", "job_status", "job_conclusion"}, + ) + + jobDurationDetailedHistogram = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "promgithub_job_duration_detailed", + Help: "Duration of jobs runs in seconds with optional high-cardinality labels", + Buckets: prometheus.DefBuckets, + }, + []string{"repository", "branch", "workflow_name", "job_status", "job_conclusion"}, + ) + + jobQueuedDetailedGauge = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "promgithub_job_queued_detailed", + Help: "Number of jobs queued with optional high-cardinality labels", + }, + []string{"repository", "branch", "workflow_name"}, + ) + + jobInProgressDetailedGauge = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "promgithub_job_in_progress_detailed", + Help: "Number of jobs in progress with optional high-cardinality labels", + }, + []string{"repository", "branch", "workflow_name"}, + ) + + jobCompletedDetailedGauge = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "promgithub_job_completed_detailed", + Help: "Number of jobs completed with optional high-cardinality labels", + }, []string{"repository", "branch", "job_conclusion", "workflow_name"}, ) @@ -103,6 +192,14 @@ var ( Name: "promgithub_pull_request", Help: "Total number 
of pull requests", }, + []string{"repository", "pull_request_status"}, + ) + + pullRequestDetailedCounter = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "promgithub_pull_request_detailed", + Help: "Total number of pull requests with optional high-cardinality labels", + }, []string{"repository", "base_branch", "pull_request_status"}, ) @@ -160,3 +257,12 @@ var ( []string{"event_type"}, ) ) + +func parseBoolEnv(value string) bool { + switch strings.ToLower(strings.TrimSpace(value)) { + case "1", "true", "yes", "on": + return true + default: + return false + } +} diff --git a/src/metrics_test.go b/src/metrics_test.go index e04b1e4..10b69ae 100644 --- a/src/metrics_test.go +++ b/src/metrics_test.go @@ -16,6 +16,12 @@ func withInMemoryStateStore(t *testing.T) { t.Cleanup(func() { stateStore = oldStore }) } +func withDetailedMetricsEnabled(t *testing.T) { + oldValue := enableDetailedMetrics + enableDetailedMetrics = true + t.Cleanup(func() { enableDetailedMetrics = oldValue }) +} + func TestWorkflowStatusCounter(t *testing.T) { withInMemoryStateStore(t) workflowStatusCounter.Reset() @@ -29,7 +35,27 @@ func TestWorkflowStatusCounter(t *testing.T) { if err := testutil.CollectAndCompare(workflowStatusCounter, strings.NewReader(` # HELP promgithub_workflow_status Total number of workflow runs with status # TYPE promgithub_workflow_status counter - promgithub_workflow_status{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed"} 1 + promgithub_workflow_status{conclusion="success",repository="user/repo",workflow_status="completed"} 1 + `)); err != nil { + t.Errorf("unexpected metrics: %v", err) + } +} + +func TestWorkflowStatusDetailedCounter(t *testing.T) { + withInMemoryStateStore(t) + withDetailedMetricsEnabled(t) + workflowStatusDetailedCounter.Reset() + reg.MustRegister(workflowStatusDetailedCounter) + body, err := os.ReadFile("../test_data/workflow_run.json") + if err != nil { + t.Fatalf("Failed to read test 
data file: %v", err) + } + updateWorkflowMetrics(context.Background(), body) + + if err := testutil.CollectAndCompare(workflowStatusDetailedCounter, strings.NewReader(` + # HELP promgithub_workflow_status_detailed Total number of workflow runs with status and optional high-cardinality labels + # TYPE promgithub_workflow_status_detailed counter + promgithub_workflow_status_detailed{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -48,7 +74,7 @@ func TestJobStatusCounter(t *testing.T) { if err := testutil.CollectAndCompare(jobStatusCounter, strings.NewReader(` # HELP promgithub_job_status Total number of jobs with status # TYPE promgithub_job_status counter - promgithub_job_status{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI"} 1 + promgithub_job_status{job_conclusion="success",job_status="completed",repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -84,7 +110,26 @@ func TestPullRequestsCounter(t *testing.T) { if err := testutil.CollectAndCompare(pullRequestCounter, strings.NewReader(` # HELP promgithub_pull_request Total number of pull requests # TYPE promgithub_pull_request counter - promgithub_pull_request{base_branch="main",pull_request_status="opened",repository="user/repo"} 1 + promgithub_pull_request{pull_request_status="opened",repository="user/repo"} 1 + `)); err != nil { + t.Errorf("unexpected metrics: %v", err) + } +} + +func TestPullRequestsDetailedCounter(t *testing.T) { + withDetailedMetricsEnabled(t) + pullRequestDetailedCounter.Reset() + reg.MustRegister(pullRequestDetailedCounter) + body, err := os.ReadFile("../test_data/pull_request.json") + if err != nil { + t.Fatalf("Failed to read test data file: %v", err) + } + updatePullRequestMetrics(body) + + if err := testutil.CollectAndCompare(pullRequestDetailedCounter, strings.NewReader(` + 
# HELP promgithub_pull_request_detailed Total number of pull requests with optional high-cardinality labels + # TYPE promgithub_pull_request_detailed counter + promgithub_pull_request_detailed{base_branch="main",pull_request_status="opened",repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -103,20 +148,20 @@ func TestWorkflowDurationHistogram(t *testing.T) { if err := testutil.CollectAndCompare(workflowDurationHistogram, strings.NewReader(` # HELP promgithub_workflow_duration Duration of workflow runs # TYPE promgithub_workflow_duration histogram - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="0.005"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="0.01"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="0.025"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="0.05"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="0.1"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="0.25"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="0.5"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="1"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="2.5"} 0 - 
promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="5"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="10"} 0 - promgithub_workflow_duration_bucket{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed",le="+Inf"} 1 - promgithub_workflow_duration_sum{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed"} 3600 - promgithub_workflow_duration_count{branch="main",conclusion="success",repository="user/repo",workflow_name="CI",workflow_status="completed"} 1 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="0.005"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="0.01"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="0.025"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="0.05"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="0.1"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="0.25"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="0.5"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="1"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="2.5"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="5"} 0 + 
promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="10"} 0 + promgithub_workflow_duration_bucket{conclusion="success",repository="user/repo",workflow_status="completed",le="+Inf"} 1 + promgithub_workflow_duration_sum{conclusion="success",repository="user/repo",workflow_status="completed"} 3600 + promgithub_workflow_duration_count{conclusion="success",repository="user/repo",workflow_status="completed"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -135,20 +180,20 @@ func TestJobDurationHistogram(t *testing.T) { if err := testutil.CollectAndCompare(jobDurationHistogram, strings.NewReader(` # HELP promgithub_job_duration Duration of jobs runs in seconds # TYPE promgithub_job_duration histogram - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="0.005"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="0.01"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="0.025"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="0.05"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="0.1"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="0.25"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="0.5"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="1"} 0 - 
promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="2.5"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="5"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="10"} 0 - promgithub_job_duration_bucket{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI",le="+Inf"} 1 - promgithub_job_duration_sum{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI"} 3600 - promgithub_job_duration_count{branch="main",job_conclusion="success",job_status="completed",repository="user/repo",workflow_name="CI"} 1 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="0.005"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="0.01"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="0.025"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="0.05"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="0.1"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="0.25"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="0.5"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="1"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="2.5"} 0 + 
promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="5"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="10"} 0 + promgithub_job_duration_bucket{job_conclusion="success",job_status="completed",repository="user/repo",le="+Inf"} 1 + promgithub_job_duration_sum{job_conclusion="success",job_status="completed",repository="user/repo"} 3600 + promgithub_job_duration_count{job_conclusion="success",job_status="completed",repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -180,7 +225,7 @@ func TestWorkflowQueuedGauge(t *testing.T) { if err := testutil.CollectAndCompare(workflowQueuedGauge, strings.NewReader(` # HELP promgithub_workflow_queued Number of workflow runs queued # TYPE promgithub_workflow_queued gauge - promgithub_workflow_queued{branch="main",repository="user/repo",workflow_name="CI"} 1 + promgithub_workflow_queued{repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -213,7 +258,7 @@ func TestWorkflowInProgressGauge(t *testing.T) { if err := testutil.CollectAndCompare(workflowInProgressGauge, strings.NewReader(` # HELP promgithub_workflow_in_progress Number of workflow runs in progress # TYPE promgithub_workflow_in_progress gauge - promgithub_workflow_in_progress{branch="main",repository="user/repo",workflow_name="CI"} 1 + promgithub_workflow_in_progress{repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -232,7 +277,7 @@ func TestWorkflowCompletedGauge(t *testing.T) { if err := testutil.CollectAndCompare(workflowCompletedGauge, strings.NewReader(` # HELP promgithub_workflow_completed Number of workflow runs completed # TYPE promgithub_workflow_completed gauge - promgithub_workflow_completed{branch="main",repository="user/repo",workflow_conclusion="success",workflow_name="CI"} 1 + 
promgithub_workflow_completed{repository="user/repo",workflow_conclusion="success"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -265,7 +310,7 @@ func TestJobQueuedGauge(t *testing.T) { if err := testutil.CollectAndCompare(jobQueuedGauge, strings.NewReader(` # HELP promgithub_job_queued Number of jobs queued # TYPE promgithub_job_queued gauge - promgithub_job_queued{branch="main",repository="user/repo",workflow_name="CI"} 1 + promgithub_job_queued{repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -298,7 +343,7 @@ func TestJobInProgressGauge(t *testing.T) { if err := testutil.CollectAndCompare(jobInProgressGauge, strings.NewReader(` # HELP promgithub_job_in_progress Number of jobs in progress # TYPE promgithub_job_in_progress gauge - promgithub_job_in_progress{branch="main",repository="user/repo",workflow_name="CI"} 1 + promgithub_job_in_progress{repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -317,7 +362,7 @@ func TestJobCompletedGauge(t *testing.T) { if err := testutil.CollectAndCompare(jobCompletedGauge, strings.NewReader(` # HELP promgithub_job_completed Number of jobs completed # TYPE promgithub_job_completed gauge - promgithub_job_completed{branch="main",job_conclusion="success",repository="user/repo",workflow_name="CI"} 1 + promgithub_job_completed{job_conclusion="success",repository="user/repo"} 1 `)); err != nil { t.Errorf("unexpected metrics: %v", err) } @@ -358,13 +403,13 @@ func TestWorkflowGaugeTransitionIsIdempotent(t *testing.T) { updateWorkflowMetrics(context.Background(), completedBody) updateWorkflowMetrics(context.Background(), inProgressBody) - if got := testutil.ToFloat64(workflowQueuedGauge.WithLabelValues("user/repo", "main", "CI")); got != 0 { + if got := testutil.ToFloat64(workflowQueuedGauge.WithLabelValues("user/repo")); got != 0 { t.Fatalf("expected queued gauge to be 0, got %v", got) } - if got := 
testutil.ToFloat64(workflowInProgressGauge.WithLabelValues("user/repo", "main", "CI")); got != 0 { + if got := testutil.ToFloat64(workflowInProgressGauge.WithLabelValues("user/repo")); got != 0 { t.Fatalf("expected in progress gauge to be 0, got %v", got) } - if got := testutil.ToFloat64(workflowCompletedGauge.WithLabelValues("user/repo", "main", "success", "CI")); got != 1 { + if got := testutil.ToFloat64(workflowCompletedGauge.WithLabelValues("user/repo", "success")); got != 1 { t.Fatalf("expected completed gauge to be 1, got %v", got) } } @@ -405,13 +450,13 @@ func TestJobGaugeTransitionIsIdempotent(t *testing.T) { updateJobMetrics(context.Background(), completedBody) updateJobMetrics(context.Background(), inProgressBody) - if got := testutil.ToFloat64(jobQueuedGauge.WithLabelValues("user/repo", "main", "CI")); got != 0 { + if got := testutil.ToFloat64(jobQueuedGauge.WithLabelValues("user/repo")); got != 0 { t.Fatalf("expected queued gauge to be 0, got %v", got) } - if got := testutil.ToFloat64(jobInProgressGauge.WithLabelValues("user/repo", "main", "CI")); got != 0 { + if got := testutil.ToFloat64(jobInProgressGauge.WithLabelValues("user/repo")); got != 0 { t.Fatalf("expected in progress gauge to be 0, got %v", got) } - if got := testutil.ToFloat64(jobCompletedGauge.WithLabelValues("user/repo", "main", "success", "CI")); got != 1 { + if got := testutil.ToFloat64(jobCompletedGauge.WithLabelValues("user/repo", "success")); got != 1 { t.Fatalf("expected completed gauge to be 1, got %v", got) } }