Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@ func main() {
var enableHTTP2 bool
var tlsOpts []func(*tls.Config)

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8443",
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080",
"The address the metrics endpoint binds to. Use 0 to disable.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081",
"The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", true,
"Enable leader election for controller manager.")
flag.BoolVar(&secureMetrics, "metrics-secure", true,
flag.BoolVar(&secureMetrics, "metrics-secure", false,
"If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.")
flag.StringVar(&metricsCertPath, "metrics-cert-path", "",
"The directory that contains the metrics server certificate.")
Expand Down Expand Up @@ -153,9 +153,11 @@ func main() {
platform.SnapshotRegion = v
}

nodeRecorder := mgr.GetEventRecorderFor("seinode-controller")
if err := (&nodecontroller.SeiNodeReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: nodeRecorder,
Platform: platform,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "Failed to create controller", "controller", "SeiNode")
Expand Down
1 change: 1 addition & 0 deletions config/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ resources:
- ../rbac
- ../manager
- ../network-policy
- ../monitoring
7 changes: 5 additions & 2 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ spec:
- command:
- /manager
args:
- --metrics-bind-address=:8443
- --metrics-bind-address=:8080
- --leader-elect
- --health-probe-bind-address=:8081
image: 189176372795.dkr.ecr.us-east-2.amazonaws.com/sei/sei-k8s-controller@sha256:87b8b8ed04013f23ada6f2a34e162c98708c90424b2e6cd552bc7ac1e4284d9f
Expand Down Expand Up @@ -63,7 +63,10 @@ spec:
value: eu-central-1
- name: SEI_CONTROLLER_SA_PRINCIPAL
value: "cluster.local/ns/sei-k8s-controller-system/sa/sei-k8s-controller-manager"
ports: []
ports:
- containerPort: 8080
name: metrics
protocol: TCP
securityContext:
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
Expand Down
6 changes: 3 additions & 3 deletions config/manager/metrics_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ metadata:
namespace: system
spec:
ports:
- name: https
port: 8443
- name: http-metrics
port: 8080
protocol: TCP
targetPort: 8443
targetPort: 8080
selector:
control-plane: controller-manager
app.kubernetes.io/name: sei-k8s-controller
6 changes: 6 additions & 0 deletions config/monitoring/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Kustomization for the monitoring directory: bundles the ServiceMonitor and
# PrometheusRule manifests listed under resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- service-monitor.yaml
- prometheus-rule.yaml
82 changes: 82 additions & 0 deletions config/monitoring/prometheus-rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# PrometheusRule holding the alerting rules for the sei-k8s-controller.
# Every rule carries team=platform; severity is warning except for
# SeiNodeGroupFailed, which is critical.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: controller-alerts
namespace: system
labels:
control-plane: controller-manager
app.kubernetes.io/name: sei-k8s-controller
app.kubernetes.io/managed-by: kustomize
spec:
groups:
- name: sei-controller
rules:
# Fires when a group's Degraded phase series has read 1 for 10 minutes.
- alert: SeiNodeGroupDegraded
expr: sei_controller_seinodegroup_phase{phase="Degraded"} == 1
for: 10m
labels:
severity: warning
team: platform
annotations:
summary: "SeiNodeGroup {{ $labels.namespace }}/{{ $labels.name }} is degraded"
description: "Some nodes are not ready. Check child SeiNode status."

# Fires when a group's Failed phase series has read 1 for 5 minutes.
- alert: SeiNodeGroupFailed
expr: sei_controller_seinodegroup_phase{phase="Failed"} == 1
for: 5m
labels:
severity: critical
team: platform
annotations:
summary: "SeiNodeGroup {{ $labels.namespace }}/{{ $labels.name }} has failed"
description: "All nodes have failed. Immediate investigation required."

# Fires when a node's Initializing phase series has read 1 for 30 minutes.
- alert: SeiNodeStuckInitializing
expr: sei_controller_seinode_phase{phase="Initializing"} == 1
for: 30m
labels:
severity: warning
team: platform
annotations:
summary: "SeiNode {{ $labels.namespace }}/{{ $labels.name }} stuck initializing"
description: "Node has been in Initializing phase for over 30 minutes."

# The max by (namespace, name) drops the phase label, so the Pending and
# PreInitializing series of one node collapse into a single alert series and
# the 15m window spans time spent in either phase.
- alert: SeiNodeStuckPending
expr: max by (namespace, name) (sei_controller_seinode_phase{phase=~"Pending|PreInitializing"}) == 1
for: 15m
labels:
severity: warning
team: platform
annotations:
summary: "SeiNode {{ $labels.namespace }}/{{ $labels.name }} stuck pending"
description: "Node has been in Pending/PreInitializing phase for over 15 minutes."

# Fires when the 5m rate of the sidecar-unreachable counter stays above 0.5/s
# for 10 minutes. The counter's node-name label is "node", not "name".
- alert: SidecarUnreachableHigh
expr: rate(sei_controller_sidecar_unreachable_total[5m]) > 0.5
for: 10m
labels:
severity: warning
team: platform
annotations:
summary: "Sidecar for {{ $labels.namespace }}/{{ $labels.node }} is frequently unreachable"
description: "Sidecar connectivity failures sustained above 0.5/s for 10 minutes."

# Fires when a series gained more than 5 reconcile errors over a 15m window
# and that condition has held for 5 minutes.
- alert: ControllerReconcileErrors
expr: increase(sei_controller_reconcile_errors_total[15m]) > 5
for: 5m
labels:
severity: warning
team: platform
annotations:
summary: "Controller {{ $labels.controller }} has elevated reconcile errors"
description: "More than 5 reconcile errors in the last 15 minutes for {{ $labels.namespace }}/{{ $labels.name }}."

# p99 of the SeiNodeGroup reconcile substep histogram, computed from the 5m
# bucket rate with no aggregation, so each label set is evaluated on its own.
- alert: ControllerHighReconcileLatency
expr: histogram_quantile(0.99, rate(sei_controller_seinodegroup_reconcile_substep_duration_seconds_bucket[5m])) > 10
for: 10m
labels:
severity: warning
team: platform
annotations:
summary: "SeiNodeGroup reconcile substep {{ $labels.substep }} is slow"
description: "p99 latency above 10s for 10 minutes."
18 changes: 18 additions & 0 deletions config/monitoring/service-monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# ServiceMonitor for the controller manager metrics endpoint. It selects
# Services carrying both labels under matchLabels and scrapes /metrics every
# 30s on the Service port named http-metrics.
# NOTE(review): the port is referenced by name, so it must stay in sync with
# the port name on the metrics Service -- confirm when renaming either side.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: controller-manager-metrics
namespace: system
labels:
control-plane: controller-manager
app.kubernetes.io/name: sei-k8s-controller
app.kubernetes.io/managed-by: kustomize
spec:
selector:
matchLabels:
control-plane: controller-manager
app.kubernetes.io/name: sei-k8s-controller
endpoints:
- port: http-metrics
path: /metrics
interval: 30s
2 changes: 1 addition & 1 deletion config/network-policy/allow-metrics-traffic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ spec:
matchLabels:
metrics: enabled
ports:
- port: 8443
- port: 8080
protocol: TCP
46 changes: 42 additions & 4 deletions internal/controller/node/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
Expand Down Expand Up @@ -68,6 +69,7 @@ func DefaultPlatformConfig() PlatformConfig {
type SeiNodeReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Platform PlatformConfig
// BuildSidecarClientFn overrides sidecar client construction for testing.
BuildSidecarClientFn func(node *seiv1alpha1.SeiNode) SidecarStatusClient
Expand All @@ -81,6 +83,7 @@ type SeiNodeReconciler struct {
// +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch
// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch

func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
node := &seiv1alpha1.SeiNode{}
Expand All @@ -91,6 +94,10 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
return ctrl.Result{}, err
}

if node.Status.Phase != "" {
emitNodePhase(node.Namespace, node.Name, node.Status.Phase)
}

if !node.DeletionTimestamp.IsZero() {
return r.handleNodeDeletion(ctx, node)
}
Expand Down Expand Up @@ -144,6 +151,13 @@ func (r *SeiNodeReconciler) reconcilePending(ctx context.Context, node *seiv1alp
if err := r.Status().Patch(ctx, node, patch); err != nil {
return ctrl.Result{}, fmt.Errorf("initializing plans: %w", err)
}

ns, name := node.Namespace, node.Name
nodePhaseTransitions.WithLabelValues(ns, string(seiv1alpha1.PhasePending), string(seiv1alpha1.PhasePreInitializing)).Inc()
emitNodePhase(ns, name, seiv1alpha1.PhasePreInitializing)
r.Recorder.Eventf(node, corev1.EventTypeNormal, "PhaseTransition",
"Phase changed from %s to %s", seiv1alpha1.PhasePending, seiv1alpha1.PhasePreInitializing)

return ctrl.Result{RequeueAfter: immediateRequeue}, nil
}

Expand All @@ -159,6 +173,7 @@ func (r *SeiNodeReconciler) reconcileInitializing(ctx context.Context, node *sei

sc := r.buildSidecarClient(node)
if sc == nil {
sidecarUnreachableTotal.WithLabelValues(node.Namespace, node.Name).Inc()
log.FromContext(ctx).Info("sidecar not reachable yet, will retry")
return ctrl.Result{RequeueAfter: taskPollInterval}, nil
}
Expand All @@ -169,10 +184,10 @@ func (r *SeiNodeReconciler) reconcileInitializing(ctx context.Context, node *sei
}

if node.Status.InitPlan.Phase == seiv1alpha1.TaskPlanComplete {
return r.setPhase(ctx, node, seiv1alpha1.PhaseRunning)
return r.transitionPhase(ctx, node, seiv1alpha1.PhaseRunning)
}
if node.Status.InitPlan.Phase == seiv1alpha1.TaskPlanFailed {
return r.setPhase(ctx, node, seiv1alpha1.PhaseFailed)
return r.transitionPhase(ctx, node, seiv1alpha1.PhaseFailed)
}
return result, nil
}
Expand All @@ -181,19 +196,40 @@ func (r *SeiNodeReconciler) reconcileInitializing(ctx context.Context, node *sei
// reconcileRunning handles a node in the Running phase. When a sidecar client
// can be built it hands off to reconcileRuntimeTasks; otherwise it bumps the
// sidecar-unreachable counter for this node, logs the miss, and asks to be
// requeued after statusPollInterval without reporting an error.
func (r *SeiNodeReconciler) reconcileRunning(ctx context.Context, node *seiv1alpha1.SeiNode) (ctrl.Result, error) {
	if sidecar := r.buildSidecarClient(node); sidecar != nil {
		return r.reconcileRuntimeTasks(ctx, node, sidecar)
	}

	// No client available: record the miss and poll again later.
	sidecarUnreachableTotal.WithLabelValues(node.Namespace, node.Name).Inc()
	logger := log.FromContext(ctx)
	logger.Info("sidecar not reachable, will retry")

	retry := ctrl.Result{RequeueAfter: statusPollInterval}
	return retry, nil
}

// setPhase transitions the node to a new phase.
func (r *SeiNodeReconciler) setPhase(ctx context.Context, node *seiv1alpha1.SeiNode, phase seiv1alpha1.SeiNodePhase) (ctrl.Result, error) {
// transitionPhase moves the node to the given phase and reports the change.
//
// It patches node.Status.Phase through the status subresource and, only if the
// patch succeeds, increments the transition counter, refreshes the per-node
// phase gauge, and records a "PhaseTransition" event on the node. An empty
// previous phase is reported as Pending.
//
// A transition to Failed is recorded as a Warning event so that it stands out
// from routine progress; every other transition is a Normal event.
//
// On a transition to Running the elapsed time since the node's creation
// timestamp is observed as the init duration. The observation is skipped when
// the creation timestamp is unset, because time.Since on the zero time would
// record a meaningless, saturated duration.
//
// Returns a result requesting an immediate requeue, or a wrapped error if the
// status patch fails (in which case no metric or event is emitted).
func (r *SeiNodeReconciler) transitionPhase(ctx context.Context, node *seiv1alpha1.SeiNode, phase seiv1alpha1.SeiNodePhase) (ctrl.Result, error) {
	prev := node.Status.Phase
	if prev == "" {
		prev = seiv1alpha1.PhasePending
	}

	// Snapshot before mutating so the merge patch carries only the phase change.
	patch := client.MergeFrom(node.DeepCopy())
	node.Status.Phase = phase
	if err := r.Status().Patch(ctx, node, patch); err != nil {
		return ctrl.Result{}, fmt.Errorf("setting phase to %s: %w", phase, err)
	}

	ns, name := node.Namespace, node.Name
	nodePhaseTransitions.WithLabelValues(ns, string(prev), string(phase)).Inc()
	emitNodePhase(ns, name, phase)

	if phase == seiv1alpha1.PhaseRunning && !node.CreationTimestamp.IsZero() {
		dur := time.Since(node.CreationTimestamp.Time).Seconds()
		nodeInitDuration.WithLabelValues(ns, node.Spec.ChainID).Observe(dur)
		nodeLastInitDuration.WithLabelValues(ns, name).Set(dur)
	}

	eventType := corev1.EventTypeNormal
	if phase == seiv1alpha1.PhaseFailed {
		eventType = corev1.EventTypeWarning
	}
	r.Recorder.Eventf(node, eventType, "PhaseTransition",
		"Phase changed from %s to %s", prev, phase)

	return ctrl.Result{RequeueAfter: immediateRequeue}, nil
}

Expand Down Expand Up @@ -235,6 +271,8 @@ func (r *SeiNodeReconciler) handleNodeDeletion(ctx context.Context, node *seiv1a
}
}

cleanupNodeMetrics(node.Namespace, node.Name)

controllerutil.RemoveFinalizer(node, nodeFinalizerName)
return ctrl.Result{}, r.Update(ctx, node)
}
Expand Down
92 changes: 92 additions & 0 deletions internal/controller/node/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package node

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"

seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1"
"github.com/sei-protocol/sei-k8s-controller/internal/controller/observability"
)

// allNodePhases lists the SeiNode phases as plain strings. It is handed to the
// observability phase-gauge helpers together with nodePhaseGauge, both when a
// phase is emitted and when a node's series are deleted.
var allNodePhases = []string{
string(seiv1alpha1.PhasePending),
string(seiv1alpha1.PhasePreInitializing),
string(seiv1alpha1.PhaseInitializing),
string(seiv1alpha1.PhaseRunning),
string(seiv1alpha1.PhaseFailed),
string(seiv1alpha1.PhaseTerminating),
}

// Prometheus collectors for the SeiNode controller. All of them are registered
// with the controller-runtime metrics registry in this file's init function.
var (
// nodePhaseGauge has one series per (namespace, name, phase). Per its help
// text the active phase reads 1 and the others 0. It is written through
// emitNodePhase and its series are removed by cleanupNodeMetrics.
nodePhaseGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "sei_controller_seinode_phase",
Help: "Current phase of each SeiNode (1=active, 0=inactive)",
},
[]string{"namespace", "name", "phase"},
)

// nodePhaseTransitions counts phase transitions by namespace and by the
// from/to phase pair. It carries no per-node label, so cleanupNodeMetrics
// does not touch it.
nodePhaseTransitions = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "sei_controller_seinode_phase_transitions_total",
Help: "Phase state machine transitions",
},
[]string{"namespace", "from", "to"},
)

// nodeInitDuration is a histogram of node init durations in seconds, keyed by
// namespace and chain_id, using the bucket layout observability.InitBuckets.
nodeInitDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "sei_controller_seinode_init_duration_seconds",
Help: "Time from Pending to Running",
Buckets: observability.InitBuckets,
},
[]string{"namespace", "chain_id"},
)

// nodeLastInitDuration is the per-node counterpart of nodeInitDuration: a
// gauge keyed by (namespace, name). Its series are removed by
// cleanupNodeMetrics.
nodeLastInitDuration = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "sei_controller_seinode_last_init_duration_seconds",
Help: "Per-node init duration, set once when node reaches Running",
},
[]string{"namespace", "name"},
)

// sidecarRequestDuration is a histogram of sidecar HTTP request durations,
// keyed by namespace, method, route and status_code, using the bucket layout
// observability.ReconcileBuckets.
// NOTE(review): no observation site is visible in this file -- confirm where
// it is recorded.
sidecarRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "sei_controller_sidecar_request_duration_seconds",
Help: "Duration of HTTP requests to the seictl sidecar",
Buckets: observability.ReconcileBuckets,
},
[]string{"namespace", "method", "route", "status_code"},
)

// sidecarUnreachableTotal counts sidecar-unreachable occurrences per node.
// Note that the node-name label is called "node" here, whereas the gauges
// above call it "name". Its series are removed by cleanupNodeMetrics.
sidecarUnreachableTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "sei_controller_sidecar_unreachable_total",
Help: "Number of times the sidecar was unreachable",
},
[]string{"namespace", "node"},
)
)

func init() {
metrics.Registry.MustRegister(
nodePhaseGauge,
nodePhaseTransitions,
nodeInitDuration,
nodeLastInitDuration,
sidecarRequestDuration,
sidecarUnreachableTotal,
)
}

// emitNodePhase reports phase as the current phase of the SeiNode ns/name. It
// forwards the phase, as a string, to observability.EmitPhaseGauge together
// with nodePhaseGauge and the full phase list in allNodePhases.
func emitNodePhase(ns, name string, phase seiv1alpha1.SeiNodePhase) {
	current := string(phase)
	observability.EmitPhaseGauge(nodePhaseGauge, ns, name, current, allNodePhases)
}

// cleanupNodeMetrics drops the metric series that are keyed by an individual
// SeiNode: its phase gauge series, its last-init-duration gauge, its
// sidecar-unreachable counter, and its reconcile-error counter under the
// SeiNode controller name. Metrics without a per-node label are left alone.
func cleanupNodeMetrics(namespace, name string) {
	// Label values shared by the collectors keyed on (namespace, node name).
	nodeKey := []string{namespace, name}

	observability.DeletePhaseGauge(nodePhaseGauge, namespace, name, allNodePhases)
	nodeLastInitDuration.DeleteLabelValues(nodeKey...)
	sidecarUnreachableTotal.DeleteLabelValues(nodeKey...)
	observability.ReconcileErrorsTotal.DeleteLabelValues(seiNodeControllerName, namespace, name)
}
Loading
Loading