Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ CLAUDE.md
docs/nav_order_index.txt
rust/.cargo/config.toml
sandbox/pgadmin-data/*
.claude/settings.json
plans/*
.claude/settings.json
.entire/.gitignore
.entire/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Scheduler orchestrator tables for distributed cluster assignment (Mode 4)

-- One row per registered scheduler instance: identity, optional facility,
-- heartbeat/registration timestamps, capacity, and drain flag.
CREATE TABLE scheduler_instance (
    -- No default here: the inserting client must supply the instance id.
    pk_instance         UUID              PRIMARY KEY,
    str_name            VARCHAR(256)      NOT NULL,
    -- Nullable foreign key, so a row may be stored without a facility.
    pk_facility         VARCHAR(36)       REFERENCES facility(pk_facility),
    -- Both timestamps default to the insert time.
    -- NOTE(review): ts_heartbeat is presumably refreshed by the running
    -- instance after registration -- confirm against the writer code.
    ts_heartbeat        TIMESTAMPTZ       NOT NULL DEFAULT NOW(),
    ts_registered       TIMESTAMPTZ       NOT NULL DEFAULT NOW(),
    -- NOTE(review): looks like a relative weight rather than an absolute
    -- limit -- confirm against the orchestrator's `capacity` setting.
    int_capacity        INTEGER           NOT NULL DEFAULT 100,
    float_jobs_queried  DOUBLE PRECISION  NOT NULL DEFAULT 0,
    b_draining          BOOLEAN           NOT NULL DEFAULT FALSE
);

-- Index for queries that filter or sort instances by heartbeat time.
CREATE INDEX idx_scheduler_instance_heartbeat
    ON scheduler_instance (ts_heartbeat);

-- Maps a cluster to the scheduler instance that currently holds it.
CREATE TABLE scheduler_cluster_assignment (
    -- Generated server-side when the insert omits an id.
    pk_assignment     UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Deleting the referenced scheduler_instance row also deletes its
    -- assignment rows (ON DELETE CASCADE).
    pk_instance       UUID         NOT NULL
                                   REFERENCES scheduler_instance(pk_instance)
                                   ON DELETE CASCADE,
    str_cluster_id    TEXT         NOT NULL,
    -- NOTE(review): presumably the serialized cluster definition -- confirm
    -- the expected JSON shape with the code that writes this column.
    str_cluster_json  TEXT         NOT NULL,
    int_version       INTEGER      NOT NULL DEFAULT 0,
    ts_assigned       TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    -- A given cluster id can appear in at most one row, so a cluster is never
    -- assigned to two instances at the same time.
    UNIQUE (str_cluster_id)
);

-- Index for looking up all assignments that belong to one instance.
CREATE INDEX idx_sca_instance
    ON scheduler_cluster_assignment (pk_instance);
39 changes: 39 additions & 0 deletions rust/config/scheduler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,42 @@ scheduler:
# ignore_tags:
# - tag_to_ignore1
# - tag_to_ignore2

# =============================================================================
# ORCHESTRATOR CONFIGURATION
# =============================================================================
# Controls leader election, heartbeating, and cluster distribution for
# multi-instance deployments. All durations use humantime format (e.g. 5s, 30s).
# orchestrator:
# How often this instance updates its heartbeat
# Default: 5s
# heartbeat_interval: 5s

# Instance is considered dead after this duration without heartbeat
# Default: 30s
# failure_threshold: 30s

# How often the leader recalculates cluster distribution
# Default: 10s
# distribution_interval: 10s

# How often workers poll for assignment changes
# Default: 5s
# poll_interval: 5s

# How often non-leaders attempt to acquire the leader lock
# Default: 10s
# election_interval: 10s

# Relative capacity weight of this instance (higher = more clusters assigned)
# Default: 100
# capacity: 100

# Graceful shutdown timeout before force-killing in-flight work
# Default: 30s
# shutdown_timeout: 30s

# How long a cluster assignment is preserved before becoming eligible for
# redistribution. Expired assignments can be handed to newly joined instances,
# which prevents those instances from remaining idle.
# Default: 120s
# assignment_ttl: 120s
6 changes: 5 additions & 1 deletion rust/crates/scheduler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ tonic = { workspace = true }
itertools = "0.13.0"
humantime = "2.2.0"
humantime-serde = "1.1.1"
sqlx = { version = "0.8", features = ["runtime-tokio", "postgres", "chrono"] }
sqlx = { version = "0.8", features = ["runtime-tokio", "postgres", "chrono", "uuid"] }
home = { workspace = true }
structopt = { workspace = true }
once_cell = "1.13"
Expand All @@ -57,13 +57,17 @@ sentry = { version = "0.47", features = ["tracing"] }
axum = "0.7"
tower-http = { version = "0.5", features = ["trace"] }
urlencoding = "2.1"
gethostname = "0.4"
rand = "0.8"

[features]
default = []
smoke-tests = []
integration-tests = []

[dev-dependencies]
tokio-test = "0.4"
tracing-test = "0.2"
serial_test = "3.0"
rand = "0.8"
pg-embed = "1.0"
1 change: 1 addition & 0 deletions rust/crates/scheduler/resources/migrations
Loading
Loading