Skip to content

Commit de50eaf

Browse files
bfirshclaude
andcommitted
fix(coglet): return 503 from /health-check when not ready
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 938d9f0 commit de50eaf

10 files changed

Lines changed: 55 additions & 51 deletions

File tree

crates/coglet/src/transport/http/routes.rs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,9 @@ fn generate_prediction_id() -> String {
9999
format!("pred_{:x}", timestamp)
100100
}
101101

102-
async fn health_check(State(service): State<Arc<PredictionService>>) -> Json<HealthCheckResponse> {
102+
async fn health_check(
103+
State(service): State<Arc<PredictionService>>,
104+
) -> (StatusCode, Json<HealthCheckResponse>) {
103105
let snapshot = service.health().await;
104106

105107
// Run user healthcheck if ready and not busy
@@ -116,10 +118,13 @@ async fn health_check(State(service): State<Arc<PredictionService>>) -> Json<Hea
116118
None
117119
};
118120

119-
Json(HealthCheckResponse::from_snapshot(
120-
snapshot,
121-
user_healthcheck_error,
122-
))
121+
let response = HealthCheckResponse::from_snapshot(snapshot, user_healthcheck_error);
122+
let status_code = match response.status {
123+
HealthResponse::Ready => StatusCode::OK,
124+
_ => StatusCode::SERVICE_UNAVAILABLE,
125+
};
126+
127+
(status_code, Json(response))
123128
}
124129

125130
/// Write /var/run/cog/ready for K8s readiness probe.
@@ -549,7 +554,7 @@ mod tests {
549554
.await
550555
.unwrap();
551556

552-
assert_eq!(response.status(), StatusCode::OK);
557+
assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE);
553558

554559
let json = response_json(response).await;
555560
assert_eq!(json["status"], "STARTING");
@@ -566,6 +571,7 @@ mod tests {
566571
.await
567572
.unwrap();
568573

574+
assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE);
569575
let json = response_json(response).await;
570576
assert_eq!(json["status"], "UNKNOWN");
571577
}
@@ -920,6 +926,7 @@ mod tests {
920926
.await
921927
.unwrap();
922928

929+
assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE);
923930
let json = response_json(response).await;
924931
assert_eq!(json["status"], "BUSY");
925932
}

docs/http.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,8 @@ If the upload fails, the server responds with an error.
214214
### `GET /health-check`
215215

216216
Returns the current health status of the model container.
217-
This endpoint always responds with `200 OK`
218-
check the `status` field in the response body to determine readiness.
217+
Responds with `200 OK` when the model is ready,
218+
or `503 Service Unavailable` otherwise.
219219

220220
The response body is a JSON object with the following fields:
221221

docs/llms.txt

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

integration-tests/harness/harness.go

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,8 @@ func (h *Harness) cmdCurl(ts *testscript.TestScript, neg bool, args []string) {
458458

459459
if neg {
460460
if !statusOK {
461-
// Expected to fail - success!
461+
// Expected to fail - write body to stdout and return
462+
_, _ = ts.Stdout().Write([]byte(respBody))
462463
return
463464
}
464465
} else {
@@ -551,9 +552,9 @@ type healthCheckResponse struct {
551552
Status string `json:"status"`
552553
}
553554

554-
// waitForServer polls the server's health-check endpoint until it returns READY status.
555-
// The server may return HTTP 200 while still in STARTING state (during setup),
556-
// so we must check the actual status field in the response.
555+
// waitForServer polls the server's health-check endpoint until it returns a
556+
// post-setup status. It checks the status field in the JSON response body
557+
// regardless of HTTP status code.
557558
func waitForServer(serverURL string, timeout time.Duration) bool {
558559
client := &http.Client{Timeout: 5 * time.Second}
559560
deadline := time.Now().Add(timeout)
@@ -565,34 +566,30 @@ func waitForServer(serverURL string, timeout time.Duration) bool {
565566
continue
566567
}
567568

568-
if resp.StatusCode == http.StatusOK {
569-
body, err := io.ReadAll(resp.Body)
570-
_ = resp.Body.Close()
571-
if err != nil {
572-
time.Sleep(200 * time.Millisecond)
573-
continue
574-
}
569+
body, err := io.ReadAll(resp.Body)
570+
_ = resp.Body.Close()
571+
if err != nil {
572+
time.Sleep(200 * time.Millisecond)
573+
continue
574+
}
575575

576-
var health healthCheckResponse
577-
if err := json.Unmarshal(body, &health); err != nil {
578-
time.Sleep(200 * time.Millisecond)
579-
continue
580-
}
576+
var health healthCheckResponse
577+
if err := json.Unmarshal(body, &health); err != nil {
578+
time.Sleep(200 * time.Millisecond)
579+
continue
580+
}
581581

582-
// Return success when the server has completed setup
583-
// READY = setup completed, healthcheck passed (or no healthcheck)
584-
// UNHEALTHY = setup completed, but user healthcheck failed
585-
// BUSY = setup completed, prediction in progress
586-
if health.Status == "READY" || health.Status == "UNHEALTHY" || health.Status == "BUSY" {
587-
return true
588-
}
582+
// Return success when the server has completed setup
583+
// READY = setup completed, healthcheck passed (or no healthcheck)
584+
// UNHEALTHY = setup completed, but user healthcheck failed
585+
// BUSY = setup completed, prediction in progress
586+
if health.Status == "READY" || health.Status == "UNHEALTHY" || health.Status == "BUSY" {
587+
return true
588+
}
589589

590-
// If setup failed, no point waiting
591-
if health.Status == "SETUP_FAILED" || health.Status == "DEFUNCT" {
592-
return false
593-
}
594-
} else {
595-
_ = resp.Body.Close()
590+
// If setup failed, no point waiting
591+
if health.Status == "SETUP_FAILED" || health.Status == "DEFUNCT" {
592+
return false
596593
}
597594

598595
time.Sleep(200 * time.Millisecond)

integration-tests/tests/healthcheck_async_exception.txtar

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ cog build -t $TEST_IMAGE
77
# Start the server
88
cog serve
99

10-
# Test: Async healthcheck raising exception gives UNHEALTHY with error
11-
curl GET /health-check
10+
# Test: Async healthcheck raising exception gives UNHEALTHY with error (503)
11+
! curl GET /health-check
1212
stdout '"status":"UNHEALTHY"'
1313
stdout 'user_healthcheck_error'
1414
stdout 'Async healthcheck error'

integration-tests/tests/healthcheck_async_timeout.txtar

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ stdout '"status":"READY"'
1515
curl POST /predictions '{"input":{"text":"trigger_slow"}}'
1616
stdout '"status":"succeeded"'
1717

18-
# Now healthcheck should timeout and return UNHEALTHY
19-
curl GET /health-check
18+
# Now healthcheck should timeout and return UNHEALTHY (503)
19+
! curl GET /health-check
2020
stdout '"status":"UNHEALTHY"'
2121
stdout 'user_healthcheck_error'
2222
stdout 'timed out after 5.0 seconds'

integration-tests/tests/healthcheck_async_unhealthy.txtar

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ cog build -t $TEST_IMAGE
77
# Start the server
88
cog serve
99

10-
# Test: Async healthcheck returning False gives UNHEALTHY
11-
curl GET /health-check
10+
# Test: Async healthcheck returning False gives UNHEALTHY (503)
11+
! curl GET /health-check
1212
stdout '"status":"UNHEALTHY"'
1313
stdout 'user_healthcheck_error'
1414
stdout 'returned False'

integration-tests/tests/healthcheck_exception.txtar

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ cog build -t $TEST_IMAGE
77
# Start the server
88
cog serve
99

10-
# Exception in healthcheck should return UNHEALTHY with error message
11-
curl GET /health-check
10+
# Exception in healthcheck should return UNHEALTHY with error message (503)
11+
! curl GET /health-check
1212
stdout '"status":"UNHEALTHY"'
1313
stdout 'user_healthcheck_error'
1414
stdout 'Critical system error'

integration-tests/tests/healthcheck_timeout.txtar

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ stdout '"status":"READY"'
1515
curl POST /predictions '{"input":{"text":"trigger_slow"}}'
1616
stdout '"status":"succeeded"'
1717

18-
# Now healthcheck should timeout and return UNHEALTHY
19-
curl GET /health-check
18+
# Now healthcheck should timeout and return UNHEALTHY (503)
19+
! curl GET /health-check
2020
stdout '"status":"UNHEALTHY"'
2121
stdout 'user_healthcheck_error'
2222
stdout 'timed out after 5.0 seconds'

integration-tests/tests/healthcheck_unhealthy.txtar

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ cog build -t $TEST_IMAGE
77
# Start the server
88
cog serve
99

10-
# Unhealthy healthcheck should return UNHEALTHY status
11-
curl GET /health-check
10+
# Unhealthy healthcheck should return UNHEALTHY status (503)
11+
! curl GET /health-check
1212
stdout '"status":"UNHEALTHY"'
1313
stdout 'user_healthcheck_error'
1414
stdout 'user-defined healthcheck returned False'

0 commit comments

Comments
 (0)