@@ -18,9 +18,18 @@ function fixture(params: {
1818 kvSlots ?: number
1919 queueBuckets ?: Array < { le : string ; count : number } >
2020 ttftBuckets ?: Array < { le : string ; count : number } >
21+ /** deployment_replicas gauge. Defaults to 1 when omitted so existing tests stay healthy.
22+ * Set to 0 to emit a zero-valued gauge, or to null to omit the gauge line entirely (cold/deleted deployment). */
23+ replicas ?: number | null
2124} ) : string {
2225 const lines : string [ ] = [ ]
2326 const labels = `base_model="m",deployment="${ DEPLOYMENT } ",deployment_account="test-acc",deployment_id="d1"`
27+ const replicas = params . replicas === undefined ? 1 : params . replicas
28+ if ( replicas !== null ) {
29+ lines . push (
30+ `deployment_replicas{deployment_account="test-acc",deployment_id="d1"} ${ replicas } ` ,
31+ )
32+ }
2433 if ( params . requestRate !== undefined ) {
2534 lines . push ( `request_counter_total:sum_by_deployment{${ labels } } ${ params . requestRate } ` )
2635 }
@@ -182,9 +191,38 @@ describe('computeDeploymentHealth', () => {
182191 expect ( health . reasons . some ( ( r ) => r . includes ( 'error rate' ) ) ) . toBe ( true )
183192 } )
184193
// Zero-replica case: the deployment_replicas gauge is present but reports 0.
// All traffic inputs (requestRate, errorRate, kvBlocks) are 0 as well, and the
// test expects the replica count alone to yield an 'unhealthy' status.
194+ test ( 'flags deployment with zero replicas as unhealthy' , ( ) => {
195+ const metrics = parsePrometheusText (
// replicas : 0 is not null, so the fixture still emits a deployment_replicas sample (value 0).
196+ fixture ( { requestRate : 0 , errorRate : 0 , kvBlocks : 0 , replicas : 0 } ) ,
197+ )
198+ const health = computeDeploymentHealth ( {
199+ deployment : DEPLOYMENT ,
200+ metrics,
201+ thresholds : DEFAULT_HEALTH_THRESHOLDS ,
202+ } )
// Expect: unhealthy status, the parsed replica count surfaced as 0, and at
// least one reason string containing the substring 'replicas'.
203+ expect ( health . status ) . toBe ( 'unhealthy' )
204+ expect ( health . metrics . replicas ) . toBe ( 0 )
205+ expect ( health . reasons . some ( ( r ) => r . includes ( 'replicas' ) ) ) . toBe ( true )
206+ } )
207+
// Missing-gauge case: no deployment_replicas sample exists for the deployment.
// The test expects this to be reported as 'unhealthy' with a null replica count
// and a reason distinct from the zero-replica case ('cold or deleted').
208+ test ( 'flags deployment with no replicas metric as unhealthy (cold / deleted)' , ( ) => {
209+ const metrics = parsePrometheusText (
// replicas : null makes the fixture skip the deployment_replicas line altogether.
210+ fixture ( { requestRate : 0 , errorRate : 0 , kvBlocks : 0 , replicas : null } ) ,
211+ )
212+ const health = computeDeploymentHealth ( {
213+ deployment : DEPLOYMENT ,
214+ metrics,
215+ thresholds : DEFAULT_HEALTH_THRESHOLDS ,
216+ } )
// Expect: unhealthy status, replicas reported as null (not 0), and at least
// one reason string containing the substring 'cold or deleted'.
217+ expect ( health . status ) . toBe ( 'unhealthy' )
218+ expect ( health . metrics . replicas ) . toBeNull ( )
219+ expect ( health . reasons . some ( ( r ) => r . includes ( 'cold or deleted' ) ) ) . toBe ( true )
220+ } )
221+
185222 test ( 'sums error counters across multiple HTTP codes' , ( ) => {
186223 const labels = `base_model="m",deployment="${ DEPLOYMENT } ",deployment_id="d1"`
187224 const text = [
225+ `deployment_replicas{deployment_account="test-acc",deployment_id="d1"} 1` ,
188226 `request_counter_total:sum_by_deployment{${ labels } } 100` ,
189227 `requests_error_total:sum_by_deployment{${ labels } ,http_code="500"} 3` ,
190228 `requests_error_total:sum_by_deployment{${ labels } ,http_code="429"} 5` ,
@@ -231,9 +269,11 @@ describe('computeSnapshot', () => {
231269 test ( 'overall status is the worst across deployments' , ( ) => {
232270 const dep2 = 'accounts/test-acc/deployments/d2'
233271 const text = [
272+ `deployment_replicas{deployment_id="d1"} 1` ,
234273 `request_counter_total:sum_by_deployment{deployment="${ DEPLOYMENT } ",deployment_id="d1"} 100` ,
235274 `requests_error_total:sum_by_deployment{deployment="${ DEPLOYMENT } ",deployment_id="d1",http_code="500"} 0` ,
236275 `generator_kv_blocks_fraction:avg_by_deployment{deployment="${ DEPLOYMENT } ",deployment_id="d1"} 0.1` ,
276+ `deployment_replicas{deployment_id="d2"} 1` ,
237277 `request_counter_total:sum_by_deployment{deployment="${ dep2 } ",deployment_id="d2"} 100` ,
238278 `requests_error_total:sum_by_deployment{deployment="${ dep2 } ",deployment_id="d2",http_code="500"} 30` ,
239279 `generator_kv_blocks_fraction:avg_by_deployment{deployment="${ dep2 } ",deployment_id="d2"} 0.1` ,
0 commit comments