From 6e51978b91b3a69d83a6cd2d7e8718d6dd7f64e1 Mon Sep 17 00:00:00 2001
From: Junze Bao
Date: Tue, 30 Sep 2025 16:11:22 +0800
Subject: [PATCH] fix double counting

findNodesForPendingJob added node.FutureIdle() on top of every victim
task's own GPU request, so a node hosting several victim tasks had its
idle GPUs counted once per task and the reclaimable capacity was
overstated. Count each node's idle capacity once, keep the evictable
tasks in a separate list, and consume idle GPUs before evicting anything.

Signed-off-by: Junze Bao
---
 pkg/scheduler/actions/reclaim/reclaim.go | 142 +++++++++++++----------
 1 file changed, 81 insertions(+), 61 deletions(-)

diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go
index 0838e38803..25afacf70c 100644
--- a/pkg/scheduler/actions/reclaim/reclaim.go
+++ b/pkg/scheduler/actions/reclaim/reclaim.go
@@ -155,6 +155,7 @@ func getReclaimedResources(ssn *framework.Session, pendingJob *api.JobInfo, runn
 
 	for _, n := range pendingJobTopology {
 		finalPendingJobTopology[n.PendingTask.Name] = n
+		// Count total reclaimed capacity (idle + evicted) allocated to this pending task
 		reclaimedGPU += n.GPU
 	}
 
@@ -179,26 +180,6 @@ func isQueueOverused(ssn *framework.Session, victimJob *api.JobInfo) bool {
 	return queueAllocatedGPUs > queueDeservedGPUs
 }
 
-func noBudgetViolationAfterReclaim(ssn *framework.Session, victimJob, pendingJob *api.JobInfo) bool {
-	queueAllocatedGPUs := ssn.Queues[victimJob.Queue].GetAllocatedGPU()
-	queueDeservedGPUs := ssn.Queues[victimJob.Queue].GetDeservedGPU()
-
-	withoutVictimJob := int64(0)
-	if len(victimJob.Tasks) == int(victimJob.MinAvailable) {
-		// when it's not an elastic workload, we check the total request of the job
-		withoutVictimJob = queueAllocatedGPUs - victimJob.GetTotalRequestGPU()
-	} else {
-		// when it's an elastic workload, we check the minimum between the elastic GPUs and the requested GPUs of the victim job
-		elasticGPUs := victimJob.GetElasticGPUs()
-		requestedGPUs := pendingJob.GetTotalRequestGPU()
-		withoutVictimJob = queueAllocatedGPUs - min(elasticGPUs, requestedGPUs)
-	}
-	if queueDeservedGPUs <= withoutVictimJob {
-		return true
-	}
-	return false
-}
-
 type EvictTask struct {
 	NodeName     string
 	GPU          int64
 	TasksToEvict []*api.TaskInfo
 	PendingTask  *api.TaskInfo
 }
@@ -207,68 +188,107 @@ type EvictTask struct {
 }
 
 func findNodesForPendingJob(ssn *framework.Session, victimJob, pendingJob *api.JobInfo) map[string]*EvictTask {
-	// topology maps have "required number of GPUs" -> "node names"
-	// {1: ["node1", "node2"]} means the nodes
-	// using the VictimTask struct instead of a single node name string because we need to carry the task information
-	// and the idle gpu count on that node for the calculation below
-	victimNodes := map[string]*EvictTask{}
-	for _, task := range victimJob.Tasks {
-		node := ssn.Nodes[task.NodeName]
+	// Build per-node capacity with idle counted once and evictable tasks listed
+	type nodeCapacity struct {
+		nodeName     string
+		idleGPU      int64
+		evictable    []*api.TaskInfo
+		evictableGPU int64
+	}
+
+	nodeCaps := map[string]*nodeCapacity{}
+	for _, t := range victimJob.Tasks {
+		node := ssn.Nodes[t.NodeName]
 		if node == nil {
 			continue
 		}
-		// we need to consider future idle in case the node wasn't fully occupied by the victim task
-		nodeFutureIdleGPU := node.FutureIdle().ScalarResources["nvidia.com/gpu"]
-		numGPU := int64(task.Resreq.ScalarResources["nvidia.com/gpu"] + nodeFutureIdleGPU)
-		tasks, ok := victimNodes[task.NodeName]
+		gpu := int64(t.Resreq.ScalarResources["nvidia.com/gpu"])
+		cap, ok := nodeCaps[t.NodeName]
 		if !ok {
-			victimNodes[task.NodeName] = &EvictTask{
-				NodeName:     task.NodeName,
-				GPU:          numGPU,
-				TasksToEvict: []*api.TaskInfo{task},
-			}
-		} else {
-			victimNodes[task.NodeName] = &EvictTask{
-				NodeName:     task.NodeName,
-				GPU:          tasks.GPU + numGPU,
-				TasksToEvict: append(tasks.TasksToEvict, task),
+			cap = &nodeCapacity{
+				nodeName:     t.NodeName,
+				idleGPU:      int64(node.FutureIdle().ScalarResources["nvidia.com/gpu"]),
+				evictable:    []*api.TaskInfo{},
+				evictableGPU: 0,
 			}
+			nodeCaps[t.NodeName] = cap
 		}
+		cap.evictable = append(cap.evictable, t)
+		cap.evictableGPU += gpu
 	}
 
 	// record task name -> node name so we know how to pipeline the tasks later
 	pendingJobTopology := map[string]*EvictTask{}
 	for _, task := range pendingJob.Tasks {
+		required := int64(task.Resreq.ScalarResources["nvidia.com/gpu"])
+		placed := false
+		for _, cap := range nodeCaps {
+			totalAvail := cap.idleGPU + cap.evictableGPU
+			if totalAvail < required {
 				continue
 			}
+
+			// simulate consumption on this node
+			remainingIdle := cap.idleGPU
+			remainingTasks := make([]*api.TaskInfo, len(cap.evictable))
+			copy(remainingTasks, cap.evictable)
+
 			result := &EvictTask{
+				NodeName:    cap.nodeName,
 				PendingTask: task,
+				GPU:         0,
 			}
+
+			// use idle first
+			if remainingIdle > 0 {
+				use := remainingIdle
+				if use > required {
+					use = required
+				}
+				remainingIdle -= use
+				required -= use
+				result.GPU += use
+			}
+
+			// then evict tasks until satisfied
+			evictedIdx := 0
+			for required > 0 && evictedIdx < len(remainingTasks) {
+				vt := remainingTasks[evictedIdx]
+				g := int64(vt.Resreq.ScalarResources["nvidia.com/gpu"])
+				result.TasksToEvict = append(result.TasksToEvict, vt)
+				result.GPU += g
+				required -= g
+				evictedIdx++
+			}
+
+			if required > 0 {
+				// not enough even after evictions; try another node
+				continue
+			}
+
+			// commit consumption to this node capacity
+			cap.idleGPU = remainingIdle
+			if evictedIdx >= len(remainingTasks) {
+				cap.evictable = []*api.TaskInfo{}
+			} else {
+				cap.evictable = remainingTasks[evictedIdx:]
+			}
+			// recompute evictableGPU after removing evicted tasks
+			newEvictableGPU := int64(0)
+			for _, vt := range cap.evictable {
+				newEvictableGPU += int64(vt.Resreq.ScalarResources["nvidia.com/gpu"])
+			}
+			cap.evictableGPU = newEvictableGPU
+
 			pendingJobTopology[task.Name] = result
 			delete(pendingJob.Tasks, task.UID)
+			placed = true
 			break
 		}
+		if !placed {
+			// could not place this pending task with this victim job's nodes
+			continue
+		}
 	}
 
 	return pendingJobTopology
 }
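
Reviewer note (not part of the patch): below is a minimal standalone sketch of the accounting change, using hypothetical toy types in place of the scheduler's api.TaskInfo and NodeInfo. The old code folded node.FutureIdle() into every victim task's contribution, so a node running several victim tasks appeared to have more reclaimable GPUs than it physically has; the patched code counts idle capacity once per node.

package main

import "fmt"

// toyTask and toyNode are hypothetical stand-ins for the scheduler's types.
type toyTask struct{ gpu int64 }

type toyNode struct {
	idleGPU int64     // GPUs currently free on the node
	victims []toyTask // victim tasks running on the node
}

// buggyCapacity mirrors the old accounting: idle is re-added for every task.
func buggyCapacity(n toyNode) int64 {
	total := int64(0)
	for _, t := range n.victims {
		total += t.gpu + n.idleGPU // idle counted once per victim task
	}
	return total
}

// fixedCapacity mirrors the patched accounting: idle is counted once.
func fixedCapacity(n toyNode) int64 {
	total := n.idleGPU
	for _, t := range n.victims {
		total += t.gpu
	}
	return total
}

func main() {
	// An 8-GPU node: two 2-GPU victim tasks running, 4 GPUs idle.
	n := toyNode{idleGPU: 4, victims: []toyTask{{gpu: 2}, {gpu: 2}}}
	fmt.Println("old accounting:", buggyCapacity(n)) // 12, exceeds the node
	fmt.Println("new accounting:", fixedCapacity(n)) // 8, matches the node
}

On an 8-GPU node with two 2-GPU victims and 4 idle GPUs, the old accounting reports 12 reclaimable GPUs while the patched accounting reports 8, which is why the patch tracks idleGPU and evictableGPU as separate per-node quantities.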