CORE-13 feat: Support RT-DETRv4 detection model (Comfy-Org#12748)

kijai · web-flow · commit a500f1edacfa · 2026-03-28T23:34:10.000-04:00
diff --git a/comfy/ldm/rt_detr/rtdetr_v4.py b/comfy/ldm/rt_detr/rtdetr_v4.py
diff --git a/comfy/model_base.py b/comfy/model_base.py
@@ -52,6 +52,7 @@
 import comfy.ldm.kandinsky5.model
 import comfy.ldm.anima.model
 import comfy.ldm.ace.ace_step15
+import comfy.ldm.rt_detr.rtdetr_v4
 
 import comfy.model_management
 import comfy.patcher_extension
@@ -1957,3 +1958,7 @@ def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
 
     def concat_cond(self, **kwargs):
         return None
+
+class RT_DETR_v4(BaseModel):  # BaseModel subclass whose network class is the RT-DETRv4 detector
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):  # same signature and FLOW default as the preceding class in this file
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.rt_detr.rtdetr_v4.RTv4)  # RTv4 is handed to BaseModel as unet_model
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
@@ -698,6 +698,12 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["audio_model"] = "ace1.5"
         return dit_config
 
+    if '{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix) in state_dict_keys: # RT-DETR_v4
+        dit_config = {}
+        dit_config["image_model"] = "RT_DETR_v4"
+        dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
+        return dit_config
+
     if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
         return None
 
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
@@ -1734,6 +1734,21 @@ def clip_target(self, state_dict={}):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.longcat_image.LongCatImageTokenizer, comfy.text_encoders.longcat_image.te(**hunyuan_detect))
 
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
+
+class RT_DETR_v4(supported_models_base.BASE):  # supported-model entry for RT-DETRv4 checkpoints
+    unet_config = {
+        "image_model": "RT_DETR_v4",  # the value detect_unet_config() sets for this architecture
+    }
+
+    supported_inference_dtypes = [torch.float16, torch.float32]
+
+    def get_model(self, state_dict, prefix="", device=None):  # state_dict and prefix are not used here
+        out = model_base.RT_DETR_v4(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):  # always returns None: no ClipTarget is declared for this model
+        return None
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4]
 
 models += [SVD_img2vid]
diff --git a/comfy_extras/nodes_rtdetr.py b/comfy_extras/nodes_rtdetr.py
@@ -0,0 +1,201 @@
+from typing_extensions import override
+
+import torch
+from comfy.ldm.rt_detr.rtdetr_v4 import COCO_CLASSES
+import comfy.model_management
+import comfy.utils
+from comfy_api.latest import ComfyExtension, io
+from torchvision.transforms import ToPILImage, ToTensor
+from PIL import ImageDraw, ImageFont
+
+
+class RTDETR_detect(io.ComfyNode):
+    """Run a loaded RT-DETR model on an image batch and output per-image bounding boxes."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node: model/image inputs, score threshold, class filter and detection cap."""
+        return io.Schema(
+            node_id="RTDETR_detect",
+            display_name="RT-DETR Detect",
+            category="detection/",
+            search_aliases=["bbox", "bounding box", "object detection", "coco"],
+            inputs=[
+                io.Model.Input("model", display_name="model"),
+                io.Image.Input("image", display_name="image"),
+                io.Float.Input("threshold", display_name="threshold", default=0.5),
+                io.Combo.Input("class_name", options=["all"] + COCO_CLASSES, default="all", tooltip="Filter detections by class. Set to 'all' to disable filtering."),
+                io.Int.Input("max_detections", display_name="max_detections", default=100, tooltip="Maximum number of detections to return per image. In order of descending confidence score."),
+            ],
+            outputs=[
+                io.BoundingBox.Output("bboxes")],
+        )
+
+    @classmethod
+    def execute(cls, model, image, threshold, class_name, max_detections) -> io.NodeOutput:
+        """Detect objects in every image of the batch.
+
+        Args:
+            model: Passed to ``load_model_gpu``; its ``model.diffusion_model`` is called as the detector.
+            image: Image tensor unpacked as (B, H, W, C).
+            threshold: Detections whose score is not strictly above this value are dropped.
+            class_name: ``"all"`` or one entry of ``COCO_CLASSES`` to keep only that class.
+            max_detections: Maximum number of detections kept per image (highest scores first).
+
+        Returns:
+            NodeOutput wrapping a list with one entry per detector result; each entry is a
+            list of dicts with keys ``x``, ``y``, ``width``, ``height``, ``label``, ``score``.
+        """
+        _, H, W, _ = image.shape
+
+        # The detector is always fed a 640x640 input; the original (W, H) is passed alongside.
+        image_in = comfy.utils.common_upscale(image.movedim(-1, 1), 640, 640, "bilinear", crop="disabled")
+
+        comfy.model_management.load_model_gpu(model)
+        results = model.model.diffusion_model(image_in, (W, H))  # list of B dicts
+
+        # A negative cap would make the slice below drop items from the end instead of limiting.
+        limit = max(int(max_detections), 0)
+
+        all_bbox_dicts = []
+        for det in results:
+            keep   = det['scores'] > threshold
+            boxes  = det['boxes'][keep].cpu()
+            labels = det['labels'][keep].cpu()
+            scores = det['scores'][keep].cpu()
+
+            bbox_dicts = []
+            for box, label, score in zip(boxes, labels, scores):
+                name = COCO_CLASSES[int(label)]
+                if class_name != "all" and name != class_name:
+                    continue
+                # Boxes are read as (x1, y1, x2, y2) and stored as origin + size.
+                bbox_dicts.append({
+                    "x": float(box[0]),
+                    "y": float(box[1]),
+                    "width": float(box[2] - box[0]),
+                    "height": float(box[3] - box[1]),
+                    "label": name,
+                    "score": float(score),
+                })
+            bbox_dicts.sort(key=lambda d: d["score"], reverse=True)
+            all_bbox_dicts.append(bbox_dicts[:limit])
+
+        return io.NodeOutput(all_bbox_dicts)
+
+
+class DrawBBoxes(io.ComfyNode):
+    """Draw bounding boxes (and COCO labels with scores) onto images or onto a black canvas."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node: optional image input, required bboxes input, one image output."""
+        return io.Schema(
+            node_id="DrawBBoxes",
+            display_name="Draw BBoxes",
+            category="detection/",
+            search_aliases=["bbox", "bounding box", "object detection", "rt_detr", "visualize detections", "coco"],
+            inputs=[
+                io.Image.Input("image", optional=True),
+                io.BoundingBox.Input("bboxes", force_input=True),
+            ],
+            outputs=[
+                io.Image.Output("out_image"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, bboxes, image=None) -> io.NodeOutput:
+        """Render ``bboxes`` onto ``image``.
+
+        Args:
+            bboxes: A single bbox dict, a flat list of dicts (applied to every image), or a
+                list holding one list of dicts per image.
+            image: Optional image batch indexed as (B, H, W, C). When omitted, a black canvas
+                sized to the largest box extent (640 per side if there are no boxes) is used,
+                with one frame per bbox list.
+
+        Returns:
+            NodeOutput wrapping the annotated image batch.
+        """
+        # Normalise to list[list[dict]].
+        if isinstance(bboxes, dict):
+            bboxes = [[bboxes]]
+        elif not isinstance(bboxes, list) or not bboxes:
+            bboxes = [[]]
+        elif isinstance(bboxes[0], dict):
+            bboxes = [bboxes]  # flat list → same detections for every image
+
+        # Batch size comes from the image when there is one, otherwise from the bboxes; it has to
+        # be taken after normalisation so a multi-frame bbox list is not cut down to one frame.
+        B = image.shape[0] if image is not None else len(bboxes)
+        if len(bboxes) == 1:
+            bboxes = bboxes * B
+        bboxes = (bboxes + [[]] * B)[:B]  # pad with empty frames / drop extras
+
+        if image is None:
+            max_w = max((int(d["x"] + d["width"])  for frame in bboxes for d in frame), default=640)
+            max_h = max((int(d["y"] + d["height"]) for frame in bboxes for d in frame), default=640)
+            # Clamp to 1 so degenerate boxes cannot produce a zero-sized canvas.
+            image = torch.zeros((B, max(max_h, 1), max(max_w, 1), 3), dtype=torch.float32)
+
+        all_out_images = []
+        for frame, detections in zip(image, bboxes):
+            img = ToPILImage()(frame.movedim(-1, 0))
+            if detections:
+                boxes  = torch.tensor([[d["x"], d["y"], d["x"] + d["width"], d["y"] + d["height"]] for d in detections])
+                labels = [d.get("label") if d.get("label") in COCO_CLASSES else None for d in detections]
+                scores = torch.tensor([d.get("score", 1.0) for d in detections])
+                img = cls.draw_detections(img, boxes, labels, scores)
+            all_out_images.append(ToTensor()(img).unsqueeze(0).movedim(1, -1))
+
+        out_images = torch.cat(all_out_images, dim=0).to(comfy.model_management.intermediate_device())
+        return io.NodeOutput(out_images)
+
+    @classmethod
+    def draw_detections(cls, img, boxes, labels, scores):
+        """Draw each box outline, plus ``"<label> <score>"`` text for known labels, onto ``img``.
+
+        Args:
+            img: PIL image; it is drawn on in place and returned.
+            boxes: Tensor whose rows are (x1, y1, x2, y2).
+            labels: Per-detection class name, or None for a box without a known label.
+            scores: Tensor with one score per detection.
+
+        Returns:
+            The same PIL image with the detections drawn on it.
+        """
+        draw = ImageDraw.Draw(img)
+        try:
+            font = ImageFont.truetype('arial.ttf', 16)
+        except Exception:
+            # Best effort: fall back to Pillow's built-in font when Arial cannot be loaded.
+            font = ImageFont.load_default()
+        colors = [(255,0,0),(0,200,0),(0,0,255),(255,165,0),(128,0,128),
+                (0,255,255),(255,20,147),(100,149,237)]
+        # Ascending score order, so higher-scoring detections are drawn last (on top).
+        for box, label, score in sorted(zip(boxes, labels, scores), key=lambda x: x[2].item()):
+            x1, y1, x2, y2 = box.tolist()
+            color_idx = COCO_CLASSES.index(label) if label is not None else 0
+            c = colors[color_idx % len(colors)]
+            draw.rectangle([x1, y1, x2, y2], outline=c, width=3)
+            if label is not None:
+                draw.text((x1 + 2, y1 + 2), f'{label} {score:.2f}', fill=c, font=font)
+        return img
+
+
+class RTDETRExtension(ComfyExtension):
+    """Extension that registers the RT-DETR detection and bbox drawing nodes."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        return [
+            RTDETR_detect,
+            DrawBBoxes,
+        ]
+
+
+async def comfy_entrypoint() -> RTDETRExtension:
+    """Entry point: build and return the extension instance."""
+    return RTDETRExtension()
diff --git a/comfy_extras/nodes_sdpose.py b/comfy_extras/nodes_sdpose.py
@@ -661,14 +661,15 @@ def define_schema(cls):
                 io.Int.Input("output_width",  default=512, min=64, max=4096, step=8, tooltip="Width each crop is resized to."),
                 io.Int.Input("output_height", default=512, min=64, max=4096, step=8, tooltip="Height each crop is resized to."),
                 io.Int.Input("padding", default=0, min=0, max=1024, step=1, tooltip="Extra padding in pixels added on each side of the bbox before cropping."),
+                io.Combo.Input("keep_aspect", options=["stretch", "pad"], default="stretch", tooltip="Whether to stretch the crop to fit the output size, or pad with black pixels to preserve aspect ratio."),
             ],
             outputs=[
                 io.Image.Output(tooltip="All crops stacked into a single image batch."),
             ],
         )
 
     @classmethod
-    def execute(cls, image, bboxes, output_width, output_height, padding) -> io.NodeOutput:
+    def execute(cls, image, bboxes, output_width, output_height, padding, keep_aspect="stretch") -> io.NodeOutput:
         total_frames = image.shape[0]
         img_h = image.shape[1]
         img_w = image.shape[2]
@@ -716,7 +717,19 @@ def execute(cls, image, bboxes, output_width, output_height, padding) -> io.Node
                 x1, y1, x2, y2 = fb_x1, fb_y1, fb_x2, fb_y2
 
             crop_chw = frame_chw[:, :, y1:y2, x1:x2]  # (1, C, crop_h, crop_w)
-            resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
+
+            if keep_aspect == "pad":
+                crop_h, crop_w = y2 - y1, x2 - x1  # NOTE(review): assumes a non-empty crop (both > 0) — confirm the guard earlier in this method
+                scale = min(output_width / crop_w, output_height / crop_h)  # largest uniform scale that still fits the output
+                scaled_w = max(1, int(round(crop_w * scale)))  # extreme aspect ratios would otherwise round to 0
+                scaled_h = max(1, int(round(crop_h * scale)))
+                scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
+                pad_left = (output_width  - scaled_w) // 2
+                pad_top  = (output_height - scaled_h) // 2
+                resized = torch.zeros(1, num_ch, output_height, output_width, dtype=image.dtype, device=image.device)  # black canvas
+                resized[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled  # centre the scaled crop
+            else:  # "stretch"
+                resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
             crops.append(resized)
 
         if not crops:
diff --git a/nodes.py b/nodes.py
@@ -2457,6 +2457,7 @@ async def init_builtin_extra_nodes():
         "nodes_number_convert.py",
         "nodes_painter.py",
         "nodes_curve.py",
+        "nodes_rtdetr.py",
     ]
 
     import_failed = []

Original file line number	Diff line number	Diff line change
`@@ -2457,6 +2457,7 @@ async def init_builtin_extra_nodes():`
`2457`	`2457`	`"nodes_number_convert.py",`
`2458`	`2458`	`"nodes_painter.py",`
`2459`	`2459`	`"nodes_curve.py",`
	`2460`	`+ "nodes_rtdetr.py"`
`2460`	`2461`	`]`
`2461`	`2462`
`2462`	`2463`	`import_failed = []`