From a927537833ae355209cae512dd1b124bc61a7b93 Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Mon, 30 Mar 2026 11:56:09 +0700 Subject: [PATCH 01/16] feat: update executable path --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 8a56203..ba5dc02 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [build_scripts] -executable=/usr/bin/env python3 +executable=/home/group11/final_project_ws/real_vision_venv/bin/python3 [develop] script_dir=$base/lib/vision [install] From 0721e7700a95d50f2be0ab8aee0455dd3b2f3aed Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Mon, 30 Mar 2026 11:57:12 +0700 Subject: [PATCH 02/16] feat: fix clip visualization --- vision/clip_classifier.py | 42 +++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/vision/clip_classifier.py b/vision/clip_classifier.py index 8bc0eb3..b0d654b 100755 --- a/vision/clip_classifier.py +++ b/vision/clip_classifier.py @@ -51,6 +51,7 @@ import cv2 import numpy as np import sys +import os import json from datetime import datetime from typing import List, Dict, Tuple, Optional @@ -277,8 +278,12 @@ def __init__(self, candidate_labels: List[str] = None): ) self.get_logger().warn("Subscribing to: /vision/sam_detections (placeholder Image). Build msgs for full integration.") + # Mitigate OpenCV Qt backend font path issues in virtual environments. + self._configure_opencv_qt_fonts() + # OpenCV window setup self.window_name = f"CLIP Classifier - {self.rgb_topic}" + cv2.startWindowThread() cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) cv2.resizeWindow(self.window_name, 800, 600) @@ -300,6 +305,29 @@ def __init__(self, candidate_labels: List[str] = None): self.get_logger().info("Service: /vision/find_multi_object") self.get_logger().info(f"Subscriber: /vision/sam_detections (auto-classify on SAM publish)") self.get_logger().info(f"OpenCV Window: '{self.window_name}'") + + def _configure_opencv_qt_fonts(self): + """Set a valid Qt font directory when OpenCV's bundled qt/fonts folder is missing.""" + if os.environ.get('QT_QPA_FONTDIR'): + return + + candidate_dirs = [ + '/usr/share/fonts/truetype/dejavu', + '/usr/share/fonts/truetype/freefont', + '/usr/share/fonts/truetype/liberation2', + '/usr/share/fonts', + ] + + for font_dir in candidate_dirs: + if os.path.isdir(font_dir): + os.environ['QT_QPA_FONTDIR'] = font_dir + self.get_logger().info(f"Set QT_QPA_FONTDIR to {font_dir}") + return + + self.get_logger().warn( + "Could not find a system font directory for QT_QPA_FONTDIR. " + "OpenCV Qt warnings may appear." + ) def _init_clip_model(self): """Initialize CLIP model""" @@ -1548,16 +1576,10 @@ def main(args=None): try: node = CLIPClassifier(candidate_labels=candidate_labels) - - # Use MultiThreadedExecutor for ReentrantCallbackGroup - executor = MultiThreadedExecutor() - executor.add_node(node) - - try: - executor.spin() - finally: - executor.shutdown() - node.destroy_node() + + # Keep OpenCV HighGUI operations on the main thread for stable rendering. + rclpy.spin(node) + node.destroy_node() except KeyboardInterrupt: pass finally: From 48c3d541b6528450426b6877d39f0db128918aa9 Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Mon, 30 Mar 2026 18:20:21 +0700 Subject: [PATCH 03/16] fix: obb not receiving SAM detections --- vision/clip_classifier.py | 42 +++-------- vision/obb_angle_service_node.py | 115 ++++++++++++++++++++++++------- 2 files changed, 101 insertions(+), 56 deletions(-) diff --git a/vision/clip_classifier.py b/vision/clip_classifier.py index b0d654b..8bc0eb3 100755 --- a/vision/clip_classifier.py +++ b/vision/clip_classifier.py @@ -51,7 +51,6 @@ import cv2 import numpy as np import sys -import os import json from datetime import datetime from typing import List, Dict, Tuple, Optional @@ -278,12 +277,8 @@ def __init__(self, candidate_labels: List[str] = None): ) self.get_logger().warn("Subscribing to: /vision/sam_detections (placeholder Image). Build msgs for full integration.") - # Mitigate OpenCV Qt backend font path issues in virtual environments. - self._configure_opencv_qt_fonts() - # OpenCV window setup self.window_name = f"CLIP Classifier - {self.rgb_topic}" - cv2.startWindowThread() cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) cv2.resizeWindow(self.window_name, 800, 600) @@ -305,29 +300,6 @@ def __init__(self, candidate_labels: List[str] = None): self.get_logger().info("Service: /vision/find_multi_object") self.get_logger().info(f"Subscriber: /vision/sam_detections (auto-classify on SAM publish)") self.get_logger().info(f"OpenCV Window: '{self.window_name}'") - - def _configure_opencv_qt_fonts(self): - """Set a valid Qt font directory when OpenCV's bundled qt/fonts folder is missing.""" - if os.environ.get('QT_QPA_FONTDIR'): - return - - candidate_dirs = [ - '/usr/share/fonts/truetype/dejavu', - '/usr/share/fonts/truetype/freefont', - '/usr/share/fonts/truetype/liberation2', - '/usr/share/fonts', - ] - - for font_dir in candidate_dirs: - if os.path.isdir(font_dir): - os.environ['QT_QPA_FONTDIR'] = font_dir - self.get_logger().info(f"Set QT_QPA_FONTDIR to {font_dir}") - return - - self.get_logger().warn( - "Could not find a system font directory for QT_QPA_FONTDIR. " - "OpenCV Qt warnings may appear." - ) def _init_clip_model(self): """Initialize CLIP model""" @@ -1576,10 +1548,16 @@ def main(args=None): try: node = CLIPClassifier(candidate_labels=candidate_labels) - - # Keep OpenCV HighGUI operations on the main thread for stable rendering. - rclpy.spin(node) - node.destroy_node() + + # Use MultiThreadedExecutor for ReentrantCallbackGroup + executor = MultiThreadedExecutor() + executor.add_node(node) + + try: + executor.spin() + finally: + executor.shutdown() + node.destroy_node() except KeyboardInterrupt: pass finally: diff --git a/vision/obb_angle_service_node.py b/vision/obb_angle_service_node.py index 9bdf7d6..f205cf5 100644 --- a/vision/obb_angle_service_node.py +++ b/vision/obb_angle_service_node.py @@ -37,6 +37,7 @@ from rclpy.node import Node from rclpy.callback_groups import ReentrantCallbackGroup from rclpy.executors import MultiThreadedExecutor +from rclpy.qos import QoSProfile, ReliabilityPolicy, DurabilityPolicy from custom_interfaces.srv import FindObjectAngleBB, FindObjectAngle, DetectObjects from custom_interfaces.msg import SAMDetections from sensor_msgs.msg import Image, CameraInfo @@ -44,6 +45,7 @@ import numpy as np import cv2 import time +import threading class OBBAngleServiceNode(Node): @@ -55,8 +57,10 @@ class OBBAngleServiceNode(Node): def __init__(self): super().__init__('obb_angle_service_node') - # Use reentrant callback group for nested service calls - self.callback_group = ReentrantCallbackGroup() + # Keep subscriptions and services in separate callback groups so + # detection updates can be processed while service callbacks are waiting. + self.subscription_callback_group = ReentrantCallbackGroup() + self.service_callback_group = ReentrantCallbackGroup() # Storage for latest detections and images self.latest_detections = None @@ -66,8 +70,13 @@ def __init__(self): self.bridge = CvBridge() # Thread lock for thread-safe access - import threading self.detections_lock = threading.Lock() + self.detections_condition = threading.Condition(self.detections_lock) + self.latest_detections_stamp_ns = 0 + + # Queue visualization work so service callbacks can return immediately. + self.viz_lock = threading.Lock() + self.pending_viz = None # OpenCV visualization window (unified for both single and multi-object) self.window_name = 'OBB Angle Detection' @@ -80,13 +89,21 @@ def __init__(self): self.rgb_topic = '/camera/color/image_raw' else: self.rgb_topic = '/camera/image_raw' + + # Best-effort QoS is generally more compatible with high-rate detector topics. + self.sam_qos = QoSProfile( + depth=10, + reliability=ReliabilityPolicy.BEST_EFFORT, + durability=DurabilityPolicy.VOLATILE, + ) # Subscribe to RGB camera for visualization self.rgb_subscription = self.create_subscription( Image, self.rgb_topic, self.rgb_callback, - 10 + 10, + callback_group=self.subscription_callback_group, ) # Subscribe to SAM detections for multi-object OBB @@ -94,7 +111,8 @@ def __init__(self): SAMDetections, '/vision/sam_detections', self.sam_detections_callback, - 10 + self.sam_qos, + callback_group=self.subscription_callback_group, ) # Subscribe to depth camera @@ -102,7 +120,8 @@ def __init__(self): Image, "/camera/depth/image_rect_raw", self.depth_callback, - 10 + 10, + callback_group=self.subscription_callback_group, ) # Subscribe to camera info @@ -110,14 +129,15 @@ def __init__(self): CameraInfo, "/camera/color/camera_info", self.info_callback, - 10 + 10, + callback_group=self.subscription_callback_group, ) # Service client for real-time detection self.detect_objects_client = self.create_client( DetectObjects, '/vision/detect_objects', - callback_group=self.callback_group + callback_group=self.service_callback_group ) # Create OBB angle service servers @@ -125,14 +145,14 @@ def __init__(self): FindObjectAngleBB, '/obb/find_object_angle_bb', self.find_object_angle_bb_callback, - callback_group=self.callback_group + callback_group=self.service_callback_group ) self.find_object_angle_srv = self.create_service( FindObjectAngle, '/obb/find_object_angle', self.find_object_angle_callback, - callback_group=self.callback_group + callback_group=self.service_callback_group ) # Create OpenCV window (unified) @@ -152,6 +172,7 @@ def __init__(self): self.get_logger().info(' - /obb/find_object_angle (All objects OBB)') self.get_logger().info('Subscriptions:') self.get_logger().info(' - /vision/sam_detections (SAMDetections)') + self.get_logger().info(' - /vision/sam_detections QoS: BEST_EFFORT, VOLATILE, depth=10') self.get_logger().info(f' - {self.rgb_topic} (Image)') self.get_logger().info(' - /camera/depth/image_rect_raw (Image)') self.get_logger().info(' - /camera/color/camera_info (CameraInfo)') @@ -177,9 +198,28 @@ def rgb_callback(self, msg: Image): def sam_detections_callback(self, msg: SAMDetections): """Store latest SAM detections""" - with self.detections_lock: + with self.detections_condition: self.latest_detections = msg + self.latest_detections_stamp_ns = time.monotonic_ns() + self.detections_condition.notify_all() + if not hasattr(self, '_sam_received'): + self._sam_received = True + self.get_logger().info(f'First SAM detections received: {msg.total_detections} objects') self.get_logger().debug(f'Received SAM detections: {msg.total_detections} objects') + + def wait_for_sam_detections(self, timeout_sec=1.5, min_stamp_ns=0): + """Wait until at least one SAM detection message is available.""" + deadline = time.monotonic() + timeout_sec + with self.detections_condition: + while True: + if self.latest_detections is not None and self.latest_detections_stamp_ns >= min_stamp_ns: + return True + + remaining = deadline - time.monotonic() + if remaining <= 0.0: + return False + + self.detections_condition.wait(timeout=min(0.05, remaining)) def depth_callback(self, msg: Image): """Store latest depth image""" @@ -194,9 +234,28 @@ def info_callback(self, msg: CameraInfo): def keep_window_alive(self): """Timer callback to keep OpenCV window responsive""" - # This just calls waitKey to process window events - # Without this, the window may freeze or not display properly - cv2.waitKey(1) + # Render queued visualization outside service callbacks to avoid + # blocking service responses on GUI operations. + pending = None + with self.viz_lock: + if self.pending_viz is not None: + pending = self.pending_viz + self.pending_viz = None + + if pending is not None: + results, mode = pending + try: + self.visualize_obb(results, mode=mode) + except Exception as e: + self.get_logger().warn(f'Visualization update failed: {e}') + else: + # Process window events even when no new frame is queued. + cv2.waitKey(1) + + def queue_visualization(self, results, mode="auto"): + """Queue the latest visualization payload for timer-based rendering.""" + with self.viz_lock: + self.pending_viz = (results, mode) def calculate_obb_from_bbox(self, x1, y1, x2, y2, mask=None): """ @@ -552,11 +611,15 @@ def find_object_angle_bb_callback(self, request, response): return response # Trigger real-time detection to get fresh detections + request_stamp_ns = time.monotonic_ns() self.get_logger().info('Triggering real-time detection...') detection_success = self.trigger_real_time_detection() - - # Small delay to ensure detections are updated - time.sleep(0.1) + min_stamp_ns = request_stamp_ns if detection_success else 0 + + # Wait for SAM callback instead of fixed sleep. + got_sam = self.wait_for_sam_detections(timeout_sec=1.5, min_stamp_ns=min_stamp_ns) + if not got_sam: + self.get_logger().warn('Timed out waiting for SAM detections update') # Get latest detections with self.detections_lock: @@ -649,9 +712,9 @@ def find_object_angle_bb_callback(self, request, response): self.get_logger().info(f'OBB Result ({best_detection.object_id}): center=({u:.1f},{v:.1f}), angle={angle_deg:.1f}deg, size={width:.0f}x{height:.0f}') self.get_logger().info('=' * 60) - # Visualize with unified function - pass the INPUT bbox from request for visualization + # Queue visualization; do not block service response path. viz_data = [(best_detection.object_id, u, v, theta_geom, width, height, [request.x1, request.y1, request.x2, request.y2])] - self.visualize_obb(viz_data, mode="single") + self.queue_visualization(viz_data, mode="single") except Exception as e: self.get_logger().error(f'Error in find_object_angle_bb: {e}') @@ -678,11 +741,15 @@ def find_object_angle_callback(self, request, response): try: # Trigger real-time detection + request_stamp_ns = time.monotonic_ns() self.get_logger().info('Triggering real-time detection for all objects...') detection_success = self.trigger_real_time_detection() - - # Small delay to ensure detections are updated - time.sleep(0.1) + min_stamp_ns = request_stamp_ns if detection_success else 0 + + # Wait for SAM callback instead of fixed sleep. + got_sam = self.wait_for_sam_detections(timeout_sec=1.5, min_stamp_ns=min_stamp_ns) + if not got_sam: + self.get_logger().warn('Timed out waiting for SAM detections update') # Get latest detections with self.detections_lock: @@ -791,7 +858,7 @@ def find_object_angle_callback(self, request, response): # Visualize all objects with unified function if len(viz_results) > 0: - self.visualize_obb(viz_results, mode="multi") + self.queue_visualization(viz_results, mode="multi") except Exception as e: self.get_logger().error(f'Error in find_object_angle: {e}') @@ -822,7 +889,7 @@ def main(args=None): node = OBBAngleServiceNode() # Use MultiThreadedExecutor for non-blocking service calls - executor = MultiThreadedExecutor() + executor = MultiThreadedExecutor(num_threads=4) executor.add_node(node) try: From a4d790f3e7c8545a37fe0d1bbc5f3efeac35c3bd Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Mon, 30 Mar 2026 18:48:35 +0700 Subject: [PATCH 04/16] fix: clip and obb visualization --- vision/clip_classifier.py | 11 +++++++---- vision/obb_angle_service_node.py | 10 +++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/vision/clip_classifier.py b/vision/clip_classifier.py index 8bc0eb3..897094e 100755 --- a/vision/clip_classifier.py +++ b/vision/clip_classifier.py @@ -282,8 +282,9 @@ def __init__(self, candidate_labels: List[str] = None): cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) cv2.resizeWindow(self.window_name, 800, 600) - # Timer for visualization (30 Hz) - self.viz_timer = self.create_timer(0.033, self.visualization_callback) + # GUI updates are pumped from the main thread in main() to avoid + # HighGUI freezes under multi-threaded executors. + self.viz_timer = None self.get_logger().info("CLIP Classifier Started") self.get_logger().info(f"Subscribing to: {self.rgb_topic}") @@ -1550,11 +1551,13 @@ def main(args=None): node = CLIPClassifier(candidate_labels=candidate_labels) # Use MultiThreadedExecutor for ReentrantCallbackGroup - executor = MultiThreadedExecutor() + executor = MultiThreadedExecutor(num_threads=4) executor.add_node(node) try: - executor.spin() + while rclpy.ok(): + executor.spin_once(timeout_sec=0.03) + node.visualization_callback() finally: executor.shutdown() node.destroy_node() diff --git a/vision/obb_angle_service_node.py b/vision/obb_angle_service_node.py index f205cf5..41b36cd 100644 --- a/vision/obb_angle_service_node.py +++ b/vision/obb_angle_service_node.py @@ -159,8 +159,9 @@ def __init__(self): cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) cv2.resizeWindow(self.window_name, 1200, 800) - # Create timer for continuous window update (keeps window responsive) - self.viz_timer = self.create_timer(0.1, self.keep_window_alive) + # GUI updates are pumped from the main thread in main() to avoid + # HighGUI freezes under multi-threaded executors. + self.viz_timer = None self.get_logger().info('=' * 80) self.get_logger().info('OBB Angle Service Node Started') @@ -894,10 +895,13 @@ def main(args=None): try: node.get_logger().info('OBB Angle Service Node spinning...') - executor.spin() + while rclpy.ok(): + executor.spin_once(timeout_sec=0.03) + node.keep_window_alive() except KeyboardInterrupt: node.get_logger().info('Shutting down OBB Angle Service Node...') finally: + executor.shutdown() cv2.destroyAllWindows() node.destroy_node() rclpy.shutdown() From 1bc315ada03ae44bc7cb0685db01c248c7b183be Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Fri, 3 Apr 2026 10:20:16 +0700 Subject: [PATCH 05/16] feat: update camera position --- vision/pixel_to_real_world.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vision/pixel_to_real_world.py b/vision/pixel_to_real_world.py index a0be114..10dab48 100644 --- a/vision/pixel_to_real_world.py +++ b/vision/pixel_to_real_world.py @@ -51,10 +51,12 @@ def __init__(self): # error = (x - x^, y - y^, z - z^) # t_base_cam_new = t_base_cam_old + error # This compensates for calibration errors in the camera position. + # Practically: put the silver tip of the tape measure where the center of the gripper is, read the distance to the center of the cube + # see what direction (- or +) the read is, add that # Benchmark: # LOW: (-0.0386, 0.5303, 0.5238) # HIGH: (-0.0361, 0.5303, 0.6458) - self.t_base_cam = np.array([-0.0361, 0.5303, 0.6458]) + self.t_base_cam = np.array([-0.146, 0.635, 0.8]) self.R_base_cam = np.array([ [1.0, 0.0, 0.0], From f698985792d76428021566bba9d7de0d084be8f0 Mon Sep 17 00:00:00 2001 From: Methasit-Pun Date: Fri, 3 Apr 2026 23:15:33 +0700 Subject: [PATCH 06/16] feat: improve dashboard logic --- dashboard/index.html | 1665 ++++++++++++++++---------- docs/COLLECT_AND_EXPORT.md | 264 ++++ setup.py | 2 + vision/benchmark_dashboard.py | 11 + vision_scripts/collect_and_export.py | 497 ++++++++ 5 files changed, 1838 insertions(+), 601 deletions(-) create mode 100644 docs/COLLECT_AND_EXPORT.md create mode 100644 vision_scripts/collect_and_export.py diff --git a/dashboard/index.html b/dashboard/index.html index f8d2cfa..1071733 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -3,745 +3,1208 @@ - Vision Benchmark Dashboard + Vision Pipeline Research Dashboard -
-
-

🎯 Vision Benchmark Dashboard

-

Real-time monitoring of ROS2 Vision Pipeline Services

-
- -
-
- Auto-refreshing every 2 seconds - + + + + + +
+ + +
+
+
Vision Pipeline Research Dashboard
+
Segment Anything Model · CLIP · GraspNet · Scene Understanding · ROS 2
+
+
+
+ Auto-refresh every 2 s +
+ +
+
-
-
-
0
-
Total Service Calls
+
+ + + +
+
+
0
+
Stored Runs
+
Rolling last 20
+
+
+
0
+
SAM Records
-
-
0
-
Pixel to Real
+
+
0
+
CLIP Records
-
-
0
-
SAM Detections
+
+
0
+
Grasp Records
-
-
0
-
CLIP Classifications
+
+
0
+
Scene Records
-
-
0
-
Grasp Detections
+
+
0
+
Pixel→Real
-
-
0
-
Scene Analysis
+
+
0
+
Total Svc Calls
+
This session
- -
-
-

📍 Pixel to Real Conversion

- 0 Records + + +
+
+
+
+
Vision Run Summary
+
vision_runs_history.json · newest first · max 20 runs
+
+ 0 Runs
-
- - - - - - - - - - - - - - +
+
+
+
Total Runs
+
+
+
+
Last Objects
+
+
+
+
Last Relations
+
+
+
+
Last Latency
+
+
+
+
Test IDTimestampInput U (px)Input V (px)Output X (m)Output Y (m)Output Z (m)
No data available
+ + + + + + + + + + + + + + + + + +
RunTimestampLatency (s)SAM DetectionsSAM Avg ConfCLIP FilteredScene ObjectsRelationsGraspableSAMCLIPScene
No run history yet — run collect_and_export.py
+
+
+ + + +
+
+
+
+
Detected Objects — Latest Run
+
Merged SAM · CLIP · GraspNet per-object data
+
+ 0 Objects +
+
+ + + + + + + + + + + + + + + + +
Object IDLabelBounding BoxSAM ConfidenceCLIP ConfidenceDistance (cm)IoUStableHas GraspGrasp Quality
No data yet
+
+
+ + +
+
+
+
+
Spatial Relations — Latest Run
+
Object-level spatial relation graph · Scene Understanding output
+
+ 0 Relations +
+
+ + + + + + + + + + +
SubjectRelationTargetConfidenceDistance 2DDescription
No data yet
- -
-
-

🎯 SAM Object Detection

- 0 Records + + +
+
+
+
+
SAM — Segment Anything Model
+
/vision/run_pipeline · /vision/detect_objects
+
+ 0 Records
-
-
-
0.00
-
Avg IoU
+
+
+
+
Avg Confidence
-
-
0%
-
Stability Rate
+
+
+
Avg IoU
-
-
0.00
-
Avg Confidence
+
+
+
Stability Rate
-
- - - - - - - - - - - - - +
+
Object IDBounding BoxCenter PointConfidenceIoU ScoreAP (IoU≥0.5)Distance (cm)Timestamp
+ + + + + + + + + + - +
Object IDBounding BoxCenter (u, v)ConfidenceIoU ScoreAP ≥ 0.5Distance (cm)Timestamp
No data available
No SAM data — start /vision/run_pipeline
- -
-
-

🏷️ CLIP Classification

- 0 Records + +
+
+
+
+
CLIP — Contrastive Language–Image Pre-training
+
/vision/classify_bbox_filtered · /vision/classify_all
+
+ 0 Records
-
-
-
N/A
-
Top-1 Accuracy
+
+
+
+
Top-1 Accuracy
-
-
0.00
-
Avg Confidence
+
+
+
Avg Confidence
-
- - - - - - - - - - - +
+
Test IDLabelConfidenceTop-1 AccuracyBounding BoxTimestamp
+ + + + + + + + - +
Test IDLabelConfidenceTop-1 AccuracyBounding BoxTimestamp
No data available
No CLIP data — start /vision/classify_bbox_filtered
- -
-
-

🤖 Grasp Detection

- 0 Records + +
+
+
+
+
GraspNet — 6-DoF Grasp Pose Estimation
+
/vision/detect_grasp · /vision/detect_grasp_bbox
+
+ 0 Records
-
-
-
0.00
-
Avg Quality Score
+
+
+
+
Avg Quality Score
-
-
0.00m
-
Avg Grasp Width
+
+
+
Avg Grasp Width
-
- - - - - - - - - - - - - +
+
Test IDObject IDPosition (u,v)Position (x,y,z)Quality ScoreGrasp WidthApproachTimestamp
+ + + + + + + + + + - +
Test IDObject IDPixel (u, v)World (x, y, z) mQuality ScoreWidth (m)ApproachTimestamp
No data available
No grasp data — start /vision/detect_grasp
- -
-
-

🌐 Scene Understanding

- 0 Records + +
+
+
+
+
Scene Understanding — Holistic Analysis
+
/vision/understand_scene · spatial reasoning + object relations
+
+ 0 Records
-
-
-
0
-
Total Objects
+
+
+
+
Total Objects
-
-
0
-
Relations
+
+
+
Relations
-
-
0%
-
Spatial Accuracy
+
+
+
Spatial Accuracy
-
-
0%
-
Adjacency Accuracy
+
+
+
Adjacency Accuracy
-
- - - - - - - - - - - - +
+
Scene IDObjectsRelationsDescriptionSpatial AccAdjacency AccTimestamp
+ + + + + + + + + - +
Scene IDObjectsRelationsScene DescriptionSpatial AccAdjacency AccTimestamp
No data available
No scene data — start /vision/understand_scene
+ + +
+
+
+
+
Pixel → Real World Conversion
+
/pixel_to_real · camera-to-world coordinate projection
+
+ 0 Records +
+
+ + + + + + + + + + + + + +
Test IDTimestampInput U (px)Input V (px)Output X (m)Output Y (m)Output Z (m)
No data — call /pixel_to_real
+
+
+ +
+ + +
+ + + const latest = runs[runs.length - 1]; + document.getElementById('rhLastObjects').textContent = latest.scene?.total_objects ?? '—'; + document.getElementById('rhLastRelations').textContent= latest.scene?.total_relations ?? '—'; + document.getElementById('rhLastLatency').textContent = + latest.meta?.latency_s != null ? latest.meta.latency_s.toFixed(2) + ' s' : '—'; + + // Run history table + tbody.innerHTML = runs.slice().reverse().map(run => { + const meta = run.meta || {}; + const sam = run.sam || {}; + const clip = run.clip || {}; + const scene = run.scene || {}; + const t = meta.timestamp ? new Date(meta.timestamp).toLocaleString() : '—'; + return ` + #${meta.run_no ?? '?'} + ${t} + ${meta.latency_s != null ? meta.latency_s.toFixed(2) : '—'} + ${sam.total_detections ?? '—'} + ${sam.avg_confidence != null ? confBar(sam.avg_confidence) : '—'} + ${clip.filtered_regions ?? '—'} + ${scene.total_objects ?? '—'} + ${scene.total_relations ?? '—'} + ${scene.graspable_objects ?? '—'} + ${chipSvc(sam.success)} + ${chipSvc(clip.success)} + ${chipSvc(scene.success)} + `; + }).join(''); + + // Latest objects + const objects = latest.objects || []; + document.getElementById('latestObjectsBadge').textContent = `${objects.length} Objects`; + setNav('nav-objects', objects.length); + objBody.innerHTML = objects.length ? objects.map(obj => ` + ${obj.object_id ?? '—'} + ${obj.label ?? 'unknown'} + (${obj.bbox_x1??0},${obj.bbox_y1??0})→(${obj.bbox_x2??0},${obj.bbox_y2??0}) + ${confBar(obj.sam_confidence)} + ${confBar(obj.clip_confidence)} + ${obj.distance_cm != null ? obj.distance_cm.toFixed(1) : '—'} + ${obj.iou_score != null ? obj.iou_score.toFixed(3) : '—'} + ${chipOk(obj.is_stable)} + ${chipOk(obj.has_grasp)} + ${obj.grasp ? confBar(obj.grasp.quality_score) : ''} + `).join('') : noObj; + + // Latest relations + const relations = latest.relations || []; + document.getElementById('latestRelationsBadge').textContent = `${relations.length} Relations`; + setNav('nav-relations', relations.length); + relBody.innerHTML = relations.length ? relations.map(rel => ` + ${rel.subject ?? ''} + ${rel.relation ?? ''} + ${rel.target_object ?? ''} + ${confBar(rel.confidence)} + ${rel.distance_2d != null ? rel.distance_2d.toFixed(1) : '—'} + ${rel.description ?? ''} + `).join('') : noRel; + } + + // ── Clear data (legacy) ─────────────────────────────────────────────────── + async function clearData() { + if (confirm('Clear all in-session benchmark data?')) { + alert('Run:\nros2 service call /benchmark/clear_data std_srvs/srv/Trigger'); + } + } + + // ── Polling ─────────────────────────────────────────────────────────────── + setInterval(fetchData, 2000); + setInterval(fetchRunHistory, 2000); + fetchData(); + fetchRunHistory(); + diff --git a/docs/COLLECT_AND_EXPORT.md b/docs/COLLECT_AND_EXPORT.md new file mode 100644 index 0000000..beb6287 --- /dev/null +++ b/docs/COLLECT_AND_EXPORT.md @@ -0,0 +1,264 @@ +# Vision Data Collector & Excel Exporter + +Runs the full SAM → CLIP → Scene Understanding pipeline **once**, stores the result in a rolling +JSON history (last 20 runs), and writes a ready-to-open Excel workbook. + +--- + +## Output Files + +| File | Location | Description | +|---|---|---| +| `vision_runs_history.json` | workspace root (next to `README.md`) | Rolling database of last 20 runs | +| `vision_runs_export.xlsx` | workspace root | Excel workbook, 4 sheets | + +### Excel Sheets + +| Sheet | One row per | Key columns | +|---|---|---| +| **Runs** | Each script invocation | timestamp, total objects, graspable, avg confidence, stability %, scene description, latency | +| **Objects** | Each detected object | CLIP label, bbox coords, SAM + CLIP confidence, distance cm, IoU score, grasp quality | +| **Relations** | Each spatial relation | subject → relation → target, confidence, 2D distance | +| **Grasps** | Each grasp pose | x/y/z position (m), quality score, gripper width (m) | + +--- + +## Prerequisites + +### 1. Install openpyxl (one-time) + +```bash +pip install openpyxl +``` + +### 2. ROS2 workspace must be built + +```bash +cd ~/ros2_ws # or wherever your workspace is +colcon build --packages-select vision +source install/setup.bash +``` + +--- + +## How to Run + +### Step 1 — Start the vision nodes (separate terminals) + +```bash +# Terminal 1: SAM detector (required) +ros2 run vision simple_sam_detector + +# Terminal 2: CLIP classifier (required for object labels) +ros2 run vision clip_classifier + +# Terminal 3: Scene understanding (required for relations + grasp data) +ros2 run vision scene_understanding + +# Terminal 4: GraspNet detector (optional — needed for grasp positions) +ros2 run vision graspnet_detector +``` + +Wait until you see the nodes announce their services in the terminal output before continuing. + +### Step 2 — Make sure a camera is publishing + +```bash +# Simulation (Gazebo): +ros2 launch ur_yt_sim spawn_ur5_camera_gripper_moveit.launch.py + +# Real RealSense camera: +ros2 launch realsense2_camera rs_launch.py + +# Static image (for testing without hardware): +ros2 launch vision vision_with_camera.launch.py camera_type:=file image_file:=path/to/image.jpg +``` + +### Step 3 — Run the collector + +```bash +# Option A: directly with python3 +python3 vision_scripts/collect_and_export.py + +# Option B: as a ROS2 entry point (after colcon build) +ros2 run vision collect_and_export +``` + +### Expected terminal output + +``` +============================================================ + Vision Pipeline Data Collector & Excel Exporter +============================================================ + History file : /path/to/vision/vision_runs_history.json + Excel file : /path/to/vision/vision_runs_export.xlsx + Max runs kept: 20 +============================================================ +Waiting for /vision/run_pipeline ... +RUN #1 — Step 1: SAM /vision/run_pipeline + SAM: 3 objects detected +RUN #1 — Step 2: CLIP /vision/classify_bbox_filtered + CLIP: 2 regions classified +RUN #1 — Step 3: Scene /vision/understand_scene + Scene: 3 objects, 4 relations +RUN #1 complete in 2.341s — 3 objects, 4 relations +============================================================ +[OK] History saved → .../vision_runs_history.json (1 runs stored) +[OK] Excel exported → .../vision_runs_export.xlsx + +Done. Open vision_runs_export.xlsx to view results. +``` + +--- + +## Run it multiple times + +Each time you run the script it **appends** one new entry to the JSON and **overwrites** the Excel. +After 20 runs the oldest entry is automatically dropped. + +```bash +# Run 5 times in a loop (bash) +for i in {1..5}; do + python3 vision_scripts/collect_and_export.py + sleep 2 +done +``` + +--- + +## Debugging + +### Problem: "service not available — skipping" + +The script will skip any service that does not respond within 5 seconds and +continue with the remaining services without crashing. + +**Fix:** Check that the required node is running: + +```bash +ros2 node list # should show /simple_sam_detector, /clip_classifier, etc. +ros2 service list # should show /vision/run_pipeline, /vision/classify_bbox_filtered, etc. +``` + +If a node is missing, start it (see Step 1 above). + +--- + +### Problem: No camera image / SAM returns 0 objects + +```bash +# Check camera topics are publishing +ros2 topic list | grep camera +ros2 topic hz /camera/image_raw # should show ~30 Hz + +# Check SAM subscriber got a frame +ros2 topic echo /vision/status # shows uptime and detection count +``` + +SAM only detects when a frame has been received. Wait a second after starting the camera before calling the collector. + +--- + +### Problem: CLIP returns "No classified regions" + +CLIP auto-classifies when it receives a message on `/vision/sam_detections`. +The collector calls `/vision/run_pipeline` first (which publishes to that topic), then +waits 500 ms before calling CLIP. If CLIP is slow on first run (model loading): + +```bash +# Call run_pipeline manually first to warm up CLIP +ros2 service call /vision/run_pipeline std_srvs/srv/Trigger + +# Wait a few seconds, then run the collector +python3 vision_scripts/collect_and_export.py +``` + +--- + +### Problem: Scene understanding times out + +Scene understanding calls `/vision/detect_objects` internally which can take a few +seconds. The collector allows **15 seconds** for it. If it still times out: + +```bash +# Test scene understanding manually +ros2 service call /vision/understand_scene std_srvs/srv/Trigger +``` + +If that also hangs, check that `simple_sam_detector` is running (scene understanding +calls it internally). + +--- + +### Problem: "openpyxl not installed" + +```bash +pip install openpyxl + +# Verify: +python3 -c "import openpyxl; print(openpyxl.__version__)" +``` + +The script will still save `vision_runs_history.json` even without openpyxl — only +the Excel export is skipped. + +--- + +### Problem: Excel file is open in Excel and the script fails to overwrite it + +Close the Excel file first, then re-run the collector. + +--- + +### Inspect the raw JSON history + +```bash +# Pretty-print the history file +python3 -c "import json; print(json.dumps(json.load(open('vision_runs_history.json')), indent=2))" + +# Show just the run summaries +python3 -c " +import json +runs = json.load(open('vision_runs_history.json')) +for r in runs: + m = r['meta']; s = r['sam']; sc = r['scene'] + print(f\"Run #{m['run_no']} | {m['timestamp']} | objects={s.get('total_detections','?')} | latency={m['latency_s']}s\") +" +``` + +--- + +### Manually clear the history + +```bash +rm vision_runs_history.json +``` + +The next run will start fresh from run #1. + +--- + +## File Structure + +``` +vision/ +├── vision_scripts/ +│ └── collect_and_export.py ← the collector script +├── vision_runs_history.json ← created on first run +├── vision_runs_export.xlsx ← created on first run +└── docs/ + └── COLLECT_AND_EXPORT.md ← this file +``` + +--- + +## Services Called (in order) + +| # | Service | Type | Purpose | +|---|---|---|---| +| 1 | `/vision/run_pipeline` | `std_srvs/Trigger` | SAM detects objects, publishes to topic | +| 2 | `/vision/classify_bbox_filtered` | `std_srvs/Trigger` | CLIP labels each region (conf > 0.5) | +| 3 | `/vision/understand_scene` | `std_srvs/Trigger` | Full scene: objects + relations + grasps | + +If a service is not running the step is skipped and that section of the run record will show `"success": false`. +The JSON and Excel are still saved with whatever data was collected. diff --git a/setup.py b/setup.py index f82099a..569de07 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ 'scipy>=1.11.0', 'scikit-learn>=1.3.0', 'pandas>=2.0.0', + 'openpyxl>=3.1.0', 'tqdm>=4.66.0', 'jsonschema>=4.19.0', ], @@ -58,6 +59,7 @@ tests_require=['pytest'], entry_points={ 'console_scripts': [ + 'collect_and_export = vision_scripts.collect_and_export:main', 'simple_sam_detector = vision.simple_sam_detector:main', 'clip_classifier = vision.clip_classifier:main', 'sam_clip_pipeline = vision.sam_clip_pipeline:main', diff --git a/vision/benchmark_dashboard.py b/vision/benchmark_dashboard.py index 3bc0459..552365c 100644 --- a/vision/benchmark_dashboard.py +++ b/vision/benchmark_dashboard.py @@ -356,6 +356,17 @@ def do_GET(self): self.end_headers() data_json = json.dumps(self.dashboard_node.data) self.wfile.write(data_json.encode()) + elif self.path == '/api/run-history': + # Serve vision_runs_history.json from workspace root + history_file = package_path / 'vision_runs_history.json' + self.send_response(200) + self.send_header('Content-type', 'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + if history_file.exists(): + self.wfile.write(history_file.read_bytes()) + else: + self.wfile.write(b'[]') else: # Serve static files super().do_GET() diff --git a/vision_scripts/collect_and_export.py b/vision_scripts/collect_and_export.py new file mode 100644 index 0000000..b61ae99 --- /dev/null +++ b/vision_scripts/collect_and_export.py @@ -0,0 +1,497 @@ +#!/usr/bin/env python3 +""" +Vision Pipeline Data Collector & Excel Exporter + +Calls all 4 vision services in sequence, stores the last 20 runs in a JSON +history file, and exports every run to an Excel workbook (4 sheets). + +Services called (in order): + 1. /vision/run_pipeline (std_srvs/Trigger) -> SAM detects + publishes + 2. /vision/classify_bbox_filtered (std_srvs/Trigger) -> CLIP labels per region + 3. /vision/understand_scene (std_srvs/Trigger) -> objects + relations + grasps + +Output files (written to workspace root next to README.md): + vision_runs_history.json — rolling last-20-runs database + vision_runs_export.xlsx — Excel workbook, 4 sheets: + • Runs — one row per pipeline run + • Objects — one row per detected object + • Relations — one row per spatial relation + • Grasps — one row per grasp pose + +Usage: + # Make sure the vision nodes are already running, then: + python3 vision_scripts/collect_and_export.py + + # Or as a ROS2 entry point (after colcon build): + ros2 run vision collect_and_export +""" + +import rclpy +from rclpy.node import Node +from std_srvs.srv import Trigger +import json +import time +import os +import sys +from datetime import datetime +from pathlib import Path + +# --------------------------------------------------------------------------- +# Output file paths (workspace root = parent of this script's package dir) +# --------------------------------------------------------------------------- +_SCRIPT_DIR = Path(__file__).resolve().parent +_WORKSPACE_ROOT = _SCRIPT_DIR.parent +HISTORY_FILE = _WORKSPACE_ROOT / "vision_runs_history.json" +EXCEL_FILE = _WORKSPACE_ROOT / "vision_runs_export.xlsx" +MAX_HISTORY = 20 + + +# --------------------------------------------------------------------------- +# Helper: load / save history +# --------------------------------------------------------------------------- + +def _load_history(): + if HISTORY_FILE.exists(): + try: + with open(HISTORY_FILE, "r") as f: + data = json.load(f) + if isinstance(data, list): + return data + except Exception: + pass + return [] + + +def _save_history(runs): + runs = runs[-MAX_HISTORY:] # keep last 20 only + with open(HISTORY_FILE, "w") as f: + json.dump(runs, f, indent=2) + + +# --------------------------------------------------------------------------- +# Helper: export to Excel using openpyxl +# --------------------------------------------------------------------------- + +def _export_excel(runs): + """Write runs list → Excel with 4 sheets.""" + try: + from openpyxl import Workbook + from openpyxl.styles import PatternFill, Font, Alignment + from openpyxl.utils import get_column_letter + except ImportError: + print("[WARN] openpyxl not installed. Run: pip install openpyxl") + print("[WARN] Skipping Excel export. JSON history saved successfully.") + return + + wb = Workbook() + + # ---- colour scheme ---- + header_fill = PatternFill("solid", fgColor="4F81BD") + header_font = Font(bold=True, color="FFFFFF") + alt_fill = PatternFill("solid", fgColor="DCE6F1") + + def _write_sheet(ws, headers, rows, colour_col=None): + """Write header row then data rows with alternating row colour.""" + ws.append(headers) + for cell in ws[1]: + cell.fill = header_fill + cell.font = header_font + cell.alignment = Alignment(horizontal="center") + for row_idx, row in enumerate(rows, start=2): + ws.append(row) + if row_idx % 2 == 0: + for cell in ws[row_idx]: + cell.fill = alt_fill + # Auto-width + for col_idx, header in enumerate(headers, start=1): + col_letter = get_column_letter(col_idx) + max_len = len(str(header)) + for row in ws.iter_rows(min_col=col_idx, max_col=col_idx): + for cell in row: + try: + max_len = max(max_len, len(str(cell.value or ""))) + except Exception: + pass + ws.column_dimensions[col_letter].width = min(max_len + 4, 40) + + # ---- Sheet 1: Runs ---- + ws_runs = wb.active + ws_runs.title = "Runs" + runs_headers = [ + "Run No", "Timestamp", "SAM Total Objects", "CLIP Filtered Regions", + "Graspable Objects", "Total Relations", "Avg SAM Confidence", + "Avg IoU", "Stability Rate (%)", "Scene Description", + "SAM Success", "CLIP Success", "Scene Success", "Latency (s)" + ] + runs_rows = [] + for run in runs: + meta = run.get("meta", {}) + sam = run.get("sam", {}) + clip = run.get("clip", {}) + scene = run.get("scene", {}) + runs_rows.append([ + meta.get("run_no", ""), + meta.get("timestamp", ""), + sam.get("total_detections", ""), + clip.get("filtered_regions", ""), + scene.get("graspable_objects", ""), + scene.get("total_relations", ""), + sam.get("avg_confidence", ""), + sam.get("average_iou", ""), + round(sam.get("stability_rate", 0) * 100, 1), + scene.get("scene_description", ""), + sam.get("success", ""), + clip.get("success", ""), + scene.get("success", ""), + meta.get("latency_s", ""), + ]) + _write_sheet(ws_runs, runs_headers, runs_rows) + + # ---- Sheet 2: Objects ---- + ws_obj = wb.create_sheet("Objects") + obj_headers = [ + "Run No", "Timestamp", "Object ID", "CLIP Label", + "BBox X1", "BBox Y1", "BBox X2", "BBox Y2", + "SAM Confidence", "CLIP Confidence", + "Distance (cm)", "IoU Score", "Is Stable", + "Has Grasp", "Grasp Quality", "Grasp Width (m)" + ] + obj_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + for obj in run.get("objects", []): + grasp = obj.get("grasp", {}) + obj_rows.append([ + run_no, ts, + obj.get("object_id", ""), + obj.get("label", ""), + obj.get("bbox_x1", ""), obj.get("bbox_y1", ""), + obj.get("bbox_x2", ""), obj.get("bbox_y2", ""), + obj.get("sam_confidence", ""), + obj.get("clip_confidence", ""), + obj.get("distance_cm", ""), + obj.get("iou_score", ""), + obj.get("is_stable", ""), + obj.get("has_grasp", ""), + grasp.get("quality_score", ""), + grasp.get("width_m", ""), + ]) + _write_sheet(ws_obj, obj_headers, obj_rows) + + # ---- Sheet 3: Relations ---- + ws_rel = wb.create_sheet("Relations") + rel_headers = [ + "Run No", "Timestamp", "Scene ID", + "Subject", "Relation", "Target Object", + "Confidence", "Distance 2D", "Description" + ] + rel_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + scene = run.get("scene", {}) + scene_id = scene.get("scene_id", "") + for rel in run.get("relations", []): + rel_rows.append([ + run_no, ts, scene_id, + rel.get("subject", ""), + rel.get("relation", ""), + rel.get("target_object", ""), + rel.get("confidence", ""), + rel.get("distance_2d", ""), + rel.get("description", ""), + ]) + _write_sheet(ws_rel, rel_headers, rel_rows) + + # ---- Sheet 4: Grasps ---- + ws_grasp = wb.create_sheet("Grasps") + grasp_headers = [ + "Run No", "Timestamp", + "Object ID", "Pos X (m)", "Pos Y (m)", "Pos Z (m)", + "Orient X", "Orient Y", "Orient Z", "Orient W", + "Quality Score", "Grasp Width (m)", "Approach Direction" + ] + grasp_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + for g in run.get("grasps", []): + grasp_rows.append([ + run_no, ts, + g.get("object_id", ""), + g.get("pos_x", ""), g.get("pos_y", ""), g.get("pos_z", ""), + g.get("orient_x", ""), g.get("orient_y", ""), + g.get("orient_z", ""), g.get("orient_w", ""), + g.get("quality_score", ""), + g.get("width_m", ""), + g.get("approach_direction", ""), + ]) + _write_sheet(ws_grasp, grasp_headers, grasp_rows) + + wb.save(str(EXCEL_FILE)) + print(f"[OK] Excel exported → {EXCEL_FILE}") + + +# --------------------------------------------------------------------------- +# ROS2 collector node +# --------------------------------------------------------------------------- + +class VisionDataCollector(Node): + + def __init__(self): + super().__init__("vision_data_collector") + + self._sam_client = self.create_client(Trigger, "/vision/run_pipeline") + self._clip_client = self.create_client(Trigger, "/vision/classify_bbox_filtered") + self._scene_client = self.create_client(Trigger, "/vision/understand_scene") + + # ------------------------------------------------------------------ + # Low-level call helper + # ------------------------------------------------------------------ + + def _call(self, client, service_name, timeout=10.0): + """Wait for service, call it, return (success, message_str).""" + self.get_logger().info(f"Waiting for {service_name} ...") + if not client.wait_for_service(timeout_sec=5.0): + self.get_logger().warn(f"{service_name} not available — skipping") + return False, None + + future = client.call_async(Trigger.Request()) + start = time.time() + while not future.done(): + rclpy.spin_once(self, timeout_sec=0.05) + if time.time() - start > timeout: + self.get_logger().error(f"{service_name} call timed out") + return False, None + + result = future.result() + if result is None: + return False, None + return result.success, result.message + + # ------------------------------------------------------------------ + # Collect one run + # ------------------------------------------------------------------ + + def collect_run(self, run_no): + """Call all services and return a unified run dict.""" + ts = datetime.utcnow().isoformat() + "Z" + t0 = time.perf_counter() + + run = { + "meta": {"run_no": run_no, "timestamp": ts}, + "sam": {"success": False}, + "clip": {"success": False}, + "scene": {"success": False}, + "objects": [], + "relations": [], + "grasps": [], + } + + # ---- 1. SAM ---- + self.get_logger().info("=" * 60) + self.get_logger().info(f"RUN #{run_no} — Step 1: SAM /vision/run_pipeline") + sam_ok, sam_msg = self._call(self._sam_client, "/vision/run_pipeline") + if sam_ok and sam_msg: + try: + sam_data = json.loads(sam_msg) + summary = sam_data.get("summary", {}) + metrics = sam_data.get("metrics", {}) + coco = metrics.get("coco_ap_style", {}) + circ = metrics.get("circularity_confidence", {}) + + run["sam"] = { + "success": True, + "total_detections": summary.get("total_detections", 0), + "avg_confidence": circ.get("average_confidence", 0.0), + "average_iou": coco.get("average_iou", 0.0), + "stability_rate": coco.get("stability_rate", 0.0), + } + + # Parse per-object data from SAM response + for frame in sam_data.get("detections", []): + for det in frame.get("detections", []): + bbox = det.get("bbox", [0, 0, 0, 0]) + run["objects"].append({ + "object_id": det.get("class_name", "object"), + "label": "", # filled by CLIP below + "bbox_x1": bbox[0] if len(bbox) > 0 else "", + "bbox_y1": bbox[1] if len(bbox) > 1 else "", + "bbox_x2": bbox[2] if len(bbox) > 2 else "", + "bbox_y2": bbox[3] if len(bbox) > 3 else "", + "sam_confidence": det.get("confidence", ""), + "clip_confidence": "", + "distance_cm": det.get("distance_cm", ""), + "iou_score": det.get("iou_with_previous", ""), + "is_stable": det.get("is_stable_detection", ""), + "has_grasp": False, + "grasp": {}, + }) + self.get_logger().info(f" SAM: {run['sam']['total_detections']} objects detected") + except Exception as e: + self.get_logger().error(f" SAM parse error: {e}") + + # Wait 500 ms for CLIP to auto-process the published SAM topic + time.sleep(0.5) + + # ---- 2. CLIP ---- + self.get_logger().info(f"RUN #{run_no} — Step 2: CLIP /vision/classify_bbox_filtered") + clip_ok, clip_msg = self._call(self._clip_client, "/vision/classify_bbox_filtered") + if clip_ok and clip_msg: + try: + clip_data = json.loads(clip_msg) + run["clip"] = { + "success": True, + "total_sam_regions": clip_data.get("total_sam_regions", 0), + "filtered_regions": clip_data.get("filtered_regions", 0), + } + # Merge CLIP labels into objects by region_id index + clip_regions = {r["region_id"]: r for r in clip_data.get("regions", [])} + for idx, obj in enumerate(run["objects"]): + region = clip_regions.get(idx) + if region: + obj["label"] = region.get("label", "") + obj["clip_confidence"] = region.get("confidence", "") + self.get_logger().info(f" CLIP: {run['clip']['filtered_regions']} regions classified") + except Exception as e: + self.get_logger().error(f" CLIP parse error: {e}") + + # ---- 3. Scene Understanding ---- + self.get_logger().info(f"RUN #{run_no} — Step 3: Scene /vision/understand_scene") + scene_ok, scene_msg = self._call(self._scene_client, "/vision/understand_scene", timeout=15.0) + if scene_ok and scene_msg: + try: + scene_data = json.loads(scene_msg) + run["scene"] = { + "success": True, + "scene_id": scene_data.get("scene_id", ""), + "total_objects": scene_data.get("total_objects", 0), + "total_relations": scene_data.get("total_relations", 0), + "graspable_objects": scene_data.get("graspable_objects", 0), + "scene_description": scene_data.get("scene_description", ""), + } + + # Per-object details (enriches existing objects list) + scene_objects = scene_data.get("objects", {}) + for obj_id, obj_info in scene_objects.items(): + # Try to match to an existing object entry by bounding box proximity + # or append a new entry if scene understanding found it + matched = False + s_bbox = obj_info.get("bbox", []) + for obj in run["objects"]: + if (len(s_bbox) >= 4 and + obj.get("bbox_x1") == s_bbox[0] and + obj.get("bbox_y1") == s_bbox[1]): + # Update with scene info + obj["label"] = obj.get("label") or obj_info.get("label", "") + obj["has_grasp"] = obj_info.get("has_grasp", False) + if obj_info.get("has_grasp") and obj_info.get("grasp_quality") is not None: + obj["grasp"] = { + "quality_score": obj_info.get("grasp_quality"), + "width_m": "", # not available at this level + } + matched = True + break + + if not matched: + s_bbox_safe = s_bbox if len(s_bbox) >= 4 else ["", "", "", ""] + run["objects"].append({ + "object_id": obj_id, + "label": obj_info.get("label", ""), + "bbox_x1": s_bbox_safe[0], + "bbox_y1": s_bbox_safe[1], + "bbox_x2": s_bbox_safe[2], + "bbox_y2": s_bbox_safe[3], + "sam_confidence": "", + "clip_confidence": obj_info.get("confidence", ""), + "distance_cm": obj_info.get("distance_cm", ""), + "iou_score": "", + "is_stable": "", + "has_grasp": obj_info.get("has_grasp", False), + "grasp": { + "quality_score": obj_info.get("grasp_quality", ""), + "width_m": "", + }, + }) + + # Relations for this object + for rel in obj_info.get("relations", []): + run["relations"].append({ + "subject": obj_id, + "relation": rel.get("relation", ""), + "target_object": rel.get("target_object", ""), + "confidence": rel.get("confidence", ""), + "distance_2d": rel.get("distance_2d", ""), + "description": rel.get("description", ""), + }) + + self.get_logger().info( + f" Scene: {run['scene']['total_objects']} objects, " + f"{run['scene']['total_relations']} relations" + ) + except Exception as e: + self.get_logger().error(f" Scene parse error: {e}") + + # ---- Finalize ---- + run["meta"]["latency_s"] = round(time.perf_counter() - t0, 3) + self.get_logger().info( + f"RUN #{run_no} complete in {run['meta']['latency_s']}s — " + f"{len(run['objects'])} objects, {len(run['relations'])} relations" + ) + self.get_logger().info("=" * 60) + return run + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main(args=None): + rclpy.init(args=args) + + node = VisionDataCollector() + + print() + print("=" * 60) + print(" Vision Pipeline Data Collector & Excel Exporter") + print("=" * 60) + print(f" History file : {HISTORY_FILE}") + print(f" Excel file : {EXCEL_FILE}") + print(f" Max runs kept: {MAX_HISTORY}") + print("=" * 60) + + try: + # Determine the next run number from existing history + history = _load_history() + last_run_no = history[-1]["meta"]["run_no"] if history else 0 + run_no = last_run_no + 1 + + # Collect the run + run = node.collect_run(run_no) + + # Append to history and persist + history.append(run) + _save_history(history) + print(f"[OK] History saved → {HISTORY_FILE} ({len(history[-MAX_HISTORY:])} runs stored)") + + # Export all stored runs to Excel + _export_excel(history[-MAX_HISTORY:]) + + print() + print("Done. Open vision_runs_export.xlsx to view results.") + print() + + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + if rclpy.ok(): + rclpy.shutdown() + + +if __name__ == "__main__": + main() From 0177a111bbdd83641b69b0c0fd1d4c7e48da4f89 Mon Sep 17 00:00:00 2001 From: Methasit-Pun Date: Sat, 4 Apr 2026 00:07:59 +0700 Subject: [PATCH 07/16] feat: add mini dashboard --- dashboard/index.html | 221 +++++- dashboard/objects.html | 1049 ++++++++++++++++++++++++++ vision_scripts/collect_and_export.py | 162 +++- 3 files changed, 1400 insertions(+), 32 deletions(-) create mode 100644 dashboard/objects.html diff --git a/dashboard/index.html b/dashboard/index.html index 1071733..a3bd85d 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -500,6 +500,17 @@ Pixel → Real 0
+ + +
Analysis
+ @@ -550,6 +561,11 @@
0
Pixel→Real
+
+
0
+
OBB Objects
+
Latest run
+
0
Total Svc Calls
@@ -583,7 +599,23 @@
-
Last Latency
+
Total Latency
+
+
+
+
SAM Latency
+
+
+
+
CLIP Latency
+
+
+
+
Scene Latency
+
+
+
+
OBB Latency
@@ -591,19 +623,25 @@ Run Timestamp - Latency (s) - SAM Detections + Total (s) + SAM (s) + CLIP (s) + Scene (s) + OBB (s) + SAM Objects SAM Avg Conf CLIP Filtered Scene Objects Relations Graspable + OBB Objects SAM CLIP Scene + OBB - No run history yet — run collect_and_export.py + No run history yet — run collect_and_export.py
@@ -633,9 +671,12 @@ Stable Has Grasp Grasp Quality + OBB Angle (°) + OBB Width (px) + OBB Height (px) - No data yet + No data yet
@@ -668,6 +709,59 @@
+ +
+
+
+
+
OBB — Oriented Bounding Box Angle Benchmark
+
/obb/find_object_angle · angle parallel to width · 0° = vertical · range −90° to +90°
+
+ 0 Objects +
+
+
+
+
Objects (Latest)
+
+
+
+
Avg Angle (°)
+
+
+
+
Min Angle (°)
+
+
+
+
Max Angle (°)
+
+
+
+
OBB Latency (s)
+
+
+
+ + + + + + + + + + + + + + + + +
Object IDLabelOBB Angle (°)OBB Theta (rad)OBB Width (px)OBB Height (px)Center (u, v)Bounding BoxSAM ConfidenceDistance (cm)
No OBB data — ensure obb_angle_service_node is running
+
+
+
@@ -1138,29 +1232,45 @@ document.getElementById('rhLastRelations').textContent= latest.scene?.total_relations ?? '—'; document.getElementById('rhLastLatency').textContent = latest.meta?.latency_s != null ? latest.meta.latency_s.toFixed(2) + ' s' : '—'; - - // Run history table + document.getElementById('rhSamLatency').textContent = + latest.sam?.latency_s != null ? latest.sam.latency_s.toFixed(2) + ' s' : '—'; + document.getElementById('rhClipLatency').textContent = + latest.clip?.latency_s != null ? latest.clip.latency_s.toFixed(2) + ' s' : '—'; + document.getElementById('rhSceneLatency').textContent = + latest.scene?.latency_s!= null ? latest.scene.latency_s.toFixed(2)+ ' s' : '—'; + document.getElementById('rhObbLatency').textContent = + latest.obb?.latency_s != null ? latest.obb.latency_s.toFixed(2) + ' s' : '—'; + + // Run history table — update to include latency per service and OBB tbody.innerHTML = runs.slice().reverse().map(run => { - const meta = run.meta || {}; - const sam = run.sam || {}; - const clip = run.clip || {}; - const scene = run.scene || {}; - const t = meta.timestamp ? new Date(meta.timestamp).toLocaleString() : '—'; - return ` - #${meta.run_no ?? '?'} - ${t} - ${meta.latency_s != null ? meta.latency_s.toFixed(2) : '—'} - ${sam.total_detections ?? '—'} - ${sam.avg_confidence != null ? confBar(sam.avg_confidence) : '—'} - ${clip.filtered_regions ?? '—'} - ${scene.total_objects ?? '—'} - ${scene.total_relations ?? '—'} - ${scene.graspable_objects ?? '—'} - ${chipSvc(sam.success)} - ${chipSvc(clip.success)} - ${chipSvc(scene.success)} - `; - }).join(''); + const meta = run.meta || {}; + const sam = run.sam || {}; + const clip = run.clip || {}; + const scene = run.scene || {}; + const obb = run.obb || {}; + const t = meta.timestamp ? new Date(meta.timestamp).toLocaleString() : '—'; + const obbObjects = (run.objects || []).filter(o => o.obb_angle_deg !== '' && o.obb_angle_deg != null).length; + return ` + #${meta.run_no ?? '?'} + ${t} + ${meta.latency_s != null ? meta.latency_s.toFixed(2) : '—'} + ${sam.latency_s != null ? sam.latency_s.toFixed(2) : '—'} + ${clip.latency_s != null ? clip.latency_s.toFixed(2) : '—'} + ${scene.latency_s != null ? scene.latency_s.toFixed(2) : '—'} + ${obb.latency_s != null ? obb.latency_s.toFixed(2) : '—'} + ${sam.total_detections ?? '—'} + ${sam.avg_confidence != null ? confBar(sam.avg_confidence) : '—'} + ${clip.filtered_regions ?? '—'} + ${scene.total_objects ?? '—'} + ${scene.total_relations ?? '—'} + ${scene.graspable_objects ?? '—'} + ${obbObjects} + ${chipSvc(sam.success)} + ${chipSvc(clip.success)} + ${chipSvc(scene.success)} + ${chipSvc(obb.success)} + `; + }).join(''); // Latest objects const objects = latest.objects || []; @@ -1177,6 +1287,9 @@ ${chipOk(obj.is_stable)} ${chipOk(obj.has_grasp)} ${obj.grasp ? confBar(obj.grasp.quality_score) : ''} + ${obj.obb_angle_deg != null && obj.obb_angle_deg !== '' ? obj.obb_angle_deg.toFixed(1) + '°' : ''} + ${obj.obb_width_px != null && obj.obb_width_px !== '' ? Math.round(obj.obb_width_px) : '—'} + ${obj.obb_height_px != null && obj.obb_height_px !== '' ? Math.round(obj.obb_height_px) : '—'} `).join('') : noObj; // Latest relations @@ -1191,6 +1304,60 @@ ${rel.distance_2d != null ? rel.distance_2d.toFixed(1) : '—'} ${rel.description ?? ''} `).join('') : noRel; + + // OBB section from latest run + updateOBBSection(latest); + } + + function updateOBBSection(latestRun) { + const obbMeta = latestRun.obb || {}; + const objects = (latestRun.objects || []).filter(o => o.obb_angle_deg != null && o.obb_angle_deg !== ''); + const tbody = document.getElementById('obbBody'); + + document.getElementById('obbBadge').textContent = `${objects.length} Objects`; + document.getElementById('obbTotalObjects').textContent = objects.length; + document.getElementById('obbLatency').textContent = + obbMeta.latency_s != null ? obbMeta.latency_s.toFixed(2) + ' s' : '—'; + setNav('nav-obb', objects.length); + document.getElementById('obbCount').textContent = objects.length; + + if (!objects.length) { + document.getElementById('obbAvgAngle').textContent = '—'; + document.getElementById('obbMinAngle').textContent = '—'; + document.getElementById('obbMaxAngle').textContent = '—'; + tbody.innerHTML = 'No OBB data — ensure obb_angle_service_node is running'; + return; + } + + const angles = objects.map(o => o.obb_angle_deg); + const avg = angles.reduce((s, v) => s + v, 0) / angles.length; + document.getElementById('obbAvgAngle').textContent = avg.toFixed(1) + '°'; + document.getElementById('obbMinAngle').textContent = Math.min(...angles).toFixed(1) + '°'; + document.getElementById('obbMaxAngle').textContent = Math.max(...angles).toFixed(1) + '°'; + + tbody.innerHTML = objects.map(obj => { + const angleDeg = obj.obb_angle_deg; + // Angle bar: map -90..+90 to 0..100% + const pct = Math.round(((angleDeg + 90) / 180) * 100); + const barColor = Math.abs(angleDeg) < 20 ? 'var(--ok)' : Math.abs(angleDeg) < 60 ? 'var(--warn)' : 'var(--fail)'; + return ` + ${obj.object_id ?? '—'} + ${obj.label ?? 'unknown'} + +
+
+ ${angleDeg.toFixed(1)}° +
+ + ${obj.obb_theta_rad != null ? Number(obj.obb_theta_rad).toFixed(4) : '—'} + ${obj.obb_width_px != null ? Math.round(obj.obb_width_px) : '—'} + ${obj.obb_height_px != null ? Math.round(obj.obb_height_px) : '—'} + ${obj.obb_center_u != null ? Math.round(obj.obb_center_u) : '—'}, ${obj.obb_center_v != null ? Math.round(obj.obb_center_v) : '—'} + (${obj.bbox_x1??0},${obj.bbox_y1??0})→(${obj.bbox_x2??0},${obj.bbox_y2??0}) + ${confBar(obj.sam_confidence)} + ${obj.distance_cm != null ? obj.distance_cm.toFixed(1) : '—'} + `; + }).join(''); } // ── Clear data (legacy) ─────────────────────────────────────────────────── diff --git a/dashboard/objects.html b/dashboard/objects.html new file mode 100644 index 0000000..7af533a --- /dev/null +++ b/dashboard/objects.html @@ -0,0 +1,1049 @@ + + + + + + Object Explorer — Vision Research Dashboard + + + + + + + + +
+ + +
+
+
Object Explorer
+
Sort · Group · Assign Names · Export · data pulled from vision_runs_history.json
+
+
+ Run + + +
+ Group + + +
+ + +
+ + +
+
+ +
+ + +
+
+
+ 0 + Objects +
+
+
+ + Avg SAM Conf +
+
+
+ 0 + Unique Labels +
+
+
+ 0 + Graspable +
+
+
+ + Avg OBB Angle +
+
+
+ 0 + Stable (AP≥0.5) +
+
+
+ 0 + Relations +
+
+
+ 0 + Named +
+
+ + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ObjectSAM — Segment Anything ModelCLIP ClassificationGraspNetOBB — Oriented BBoxScene Understanding
+ Assigned Name + + Obj ID + + Run # + BBox (x1,y1 → x2,y2) + Middle Point + + z / Depth (cm) + + Confidence + + IoU Score + + AP (IoU ≥ 0.5) + + Label + + Confidence + Top-1 Acc + Has Grasp + + Quality (Q) + + Width (m) + + Angle from X (°) + + θ (rad) + W × H (px)OBB Center + Relations + Relation Detail
Loading…
+
+
+ +
+ + +
+ + + + diff --git a/vision_scripts/collect_and_export.py b/vision_scripts/collect_and_export.py index b61ae99..35ae942 100644 --- a/vision_scripts/collect_and_export.py +++ b/vision_scripts/collect_and_export.py @@ -26,6 +26,7 @@ ros2 run vision collect_and_export """ +import math import rclpy from rclpy.node import Node from std_srvs.srv import Trigger @@ -36,6 +37,13 @@ from datetime import datetime from pathlib import Path +# OBB service uses custom_interfaces (optional — graceful skip if not built) +try: + from custom_interfaces.srv import FindObjectAngle + _OBB_AVAILABLE = True +except ImportError: + _OBB_AVAILABLE = False + # --------------------------------------------------------------------------- # Output file paths (workspace root = parent of this script's package dir) # --------------------------------------------------------------------------- @@ -121,7 +129,9 @@ def _write_sheet(ws, headers, rows, colour_col=None): "Run No", "Timestamp", "SAM Total Objects", "CLIP Filtered Regions", "Graspable Objects", "Total Relations", "Avg SAM Confidence", "Avg IoU", "Stability Rate (%)", "Scene Description", - "SAM Success", "CLIP Success", "Scene Success", "Latency (s)" + "SAM Success", "CLIP Success", "Scene Success", "OBB Success", + "Total Latency (s)", "SAM Latency (s)", "CLIP Latency (s)", + "Scene Latency (s)", "OBB Latency (s)" ] runs_rows = [] for run in runs: @@ -129,6 +139,7 @@ def _write_sheet(ws, headers, rows, colour_col=None): sam = run.get("sam", {}) clip = run.get("clip", {}) scene = run.get("scene", {}) + obb = run.get("obb", {}) runs_rows.append([ meta.get("run_no", ""), meta.get("timestamp", ""), @@ -143,7 +154,12 @@ def _write_sheet(ws, headers, rows, colour_col=None): sam.get("success", ""), clip.get("success", ""), scene.get("success", ""), + obb.get("success", ""), meta.get("latency_s", ""), + sam.get("latency_s", ""), + clip.get("latency_s", ""), + scene.get("latency_s", ""), + obb.get("latency_s", ""), ]) _write_sheet(ws_runs, runs_headers, runs_rows) @@ -154,7 +170,11 @@ def _write_sheet(ws, headers, rows, colour_col=None): "BBox X1", "BBox Y1", "BBox X2", "BBox Y2", "SAM Confidence", "CLIP Confidence", "Distance (cm)", "IoU Score", "Is Stable", - "Has Grasp", "Grasp Quality", "Grasp Width (m)" + "Has Grasp", "Grasp Quality", "Grasp Width (m)", + # OBB columns + "OBB Angle (deg)", "OBB Theta (rad)", + "OBB Width (px)", "OBB Height (px)", + "OBB Center U", "OBB Center V", ] obj_rows = [] for run in runs: @@ -177,6 +197,13 @@ def _write_sheet(ws, headers, rows, colour_col=None): obj.get("has_grasp", ""), grasp.get("quality_score", ""), grasp.get("width_m", ""), + # OBB + obj.get("obb_angle_deg", ""), + obj.get("obb_theta_rad", ""), + obj.get("obb_width_px", ""), + obj.get("obb_height_px", ""), + obj.get("obb_center_u", ""), + obj.get("obb_center_v", ""), ]) _write_sheet(ws_obj, obj_headers, obj_rows) @@ -232,6 +259,45 @@ def _write_sheet(ws, headers, rows, colour_col=None): ]) _write_sheet(ws_grasp, grasp_headers, grasp_rows) + # ---- Sheet 5: OBB Angles ---- + ws_obb = wb.create_sheet("OBB Angles") + obb_headers = [ + "Run No", "Timestamp", + "Object ID", "CLIP Label", + "OBB Angle (deg)", "OBB Theta (rad)", + "OBB Width (px)", "OBB Height (px)", + "OBB Center U", "OBB Center V", + "BBox X1", "BBox Y1", "BBox X2", "BBox Y2", + "SAM Confidence", "Distance (cm)", + "OBB Latency (s)", + ] + obb_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + obb_lat = run.get("obb", {}).get("latency_s", "") + for obj in run.get("objects", []): + if obj.get("obb_angle_deg", "") == "": + continue # skip objects where OBB wasn't available + obb_rows.append([ + run_no, ts, + obj.get("object_id", ""), + obj.get("label", ""), + obj.get("obb_angle_deg", ""), + obj.get("obb_theta_rad", ""), + obj.get("obb_width_px", ""), + obj.get("obb_height_px", ""), + obj.get("obb_center_u", ""), + obj.get("obb_center_v", ""), + obj.get("bbox_x1", ""), obj.get("bbox_y1", ""), + obj.get("bbox_x2", ""), obj.get("bbox_y2", ""), + obj.get("sam_confidence", ""), + obj.get("distance_cm", ""), + obb_lat, + ]) + _write_sheet(ws_obb, obb_headers, obb_rows) + wb.save(str(EXCEL_FILE)) print(f"[OK] Excel exported → {EXCEL_FILE}") @@ -249,6 +315,13 @@ def __init__(self): self._clip_client = self.create_client(Trigger, "/vision/classify_bbox_filtered") self._scene_client = self.create_client(Trigger, "/vision/understand_scene") + # OBB client — only if custom_interfaces is built + if _OBB_AVAILABLE: + self._obb_client = self.create_client(FindObjectAngle, "/obb/find_object_angle") + else: + self._obb_client = None + self.get_logger().warn("custom_interfaces not found — OBB step will be skipped") + # ------------------------------------------------------------------ # Low-level call helper # ------------------------------------------------------------------ @@ -273,6 +346,24 @@ def _call(self, client, service_name, timeout=10.0): return False, None return result.success, result.message + def _call_obb(self, timeout=10.0): + """Call /obb/find_object_angle (FindObjectAngle service). Returns response or None.""" + if self._obb_client is None: + return None + service_name = "/obb/find_object_angle" + self.get_logger().info(f"Waiting for {service_name} ...") + if not self._obb_client.wait_for_service(timeout_sec=5.0): + self.get_logger().warn(f"{service_name} not available — skipping OBB step") + return None + future = self._obb_client.call_async(FindObjectAngle.Request()) + start = time.time() + while not future.done(): + rclpy.spin_once(self, timeout_sec=0.05) + if time.time() - start > timeout: + self.get_logger().error(f"{service_name} call timed out") + return None + return future.result() + # ------------------------------------------------------------------ # Collect one run # ------------------------------------------------------------------ @@ -284,9 +375,10 @@ def collect_run(self, run_no): run = { "meta": {"run_no": run_no, "timestamp": ts}, - "sam": {"success": False}, - "clip": {"success": False}, - "scene": {"success": False}, + "sam": {"success": False, "latency_s": 0.0}, + "clip": {"success": False, "latency_s": 0.0}, + "scene": {"success": False, "latency_s": 0.0}, + "obb": {"success": False, "total_objects": 0, "latency_s": 0.0}, "objects": [], "relations": [], "grasps": [], @@ -295,7 +387,9 @@ def collect_run(self, run_no): # ---- 1. SAM ---- self.get_logger().info("=" * 60) self.get_logger().info(f"RUN #{run_no} — Step 1: SAM /vision/run_pipeline") + t_sam = time.perf_counter() sam_ok, sam_msg = self._call(self._sam_client, "/vision/run_pipeline") + run["sam"]["latency_s"] = round(time.perf_counter() - t_sam, 3) if sam_ok and sam_msg: try: sam_data = json.loads(sam_msg) @@ -306,6 +400,7 @@ def collect_run(self, run_no): run["sam"] = { "success": True, + "latency_s": run["sam"]["latency_s"], "total_detections": summary.get("total_detections", 0), "avg_confidence": circ.get("average_confidence", 0.0), "average_iou": coco.get("average_iou", 0.0), @@ -340,12 +435,15 @@ def collect_run(self, run_no): # ---- 2. CLIP ---- self.get_logger().info(f"RUN #{run_no} — Step 2: CLIP /vision/classify_bbox_filtered") + t_clip = time.perf_counter() clip_ok, clip_msg = self._call(self._clip_client, "/vision/classify_bbox_filtered") + run["clip"]["latency_s"] = round(time.perf_counter() - t_clip, 3) if clip_ok and clip_msg: try: clip_data = json.loads(clip_msg) run["clip"] = { "success": True, + "latency_s": run["clip"]["latency_s"], "total_sam_regions": clip_data.get("total_sam_regions", 0), "filtered_regions": clip_data.get("filtered_regions", 0), } @@ -362,12 +460,15 @@ def collect_run(self, run_no): # ---- 3. Scene Understanding ---- self.get_logger().info(f"RUN #{run_no} — Step 3: Scene /vision/understand_scene") + t_scene = time.perf_counter() scene_ok, scene_msg = self._call(self._scene_client, "/vision/understand_scene", timeout=15.0) + run["scene"]["latency_s"] = round(time.perf_counter() - t_scene, 3) if scene_ok and scene_msg: try: scene_data = json.loads(scene_msg) run["scene"] = { "success": True, + "latency_s": run["scene"]["latency_s"], "scene_id": scene_data.get("scene_id", ""), "total_objects": scene_data.get("total_objects", 0), "total_relations": scene_data.get("total_relations", 0), @@ -436,6 +537,57 @@ def collect_run(self, run_no): except Exception as e: self.get_logger().error(f" Scene parse error: {e}") + # ---- 4. OBB Angle Benchmark ---- + self.get_logger().info(f"RUN #{run_no} — Step 4: OBB /obb/find_object_angle") + t_obb = time.perf_counter() + obb_resp = self._call_obb() + run["obb"]["latency_s"] = round(time.perf_counter() - t_obb, 3) + + if obb_resp is not None and obb_resp.success: + run["obb"]["success"] = True + run["obb"]["total_objects"] = obb_resp.total_objects + + # Build a lookup: object_id -> OBB data + obb_by_id = {} + for i, oid in enumerate(obb_resp.object_ids): + theta_rad = obb_resp.thetas[i] + # The service already stores the remapped angle (90 - geom_deg). + # angle_deg is directly the display angle (0° = vertical). + angle_deg = math.degrees(theta_rad) + obb_by_id[oid] = { + "obb_center_u": obb_resp.centers_u[i], + "obb_center_v": obb_resp.centers_v[i], + "obb_theta_rad": round(theta_rad, 5), + "obb_angle_deg": round(angle_deg, 2), + "obb_width_px": round(obb_resp.widths[i], 2), + "obb_height_px": round(obb_resp.heights[i], 2), + } + + # Merge into existing objects list (match by object_id) + for obj in run["objects"]: + obb = obb_by_id.get(obj.get("object_id", "")) + if obb: + obj.update(obb) + else: + # Initialise missing OBB fields so Excel has consistent columns + obj.setdefault("obb_center_u", "") + obj.setdefault("obb_center_v", "") + obj.setdefault("obb_theta_rad", "") + obj.setdefault("obb_angle_deg", "") + obj.setdefault("obb_width_px", "") + obj.setdefault("obb_height_px", "") + + self.get_logger().info( + f" OBB: {obb_resp.total_objects} objects, " + f"latency={run['obb']['latency_s']}s" + ) + else: + # Ensure OBB keys exist even if service was unavailable + for obj in run["objects"]: + for k in ("obb_center_u","obb_center_v","obb_theta_rad","obb_angle_deg","obb_width_px","obb_height_px"): + obj.setdefault(k, "") + self.get_logger().warn(" OBB: service unavailable or returned failure") + # ---- Finalize ---- run["meta"]["latency_s"] = round(time.perf_counter() - t0, 3) self.get_logger().info( From 7a79158402dc36a4147e5ff39e239a5d42c78c5c Mon Sep 17 00:00:00 2001 From: Methasit-Pun Date: Sun, 5 Apr 2026 13:17:33 +0700 Subject: [PATCH 08/16] feat: improve intrinsic and empirical logic method --- vision/pixel_to_real.py | 244 +++++++++++++++++++++++----------------- 1 file changed, 143 insertions(+), 101 deletions(-) diff --git a/vision/pixel_to_real.py b/vision/pixel_to_real.py index 1003e1c..43d678a 100644 --- a/vision/pixel_to_real.py +++ b/vision/pixel_to_real.py @@ -12,18 +12,27 @@ float64 y # world Y coordinate (m, positive forward/away from camera) - with -0.722m offset applied float64 z # world Z coordinate (m, height above table/ground) -Calibration Data: - - Origin (0, 0, 0.8) in world: pixel (320, 500) - bottom center of image - - Green box at world (0.5, 0, 0.8): pixel (320, 240) - - Gear part at world (0.83, 0.03, 0.8): pixel (305, 95) - - Drill at world (0.571546, -0.240961, 0.831898): pixel (466, 160.5) - - Monkey wrench at world (0.623673, 0.372909, 0.806652): pixel (150, 200) - - Table depth: 0.8 m from camera - - Coordinate mapping: - * u increases right → y DECREASES (u represents -y direction) - * v increases down → x DECREASES (v represents -x direction) - * x in world increases upward in image (opposite of v) - * y in world increases leftward in image (opposite of u) +Calibration Method (iLogic Hybrid): + Linear model fitted from 22 empirical measurements (least-squares, LOO-CV RMSE ≈ 2.0 cm): + x = 0.00130317·u + 0.00002114·v − 0.56859693 + y = −0.00002728·u − 0.00133088·v + 0.98011251 + + A Gaussian-IDW empirical correction is added on top: + - Problem Zone (near any of the 22 samples): correction applied → RMSE ≈ 1.5 cm + - Golden Zone (far from all samples): pure linear model used → avoids overfitting + + Empirical samples (u, v → world x, y): + (560,362)→(0.165,0.463) (468,452)→(0.064,0.367) + (334,336)→(-0.135,0.495) (241,432)→(-0.245,0.405) + (598,245)→(0.205,0.618) (493,342)→(0.104,0.520) + (304,327)→(-0.183,0.510) (206,423)→(-0.295,0.418) + (587,113)→(0.195,0.810) (490,202)→(0.090,0.715) + (273,111)→(-0.220,0.800) (177,206)→(-0.325,0.705) + (308,302)→(-0.164,0.572) (276,324)→(-0.200,0.533) + (555,298)→(0.145,0.576) (456,384)→(0.040,0.482) + (343,284)→(-0.138,0.580) (247,375)→(-0.240,0.485) + (562, 85)→(0.166,0.860) (468,173)→(0.065,0.770) + (202, 98)→(-0.310,0.835) (101,196)→(-0.415,0.740) Setup: 1. Build the custom_interfaces package: @@ -157,55 +166,55 @@ def __init__(self, # Publisher for debug visualization self.debug_pub = self.create_publisher(Image, '/pixel_to_real/debug_image', 10) - # Calibration data: pixel coordinates (u, v) -> world coordinates (x, y, z) - # Using calibration points: - # Origin: (u=320, v=500) -> (x=0, y=0, z=0.8) - # Green box: (u=320, v=240) -> (x=0.5, y=0, z=0.8) - # Gear: (u=305, v=95) -> (x=0.83, y=0.03, z=0.8) - - # Coordinate system mapping: - # u increases right -> y DECREASES (u represents -y direction) - # v increases down -> x DECREASES (v represents -x direction) - - self.u_origin = 320 # u=320 corresponds to y=0 - self.v_origin = 500 # v=500 corresponds to x=0 - - # Calculate scaling factors from calibration points: - # Green box: du=0, dv=-260 pixels -> dx=0.5, dy=0 meters - # Gear: du=-15, dv=-405 pixels -> dx=0.83, dy=0.03 meters - - # From green box (vertical movement in image): - # dv = 240 - 500 = -260 pixels (up in image) - # dx = 0.5 - 0 = 0.5 meters (positive x, which is up) - # scale_x = 0.5 / 260 = 0.00192 m/pixel - - # From gear (horizontal AND vertical movement): - # du = 305 - 320 = -15 pixels (left in image) - # dy = 0.03 - 0 = 0.03 meters (positive y, which is left) - # scale_y = 0.03 / 15 = 0.002 m/pixel - - dv_green = 240 - 500 # -260 pixels (up in image) - dx_green = 0.5 - 0 # 0.5 meters (positive x in world) - - du_gear = 305 - 320 # -15 pixels (left in image) - dy_gear = 0.03 - 0 # 0.03 meters (positive y in world) - - self.scale_x = abs(dx_green / dv_green) # 0.5/260 = 0.00192 m/pixel - self.scale_y = abs(dy_gear / du_gear) # 0.03/15 = 0.002 m/pixel - - # Depth calibration: Store reference depth for z-coordinate conversion - # At calibration points, z should be 0.8m (table height) - # We'll measure the actual depth sensor reading and use it as reference - self.z_table = 0.8 # World z-coordinate of table surface - self.depth_reference = None # Will be set from first depth reading at calibration point - - self.get_logger().info(f'Pixel-to-world calibration: scale_x={self.scale_x:.6f} m/px, scale_y={self.scale_y:.6f} m/px') - self.get_logger().info(f'Origin: pixel({self.u_origin}, {self.v_origin}) -> world(0, 0, 0.8)') - self.get_logger().info(f'Coordinate mapping: u right=-y, v down=-x') - self.get_logger().info(f'Calibrated from green box at (320,240)->(0.5,0,0.8) and gear at (305,95)->(0.83,0.03,0.8)') - self.get_logger().info(f'Validation point: drill at (466,160)->(0.572,-0.241,0.832)') - self.get_logger().info(f'Validation point: monkey_wrench at (150,200)->(0.624,0.373,0.807)') - self.get_logger().info(f'Depth calibration: Call service at (320,240) to set depth reference for z=0.8m') + # ── iLogic Calibration ──────────────────────────────────────────────── + # Linear model fitted from 22 empirical measurements (least-squares): + # x = 0.00130317 * u +0.00002114 * v -0.56859693 + # y = -0.00002728 * u -0.00133088 * v +0.98011251 + # LOO-CV RMSE ≈ 0.020 m + self.lin_cx = np.array([+0.00130317, +0.00002114, -0.56859693]) # [u, v, 1] → x + self.lin_cy = np.array([-0.00002728, -0.00133088, +0.98011251]) # [u, v, 1] → y + + # Empirical correction table (u, v, residual_x, residual_y) + # residual = true_world - linear_prediction ← precomputed offline + # Used by the IDW Gaussian kernel to correct systematic lens distortion. + self._emp = np.array([ + # u v res_x res_y + [560, 362, -0.00383, -0.02010], # M1G_TR + [468, 452, 0.01316, 0.00117], # M1G_BL + [334, 336, -0.00878, -0.02882], # M1P_TR + [241, 432, 0.00040, 0.00640], # M1P_BL + [598, 245, -0.01089, -0.01965], # M2G_TR + [493, 342, 0.02291, 0.00854], # M2G_BL + [304, 327, -0.01747, -0.02659], # M2P_TR + [206, 423, -0.00384, 0.00649], # M2P_BL + [587, 113, -0.00378, -0.00365], # M3G_TR + [490, 202, 0.01584, 0.01714], # M3G_BL + [273, 111, -0.00952, -0.02493], # M3P_TR + [177, 206, 0.00856, 0.00391], # M3P_BL + [308, 302, -0.00316, 0.00224], # M3X_TR + [276, 324, 0.00207, -0.00836], # M3X_BL + [555, 298, -0.01601, 0.00758], # M4G_TR + [456, 384, 0.00623, 0.02542], # M4G_BL + [343, 284, -0.02237, -0.01282], # M4P_TR + [247, 375, -0.00124, 0.01070], # M4P_BL + [562, 85, 0.00041, 0.00834], # M5G_TR + [468, 173, 0.02007, 0.03286], # M5G_BL + [202, 98, -0.00669, -0.00916], # M5P_TR + [101, 196, 0.01778, 0.02352], # M5P_BL + ], dtype=np.float64) + # Gaussian kernel bandwidth (pixels). Controls how far correction influence spreads. + self._idw_sigma = 80.0 + # Threshold: if IDW weight-sum < this fraction of max possible, treat as Golden Zone. + self._golden_weight_threshold = 0.05 + + # Depth calibration + self.z_table = 0.8 + self.depth_reference = None + + self.get_logger().info('iLogic pixel-to-world: linear model + IDW empirical correction') + self.get_logger().info(f' x = {self.lin_cx[0]:+.8f}*u {self.lin_cx[1]:+.8f}*v {self.lin_cx[2]:+.8f}') + self.get_logger().info(f' y = {self.lin_cy[0]:+.8f}*u {self.lin_cy[1]:+.8f}*v {self.lin_cy[2]:+.8f}') + self.get_logger().info(f' IDW sigma={self._idw_sigma}px empirical samples={len(self._emp)}') self.get_logger().info(f'RGB topic: {self.rgb_topic}') self.get_logger().info(f'Depth topic: {self.depth_topic}') self.get_logger().info(f'Camera info topic: {self.camera_info_topic}') @@ -213,11 +222,12 @@ def __init__(self, # Store calibration validation points for accuracy checking self.validation_points = [ - {"name": "green_box", "pixel": (320, 240), "world": (0.5, 0.0, 0.8)}, - {"name": "gear", "pixel": (305, 95), "world": (0.83, 0.03, 0.8)}, - {"name": "drill", "pixel": (466, 160), "world": (0.571546, -0.240961, 0.831898)}, - {"name": "monkey_wrench", "pixel": (150, 200), "world": (0.623673, 0.372909, 0.806652)}, - {"name": "origin", "pixel": (320, 500), "world": (0.0, 0.0, 0.8)} + {"name": "M1G_TR", "pixel": (560, 362), "world": ( 0.165, 0.463, 0.8)}, + {"name": "M1P_TR", "pixel": (334, 336), "world": (-0.135, 0.495, 0.8)}, + {"name": "M3G_TR", "pixel": (587, 113), "world": ( 0.195, 0.810, 0.8)}, + {"name": "M3P_TR", "pixel": (273, 111), "world": (-0.220, 0.800, 0.8)}, + {"name": "M5G_TR", "pixel": (562, 85), "world": ( 0.166, 0.860, 0.8)}, + {"name": "M5P_TR", "pixel": (202, 98), "world": (-0.310, 0.835, 0.8)}, ] # TF @@ -265,53 +275,85 @@ def depth_cb(self, msg: Image): def info_cb(self, msg: CameraInfo): self.camera_info = msg + def _idw_correction(self, u: float, v: float): + """iLogic: compute Gaussian-IDW empirical correction at pixel (u, v). + + Golden Zone (low influence from empirical samples) → correction ≈ 0. + Problem Zone (high influence near known distortion samples) → correction applied. + + Returns (corr_x, corr_y, weight_sum_norm) where weight_sum_norm in [0, 1]. + """ + eu = self._emp[:, 0] + ev = self._emp[:, 1] + err_x = self._emp[:, 2] + err_y = self._emp[:, 3] + + dists = np.sqrt((eu - u) ** 2 + (ev - v) ** 2) + weights = np.exp(-0.5 * (dists / self._idw_sigma) ** 2) + w_sum = weights.sum() + + # Maximum possible weight_sum (if u,v were exactly on a sample point) + max_w_sum = len(self._emp) * 1.0 # upper bound: all weights=1 + w_norm = w_sum / max_w_sum + + if w_sum < 1e-12: + return 0.0, 0.0, 0.0 + + corr_x = float(np.dot(weights, err_x) / w_sum) + corr_y = float(np.dot(weights, err_y) / w_sum) + return corr_x, corr_y, w_norm + def pixel_to_world_calibrated(self, u: int, v: int, depth_m: float): - """Convert pixel (u,v) to world coordinates (x,y,z) using calibration. - - Coordinate system: - - u increases right -> y DECREASES (u represents -y direction) - - v increases down -> x DECREASES (v represents -x direction) - - Origin at pixel (320, 500) = world (0, 0, 0.8) - - Depth is inversely related to z: small depth = high z (near camera, far from ground) - + """iLogic pixel → world conversion. + + 1. Apply the empirically fitted linear model (replaces the old 2-point + scale_x / scale_y formula which had large systematic errors). + 2. Add a Gaussian-IDW empirical correction: + - Problem Zone (high weight from nearby samples): correction applied fully. + - Golden Zone (low weight / far from all samples): correction fades to 0, + pure linear model used — avoids overfitting in unsampled areas. + Args: u: pixel column (positive right) v: pixel row (positive down) - depth_m: depth in meters from camera (from depth sensor) - + depth_m: depth in meters from camera + Returns: (x, y, z) in world coordinates (meters) """ - # Calculate pixel offset from origin - du = u - self.u_origin # positive = right in image - dv = v - self.v_origin # positive = down in image - - # Apply transformation based on coordinate mapping: - # v down (-dv up) -> x increases: x = -dv * scale_x - # u right (-du left) -> y increases: y = -du * scale_y - x = -dv * self.scale_x # Up in image -> positive x - y = -du * self.scale_y # Left in image -> positive y - - # Apply y-offset for robot reference frame - # y = y - 0.5442 # Shift y by -0.5442 meters - - # Convert depth to z-coordinate - # Depth is inversely related to z: smaller depth = further from ground = higher z - # At table (z=0.8), we need to calibrate based on actual depth reading - # If depth_reference is set, use it; otherwise estimate from depth + # ── Step 1: Linear model (Golden Zone baseline) ─────────────────────── + feat = np.array([u, v, 1.0]) + x = float(np.dot(self.lin_cx, feat)) + y = float(np.dot(self.lin_cy, feat)) + + # ── Step 2: iLogic empirical correction ─────────────────────────────── + corr_x, corr_y, w_norm = self._idw_correction(float(u), float(v)) + + if w_norm >= self._golden_weight_threshold: + # Problem Zone: apply correction + x += corr_x + y += corr_y + zone = 'problem' + else: + # Golden Zone: trust linear model, skip correction + zone = 'golden' + + self.get_logger().debug( + f'iLogic zone={zone} w_norm={w_norm:.3f} ' + f'corr=({corr_x:+.4f},{corr_y:+.4f}) ' + f'-> x={x:.4f} y={y:.4f}' + ) + + # ── Step 3: Depth → z ───────────────────────────────────────────────── if self.real_hardware: - z = depth_m # Direct mapping for hardware + z = depth_m elif self.depth_reference is not None: - # z = z_table + (depth_reference - depth) - # When depth < depth_reference (closer to camera), z increases - # When depth > depth_reference (further from camera), z decreases z = self.z_table + (self.depth_reference - depth_m) else: - # First call: assume this is close to table depth, set reference self.depth_reference = depth_m self.get_logger().info(f'Set depth reference: {self.depth_reference:.3f}m at z={self.z_table}m') z = self.z_table - + return (x, y, z) def read_depth_at(self, u: float, v: float, max_search: int = 5): @@ -429,8 +471,8 @@ def handle_pixel_to_real(self, req, resp): cv2.putText(debug_img, label_depth, (u + 25, v + 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) - # Draw origin marker at (320, 500) - origin_u, origin_v = self.u_origin, self.v_origin + # Draw frame-centre marker (approximate optical centre) + origin_u, origin_v = 320, 240 if 0 <= origin_u < debug_img.shape[1] and 0 <= origin_v < debug_img.shape[0]: cv2.drawMarker(debug_img, (origin_u, origin_v), (255, 255, 0), cv2.MARKER_TILTED_CROSS, 30, 2) From a261190a3d653e379b112ebcb2b6795c8941d03a Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Thu, 9 Apr 2026 14:15:24 +0700 Subject: [PATCH 09/16] feat: ref from previous p2r --- vision/pixel_to_real.py | 665 +++++++++++++--------------------------- 1 file changed, 211 insertions(+), 454 deletions(-) diff --git a/vision/pixel_to_real.py b/vision/pixel_to_real.py index 43d678a..3347431 100644 --- a/vision/pixel_to_real.py +++ b/vision/pixel_to_real.py @@ -8,46 +8,30 @@ int32 u # pixel column (x-coordinate, positive to the right) int32 v # pixel row (y-coordinate, positive downward) Response: - float64 x # world X coordinate (m, positive to the right) - float64 y # world Y coordinate (m, positive forward/away from camera) - with -0.722m offset applied + float64 x # world X coordinate (m) + float64 y # world Y coordinate (m, positive forward/away from camera) float64 z # world Z coordinate (m, height above table/ground) -Calibration Method (iLogic Hybrid): - Linear model fitted from 22 empirical measurements (least-squares, LOO-CV RMSE ≈ 2.0 cm): - x = 0.00130317·u + 0.00002114·v − 0.56859693 - y = −0.00002728·u − 0.00133088·v + 0.98011251 - - A Gaussian-IDW empirical correction is added on top: - - Problem Zone (near any of the 22 samples): correction applied → RMSE ≈ 1.5 cm - - Golden Zone (far from all samples): pure linear model used → avoids overfitting - - Empirical samples (u, v → world x, y): - (560,362)→(0.165,0.463) (468,452)→(0.064,0.367) - (334,336)→(-0.135,0.495) (241,432)→(-0.245,0.405) - (598,245)→(0.205,0.618) (493,342)→(0.104,0.520) - (304,327)→(-0.183,0.510) (206,423)→(-0.295,0.418) - (587,113)→(0.195,0.810) (490,202)→(0.090,0.715) - (273,111)→(-0.220,0.800) (177,206)→(-0.325,0.705) - (308,302)→(-0.164,0.572) (276,324)→(-0.200,0.533) - (555,298)→(0.145,0.576) (456,384)→(0.040,0.482) - (343,284)→(-0.138,0.580) (247,375)→(-0.240,0.485) - (562, 85)→(0.166,0.860) (468,173)→(0.065,0.770) - (202, 98)→(-0.310,0.835) (101,196)→(-0.415,0.740) - -Setup: - 1. Build the custom_interfaces package: - cd ~/final_project_ws - colcon build --packages-select custom_interfaces - source install/setup.bash - - 2. Build the vision package: - colcon build --packages-select vision --symlink-install - source install/setup.bash - - 3. Start the pixel_to_real service node: - ros2 run vision pixel_to_real_service +Calibration Methods (Hybrid): + METHOD 1 – Intrinsic: + Pinhole back-projection using camera intrinsics + known camera pose (R, t). + t_base_cam is the tunable offset; R_base_cam captures camera orientation. + + METHOD 2 – Empirical: + Least-squares fit on 22 calibration points (5 measures × 2 colours × TR/BL), + 2026-03-12, camera position [-0.0361, 0.5303, 0.6458], height 0.67 m: + x = -0.56859693 + 0.00130317*u + 0.00002114*v (RMS 12.1 mm) + y = 0.98011251 - 0.00002728*u - 0.00133088*v (RMS 16.9 mm) + 2D RMSE = 20.8 mm + + HYBRID: + Gaussian blend – empirical weighted higher near image centre (calibration + region), intrinsic weighted higher toward edges. + +Tuning: + • t_base_cam – translate camera origin in base frame (x, y, z) + • Empirical coefficients – refit from new calibration measurements -Usage: # Example: green box at pixel (320, 240) → world (0.5, 0.0, 0.8) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 320, v: 240}" @@ -66,27 +50,6 @@ # View debug visualization: rqt_image_view /pixel_to_real/debug_image -How it works: - Uses calibration points to compute a linear pixel-to-world transformation. - Reads depth from /camera/depth/image_raw to determine z-coordinate. - Shows debug visualization with pixel location and computed world coordinates. -""" - - -""" -# linear approx. formula -# Given pixel coordinates (u, v) and depth d from sensor: - -# Step 1: Calculate pixel offset from origin -du = u - 320 # Origin u-coordinate -dv = v - 500 # Origin v-coordinate - -# Step 2: Apply scaling and sign conversion -x = -dv * 0.001923 # scale_x = 0.5/260 ≈ 0.001923 m/pixel -y = -du * 0.002 # scale_y = 0.03/15 = 0.002 m/pixel - -# Step 3: Convert depth to z-coordinate -z = 0.8 + (depth_reference - d) # depth_reference set on first call """ @@ -94,446 +57,240 @@ import rclpy from rclpy.node import Node from sensor_msgs.msg import Image, CameraInfo -from geometry_msgs.msg import PointStamped from cv_bridge import CvBridge import numpy as np import math -import tf2_ros import cv2 -import pyrealsense2 as rs -# Replace with your actual service type. The repo often imports custom interfaces -# from `custom_interfaces.srv`. Adjust the import if your package name differs. +from image_geometry import PinholeCameraModel + try: from custom_interfaces.srv import PixelToReal except Exception: - # The import may fail in editors; keep the name for when the package is built. PixelToReal = None class PixelToRealServer(Node): - def __init__(self, - rgb_topic: str = '/camera/image_raw', - depth_topic: str = '/camera/depth/image_raw', - info_topic: str = '/camera/camera_info', - default_target_frame: str = 'world'): + + def __init__(self): super().__init__('pixel_to_real_server') - # Parameter toggles simulated vs hardware camera topics - self.declare_parameter('real_hardware', False) + self.declare_parameter('real_hardware', True) self.real_hardware = bool(self.get_parameter('real_hardware').value) - # RealSense-specific variables for hardware mode - self.rs_pipeline = None - self.rs_align = None - self.rs_intrinsics = None - if self.real_hardware: - self.rgb_topic = '/camera/color/image_raw' - self.depth_topic = '/camera/depth/image_rect_raw' - self.camera_info_topic = 'camera/color/camera_info' - self.color_encoding = 'passthrough' - self.depth_32_encoding = 'passthrough' - self.depth_16_encoding = 'passthrough' - - # Note: We DO NOT initialize RealSense pipeline here! - # The camera is published by a separate node (e.g., realsense-ros) - # We only subscribe to the published topics - self.get_logger().info('Real hardware mode: subscribing to RealSense topics') - self.get_logger().info(f' RGB: {self.rgb_topic}') - self.get_logger().info(f' Depth: {self.depth_topic}') - self.get_logger().info(f' Camera Info: {self.camera_info_topic}') + depth_topic = '/camera/depth/image_rect_raw' + info_topic = '/camera/color/camera_info' + rgb_topic = '/camera/color/image_raw' else: - self.rgb_topic = rgb_topic or '/camera/image_raw' - self.depth_topic = depth_topic or '/camera/depth/image_raw' - self.camera_info_topic = info_topic or '/camera/camera_info' - self.color_encoding = 'bgr8' - self.depth_32_encoding = '32FC1' - self.depth_16_encoding = '16UC1' - - self.bridge = CvBridge() - self.latest_rgb = None - self.latest_rgb_header = None + depth_topic = '/camera/depth/image_raw' + info_topic = '/camera/camera_info' + rgb_topic = '/camera/image_raw' + + self.bridge = CvBridge() + self.cam_model = PinholeCameraModel() + + self.latest_rgb = None self.latest_depth = None - self.latest_depth_header = None - self.camera_info = None - self.default_target_frame = default_target_frame + self.camera_ready = False - self.rgb_sub = self.create_subscription(Image, self.rgb_topic, self.rgb_cb, 10) - self.depth_sub = self.create_subscription(Image, self.depth_topic, self.depth_cb, 10) - self.info_sub = self.create_subscription(CameraInfo, self.camera_info_topic, self.info_cb, 10) + self.depth_sub = self.create_subscription(Image, depth_topic, self.depth_cb, 10) + self.info_sub = self.create_subscription(CameraInfo, info_topic, self.info_cb, 10) + self.rgb_sub = self.create_subscription(Image, rgb_topic, self.rgb_cb, 10) - # Publisher for debug visualization self.debug_pub = self.create_publisher(Image, '/pixel_to_real/debug_image', 10) - # ── iLogic Calibration ──────────────────────────────────────────────── - # Linear model fitted from 22 empirical measurements (least-squares): - # x = 0.00130317 * u +0.00002114 * v -0.56859693 - # y = -0.00002728 * u -0.00133088 * v +0.98011251 - # LOO-CV RMSE ≈ 0.020 m - self.lin_cx = np.array([+0.00130317, +0.00002114, -0.56859693]) # [u, v, 1] → x - self.lin_cy = np.array([-0.00002728, -0.00133088, +0.98011251]) # [u, v, 1] → y - - # Empirical correction table (u, v, residual_x, residual_y) - # residual = true_world - linear_prediction ← precomputed offline - # Used by the IDW Gaussian kernel to correct systematic lens distortion. - self._emp = np.array([ - # u v res_x res_y - [560, 362, -0.00383, -0.02010], # M1G_TR - [468, 452, 0.01316, 0.00117], # M1G_BL - [334, 336, -0.00878, -0.02882], # M1P_TR - [241, 432, 0.00040, 0.00640], # M1P_BL - [598, 245, -0.01089, -0.01965], # M2G_TR - [493, 342, 0.02291, 0.00854], # M2G_BL - [304, 327, -0.01747, -0.02659], # M2P_TR - [206, 423, -0.00384, 0.00649], # M2P_BL - [587, 113, -0.00378, -0.00365], # M3G_TR - [490, 202, 0.01584, 0.01714], # M3G_BL - [273, 111, -0.00952, -0.02493], # M3P_TR - [177, 206, 0.00856, 0.00391], # M3P_BL - [308, 302, -0.00316, 0.00224], # M3X_TR - [276, 324, 0.00207, -0.00836], # M3X_BL - [555, 298, -0.01601, 0.00758], # M4G_TR - [456, 384, 0.00623, 0.02542], # M4G_BL - [343, 284, -0.02237, -0.01282], # M4P_TR - [247, 375, -0.00124, 0.01070], # M4P_BL - [562, 85, 0.00041, 0.00834], # M5G_TR - [468, 173, 0.02007, 0.03286], # M5G_BL - [202, 98, -0.00669, -0.00916], # M5P_TR - [101, 196, 0.01778, 0.02352], # M5P_BL - ], dtype=np.float64) - # Gaussian kernel bandwidth (pixels). Controls how far correction influence spreads. - self._idw_sigma = 80.0 - # Threshold: if IDW weight-sum < this fraction of max possible, treat as Golden Zone. - self._golden_weight_threshold = 0.05 - - # Depth calibration - self.z_table = 0.8 - self.depth_reference = None - - self.get_logger().info('iLogic pixel-to-world: linear model + IDW empirical correction') - self.get_logger().info(f' x = {self.lin_cx[0]:+.8f}*u {self.lin_cx[1]:+.8f}*v {self.lin_cx[2]:+.8f}') - self.get_logger().info(f' y = {self.lin_cy[0]:+.8f}*u {self.lin_cy[1]:+.8f}*v {self.lin_cy[2]:+.8f}') - self.get_logger().info(f' IDW sigma={self._idw_sigma}px empirical samples={len(self._emp)}') - self.get_logger().info(f'RGB topic: {self.rgb_topic}') - self.get_logger().info(f'Depth topic: {self.depth_topic}') - self.get_logger().info(f'Camera info topic: {self.camera_info_topic}') - self.get_logger().info(f'real_hardware parameter: {self.real_hardware}') - - # Store calibration validation points for accuracy checking - self.validation_points = [ - {"name": "M1G_TR", "pixel": (560, 362), "world": ( 0.165, 0.463, 0.8)}, - {"name": "M1P_TR", "pixel": (334, 336), "world": (-0.135, 0.495, 0.8)}, - {"name": "M3G_TR", "pixel": (587, 113), "world": ( 0.195, 0.810, 0.8)}, - {"name": "M3P_TR", "pixel": (273, 111), "world": (-0.220, 0.800, 0.8)}, - {"name": "M5G_TR", "pixel": (562, 85), "world": ( 0.166, 0.860, 0.8)}, - {"name": "M5P_TR", "pixel": (202, 98), "world": (-0.310, 0.835, 0.8)}, - ] - - # TF - self.tf_buffer = tf2_ros.Buffer() - self.tf_listener = tf2_ros.TransformListener(self.tf_buffer, self) - - # Create service + # ── Depth scale (16UC1 → metres) ───────────────────────────────────── + self.depth_scale = 0.001 # mm → m + + # ── METHOD 1: Camera pose in base_link ─────────────────────────────── + # t_base_cam : camera origin expressed in base frame ← TUNE THIS + # R_base_cam : rotation that maps camera axes to base axes + # (camera Z forward → base -Z, camera Y down → base -Y) + self.t_base_cam = np.array([-0.146, 0.635, 0.8]) # <-- adjust here + + self.R_base_cam = np.array([ + [ 1.0, 0.0, 0.0], + [ 0.0, -1.0, 0.0], + [ 0.0, 0.0, -1.0], + ]) + + # ── METHOD 2: Empirical linear model ───────────────────────────────── + # Least-squares fit on 22 calibration points (5 measures × 2 colours + # × TR/BL), 2026-03-12. Model: x = a0 + a1*u + a2*v + # y = b0 + b1*u + b2*v + # 2D RMSE = 20.8 mm (x: 12.1 mm, y: 16.9 mm) + self._emp_ax = np.array([-0.56859693, +0.00130317, +0.00002114]) # [1,u,v]→x + self._emp_ay = np.array([+0.98011251, -0.00002728, -0.00133088]) # [1,u,v]→y + self._emp_z = -0.002 # table height assumption (m) + + # ── Hybrid blend parameters ─────────────────────────────────────────── + # Empirical is most accurate near image centre (calibration region). + # Gaussian decay controls how quickly we trust intrinsics at the edges. + self._image_center = (320, 240) # (u_c, v_c) + self._blend_sigma_frac = 0.6 # fraction of max diagonal for σ + if PixelToReal is not None: - self.srv = self.create_service(PixelToReal, 'pixel_to_real', self.handle_pixel_to_real) - self.get_logger().info('Service /pixel_to_real ready (custom_interfaces.srv.PixelToReal)') - self.get_logger().info('Usage: ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 320, v: 240}"') + self.srv = self.create_service( + PixelToReal, '/pixel_to_real', self.handle_pixel_to_real + ) + self.get_logger().info('Service /pixel_to_real ready') else: - self.get_logger().error('PixelToReal srv type not found!') - self.get_logger().error('Build custom_interfaces: colcon build --packages-select custom_interfaces') + self.get_logger().error( + 'PixelToReal srv not found – build custom_interfaces first.' + ) raise RuntimeError('custom_interfaces.srv.PixelToReal not available') + self.get_logger().info( + f't_base_cam = {self.t_base_cam.tolist()} ' + f'real_hardware={self.real_hardware}' + ) + + # ── Callbacks ───────────────────────────────────────────────────────────── + def rgb_cb(self, msg: Image): - """Store the latest RGB image for pixel coordinate validation.""" try: - rgb_img = self.bridge.imgmsg_to_cv2(msg, desired_encoding=self.color_encoding) - self.latest_rgb = rgb_img - self.latest_rgb_header = msg.header + self.latest_rgb = self.bridge.imgmsg_to_cv2(msg, 'bgr8') except Exception as e: - self.get_logger().error(f"Failed to convert RGB image: {e}") + self.get_logger().error(f'RGB convert error: {e}') def depth_cb(self, msg: Image): - # Support 32FC1 and 16UC1 encodings; convert to float32 meters. try: - if msg.encoding == '32FC1' or msg.encoding == '32F': - depth_img = self.bridge.imgmsg_to_cv2(msg, desired_encoding=self.depth_32_encoding) - depth = depth_img.astype(np.float32) - elif msg.encoding == '16UC1' or msg.encoding == '16U': - d16 = self.bridge.imgmsg_to_cv2(msg, desired_encoding=self.depth_16_encoding) - depth = d16.astype(np.float32) / 1000.0 # assume mm -> m + if msg.encoding in ('16UC1', '16U'): + raw = self.bridge.imgmsg_to_cv2(msg, 'passthrough') + self.latest_depth = raw.astype(np.float32) * self.depth_scale + elif msg.encoding in ('32FC1', '32F'): + self.latest_depth = self.bridge.imgmsg_to_cv2( + msg, 'passthrough' + ).astype(np.float32) else: - # Try a generic conversion to float32 - depth_img = self.bridge.imgmsg_to_cv2(msg, desired_encoding='passthrough') - depth = depth_img.astype(np.float32) - - self.latest_depth = depth - self.latest_depth_header = msg.header + raw = self.bridge.imgmsg_to_cv2(msg, 'passthrough') + self.latest_depth = raw.astype(np.float32) except Exception as e: - self.get_logger().error(f"Failed to convert depth image: {e}") + self.get_logger().error(f'Depth convert error: {e}') def info_cb(self, msg: CameraInfo): - self.camera_info = msg - - def _idw_correction(self, u: float, v: float): - """iLogic: compute Gaussian-IDW empirical correction at pixel (u, v). + self.cam_model.fromCameraInfo(msg) + self.camera_ready = True - Golden Zone (low influence from empirical samples) → correction ≈ 0. - Problem Zone (high influence near known distortion samples) → correction applied. + # ── Depth reading ────────────────────────────────────────────────────────── - Returns (corr_x, corr_y, weight_sum_norm) where weight_sum_norm in [0, 1]. - """ - eu = self._emp[:, 0] - ev = self._emp[:, 1] - err_x = self._emp[:, 2] - err_y = self._emp[:, 3] - - dists = np.sqrt((eu - u) ** 2 + (ev - v) ** 2) - weights = np.exp(-0.5 * (dists / self._idw_sigma) ** 2) - w_sum = weights.sum() - - # Maximum possible weight_sum (if u,v were exactly on a sample point) - max_w_sum = len(self._emp) * 1.0 # upper bound: all weights=1 - w_norm = w_sum / max_w_sum - - if w_sum < 1e-12: - return 0.0, 0.0, 0.0 - - corr_x = float(np.dot(weights, err_x) / w_sum) - corr_y = float(np.dot(weights, err_y) / w_sum) - return corr_x, corr_y, w_norm - - def pixel_to_world_calibrated(self, u: int, v: int, depth_m: float): - """iLogic pixel → world conversion. - - 1. Apply the empirically fitted linear model (replaces the old 2-point - scale_x / scale_y formula which had large systematic errors). - 2. Add a Gaussian-IDW empirical correction: - - Problem Zone (high weight from nearby samples): correction applied fully. - - Golden Zone (low weight / far from all samples): correction fades to 0, - pure linear model used — avoids overfitting in unsampled areas. - - Args: - u: pixel column (positive right) - v: pixel row (positive down) - depth_m: depth in meters from camera - - Returns: - (x, y, z) in world coordinates (meters) + def get_robust_depth(self, u: int, v: int, window: int = 5) -> float | None: + """Median depth over a small window; returns None if no valid pixels.""" + if self.latest_depth is None: + return None + h, w = self.latest_depth.shape + half = window // 2 + u0, u1 = max(u - half, 0), min(u + half + 1, w) + v0, v1 = max(v - half, 0), min(v + half + 1, h) + patch = self.latest_depth[v0:v1, u0:u1] + valid = patch[np.isfinite(patch) & (patch > 0.0)] + return float(np.median(valid)) if len(valid) > 0 else None + + # ── Conversion methods ───────────────────────────────────────────────────── + + def _intrinsic_estimate(self, u: int, v: int, Z: float) -> np.ndarray: + """METHOD 1: pinhole back-projection + rigid-body transform to base. + + ray = cam_model.projectPixelTo3dRay((u, v)) [unit vector in cam frame] + p_cam = ray * Z [scale to depth] + p_base = R @ p_cam + t [transform to base frame] """ - # ── Step 1: Linear model (Golden Zone baseline) ─────────────────────── - feat = np.array([u, v, 1.0]) - x = float(np.dot(self.lin_cx, feat)) - y = float(np.dot(self.lin_cy, feat)) - - # ── Step 2: iLogic empirical correction ─────────────────────────────── - corr_x, corr_y, w_norm = self._idw_correction(float(u), float(v)) - - if w_norm >= self._golden_weight_threshold: - # Problem Zone: apply correction - x += corr_x - y += corr_y - zone = 'problem' + if not self.camera_ready: + # Fallback: hardcoded D435i intrinsics + fx, fy, cx, cy = 615.0, 615.0, 320.0, 240.0 + ray = np.array([(u - cx) / fx, (v - cy) / fy, 1.0]) else: - # Golden Zone: trust linear model, skip correction - zone = 'golden' + ray = np.array(self.cam_model.projectPixelTo3dRay((u, v))) + + p_cam = ray * Z + p_base = self.R_base_cam @ p_cam + self.t_base_cam + return p_base + + def _empirical_estimate(self, u: int, v: int) -> np.ndarray: + """METHOD 2: simple linear model fitted from calibration data.""" + feat = np.array([1.0, float(u), float(v)]) + x = float(np.dot(self._emp_ax, feat)) + y = float(np.dot(self._emp_ay, feat)) + return np.array([x, y, self._emp_z]) + + def _hybrid_estimate( + self, + u: int, v: int, + p_intrinsic: np.ndarray, + p_empirical: np.ndarray, + ) -> tuple[np.ndarray, float, float]: + """HYBRID: Gaussian-weighted blend. + + Empirical weight is highest at the image centre (calibration region) + and decays towards the edges; intrinsic picks up the slack. + """ + uc, vc = self._image_center + dist = math.sqrt((u - uc) ** 2 + (v - vc) ** 2) + sigma = math.sqrt(uc ** 2 + vc ** 2) * self._blend_sigma_frac # ~σ in pixels + + w_emp = math.exp(-dist / sigma) + w_int = 1.0 - w_emp + + p_hybrid = w_emp * p_empirical + w_int * p_intrinsic + return p_hybrid, w_emp, w_int + + # ── Service handler ──────────────────────────────────────────────────────── + + def handle_pixel_to_real(self, request, response): + u = int(request.u) + v = int(request.v) + + # ── Depth ───────────────────────────────────────────────────────────── + Z = self.get_robust_depth(u, v) + if Z is None: + self.get_logger().warn(f'No valid depth at ({u},{v}), using 0.8 m fallback.') + Z = 0.8 # fallback to estimated table depth - self.get_logger().debug( - f'iLogic zone={zone} w_norm={w_norm:.3f} ' - f'corr=({corr_x:+.4f},{corr_y:+.4f}) ' - f'-> x={x:.4f} y={y:.4f}' + # ── Compute both estimates ──────────────────────────────────────────── + p_int = self._intrinsic_estimate(u, v, Z) + p_emp = self._empirical_estimate(u, v) + + # ── Blend ───────────────────────────────────────────────────────────── + p_hybrid, w_emp, w_int = self._hybrid_estimate(u, v, p_int, p_emp) + + response.x = float(p_hybrid[0]) + response.y = float(p_hybrid[1]) + response.z = float(p_hybrid[2]) + + self.get_logger().info( + f'Pixel ({u},{v}) depth={Z:.3f}m | ' + f'Intrinsic=[{p_int[0]:.4f},{p_int[1]:.4f},{p_int[2]:.4f}] ' + f'Empirical=[{p_emp[0]:.4f},{p_emp[1]:.4f}] ' + f'Hybrid=[{p_hybrid[0]:.4f},{p_hybrid[1]:.4f},{p_hybrid[2]:.4f}] ' + f'(w_emp={w_emp:.3f} w_int={w_int:.3f})' ) - # ── Step 3: Depth → z ───────────────────────────────────────────────── - if self.real_hardware: - z = depth_m - elif self.depth_reference is not None: - z = self.z_table + (self.depth_reference - depth_m) - else: - self.depth_reference = depth_m - self.get_logger().info(f'Set depth reference: {self.depth_reference:.3f}m at z={self.z_table}m') - z = self.z_table - - return (x, y, z) - - def read_depth_at(self, u: float, v: float, max_search: int = 5): - """Read depth with bilinear interpolation; if invalid, search a median window. - Returns depth in meters. Assumes table is at 0.8m depth. - - Handles invalid depth (NaN/zero/inf) common on reflective surfaces or sensor noise - by searching a 5-pixel neighborhood and taking the median of valid depth values. - - The depth sensor returns distance from camera. We assume the table surface - is at 0.8m from the camera, which should be the largest/most common depth value. - """ - if self.latest_depth is None: - self.get_logger().warn('No depth image available, using default table depth 0.8m') - return 0.8 # Default table depth - - depth = self.latest_depth - h, w = depth.shape[:2] - if not (0 <= int(v) < h and 0 <= int(u) < w): - return 0.8 # Default if out of bounds - - def bilinear(u_, v_): - x0 = int(math.floor(u_)); x1 = min(x0 + 1, w - 1) - y0 = int(math.floor(v_)); y1 = min(y0 + 1, h - 1) - wa = (x1 - u_) * (y1 - v_) - wb = (u_ - x0) * (y1 - v_) - wc = (x1 - u_) * (v_ - y0) - wd = (u_ - x0) * (v_ - y0) - d00 = float(depth[y0, x0]); d10 = float(depth[y0, x1]) - d01 = float(depth[y1, x0]); d11 = float(depth[y1, x1]) - d = wa * d00 + wb * d10 + wc * d01 + wd * d11 - if np.isnan(d) or d <= 0.0 or np.isinf(d): - return None - return float(d) - - - #ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 220, v: 220}" - # Use topic-based depth reading for both hardware and simulation - if self.latest_depth is not None: - d = self.latest_depth[int(v), int(u)] - self.get_logger().info(f'Read depth at ({u},{v}): {d:.3f}m from topic') - else: - self.get_logger().warn('No depth data available, using default 0.8m') - return 0.8 - - - - - - - - - # Fallback: collect valid depths in neighborhood and take median - valid_depths = [] - for du in range(-max_search, max_search + 1): - for dv in range(-max_search, max_search + 1): - uu = u + du - vv = v + dv - if 0 <= int(uu) < w and 0 <= int(vv) < h: - d_val = bilinear(uu, vv) - if d_val is not None: - valid_depths.append(d_val) - - if len(valid_depths) == 0: - self.get_logger().warn(f'No valid depth found within {max_search}px of ({u:.1f},{v:.1f}), using table depth 0.8m') - return 0.8 # Default table depth - - # Use median to be robust against outliers - median_depth = float(np.median(valid_depths)) - return median_depth - - def backproject(self, u: float, v: float, d: float): - # Use camera_info intrinsics - K = self.camera_info.k - fx = K[0]; fy = K[4]; cx = K[2]; cy = K[5] - x_c = (u - cx) * d / fx - y_c = (v - cy) * d / fy - z_c = d - return np.array([x_c, y_c, z_c], dtype=np.float64) - - def handle_pixel_to_real(self, req, resp): - """Handle pixel to real coordinate conversion service request.""" - # Get pixel coordinates from request (int32 fields) - u = int(req.u) - v = int(req.v) - - # Get depth at this pixel (default to table depth 0.8m) - depth_m = self.read_depth_at(float(u), float(v)) - - # Convert pixel to world coordinates using calibration - x_w, y_w, z_w = self.pixel_to_world_calibrated(u, v, depth_m) - - # DEBUG: Visualize the pixel location and world coordinates on the image - if self.latest_rgb is not None: - debug_img = self.latest_rgb.copy() - - # Draw a large crosshair at the requested pixel - cv2.drawMarker(debug_img, (u, v), (0, 0, 255), cv2.MARKER_CROSS, 40, 3) - - # Draw a circle around it - cv2.circle(debug_img, (u, v), 20, (0, 255, 0), 2) - - # Add text label with pixel coordinates - label_pixel = f"Pixel: ({u}, {v})" - cv2.putText(debug_img, label_pixel, (u + 25, v - 45), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2) - - # Add text label with world coordinates - label_world = f"World: ({x_w:.3f}, {y_w:.3f}, {z_w:.3f})m" - cv2.putText(debug_img, label_world, (u + 25, v - 20), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2) - - # Add depth label - label_depth = f"Depth: {depth_m:.3f}m" - cv2.putText(debug_img, label_depth, (u + 25, v + 5), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) - - # Draw frame-centre marker (approximate optical centre) - origin_u, origin_v = 320, 240 - if 0 <= origin_u < debug_img.shape[1] and 0 <= origin_v < debug_img.shape[0]: - cv2.drawMarker(debug_img, (origin_u, origin_v), (255, 255, 0), - cv2.MARKER_TILTED_CROSS, 30, 2) - cv2.putText(debug_img, "Origin (0,0)", (origin_u + 10, origin_v - 10), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2) - - # Draw coordinate axes for reference - cv2.putText(debug_img, "u -> (right)", (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) - cv2.putText(debug_img, "v", (10, 55), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) - cv2.putText(debug_img, "|", (10, 68), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) - cv2.putText(debug_img, "v (down)", (10, 85), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) - - # Publish debug image - try: - debug_msg = self.bridge.cv2_to_imgmsg(debug_img, encoding='bgr8') - debug_msg.header.stamp = self.get_clock().now().to_msg() - debug_msg.header.frame_id = 'camera_link' - self.debug_pub.publish(debug_msg) - self.get_logger().info(f'Published debug image for pixel ({u}, {v})') - except Exception as e: - self.get_logger().error(f'Failed to publish debug image: {e}') - - # Validate pixel coordinates against RGB image dimensions - if self.latest_rgb is not None: - rgb_h, rgb_w = self.latest_rgb.shape[:2] - if not (0 <= u < rgb_w and 0 <= v < rgb_h): - self.get_logger().error(f'Pixel ({u},{v}) out of RGB image bounds ({rgb_w}x{rgb_h})') - # Return zero coordinates for out of bounds - resp.x = 0.0 - resp.y = 0.0 - resp.z = 0.0 - return resp - - # Fill response (float64 fields) - resp.x = float(x_w) - resp.y = float(y_w) - resp.z = float(z_w) - - # Check if this is a known calibration point and log the error - for calib_point in self.validation_points: - if abs(u - calib_point["pixel"][0]) <= 1 and abs(v - calib_point["pixel"][1]) <= 1: - expected = calib_point["world"] - error_x = x_w - expected[0] - error_y = y_w - expected[1] - error_z = z_w - expected[2] - error_dist = np.sqrt(error_x**2 + error_y**2 + error_z**2) - - self.get_logger().info( - f'Validation point "{calib_point["name"]}" at pixel ({u},{v}): ' - f'Calculated ({x_w:.3f}, {y_w:.3f}, {z_w:.3f}), ' - f'Expected ({expected[0]:.3f}, {expected[1]:.3f}, {expected[2]:.3f}), ' - f'Error: dx={error_x:.3f}m, dy={error_y:.3f}m, dz={error_z:.3f}m, dist={error_dist:.3f}m' - ) - break - - self.get_logger().info(f'Pixel ({u},{v}) -> World ({x_w:.3f}m, {y_w:.3f}m, {z_w:.3f}m)') - return resp + # ── Debug image ─────────────────────────────────────────────────────── + self._publish_debug(u, v, p_hybrid, Z) + + return response + + def _publish_debug(self, u: int, v: int, p: np.ndarray, depth: float): + if self.latest_rgb is None: + return + img = self.latest_rgb.copy() + cv2.drawMarker(img, (u, v), (0, 0, 255), cv2.MARKER_CROSS, 40, 3) + cv2.circle(img, (u, v), 20, (0, 255, 0), 2) + cv2.putText(img, f'Pixel: ({u},{v})', + (u + 25, v - 45), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2) + cv2.putText(img, f'World: ({p[0]:.3f},{p[1]:.3f},{p[2]:.3f})m', + (u + 25, v - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2) + cv2.putText(img, f'Depth: {depth:.3f}m', + (u + 25, v + 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) + uc, vc = self._image_center + cv2.drawMarker(img, (uc, vc), (255, 255, 0), cv2.MARKER_TILTED_CROSS, 30, 2) + try: + msg = self.bridge.cv2_to_imgmsg(img, encoding='bgr8') + msg.header.stamp = self.get_clock().now().to_msg() + self.debug_pub.publish(msg) + except Exception as e: + self.get_logger().error(f'Debug publish error: {e}') def main(args=None): From 4d479260bc6f4386c83e8c267c2226d00a5529b2 Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Fri, 10 Apr 2026 01:58:56 +0700 Subject: [PATCH 10/16] fix: change empirical equation to pixel to real world --- vision/pixel_to_real.py | 639 ++++++++++++++++++++++------------ vision/pixel_to_real_world.py | 90 ++++- 2 files changed, 497 insertions(+), 232 deletions(-) diff --git a/vision/pixel_to_real.py b/vision/pixel_to_real.py index 3347431..dc5d75a 100644 --- a/vision/pixel_to_real.py +++ b/vision/pixel_to_real.py @@ -8,48 +8,76 @@ int32 u # pixel column (x-coordinate, positive to the right) int32 v # pixel row (y-coordinate, positive downward) Response: - float64 x # world X coordinate (m) - float64 y # world Y coordinate (m, positive forward/away from camera) + float64 x # world X coordinate (m, positive to the right) + float64 y # world Y coordinate (m, positive forward/away from camera) - with -0.722m offset applied float64 z # world Z coordinate (m, height above table/ground) -Calibration Methods (Hybrid): - METHOD 1 – Intrinsic: - Pinhole back-projection using camera intrinsics + known camera pose (R, t). - t_base_cam is the tunable offset; R_base_cam captures camera orientation. - - METHOD 2 – Empirical: - Least-squares fit on 22 calibration points (5 measures × 2 colours × TR/BL), - 2026-03-12, camera position [-0.0361, 0.5303, 0.6458], height 0.67 m: - x = -0.56859693 + 0.00130317*u + 0.00002114*v (RMS 12.1 mm) - y = 0.98011251 - 0.00002728*u - 0.00133088*v (RMS 16.9 mm) - 2D RMSE = 20.8 mm - - HYBRID: - Gaussian blend – empirical weighted higher near image centre (calibration - region), intrinsic weighted higher toward edges. - -Tuning: - • t_base_cam – translate camera origin in base frame (x, y, z) - • Empirical coefficients – refit from new calibration measurements - +Calibration Data: + - Origin (0, 0, 0.8) in world: pixel (320, 500) - bottom center of image + - Green box at world (0.5, 0, 0.8): pixel (320, 240) + - Gear part at world (0.83, 0.03, 0.8): pixel (305, 95) + - Drill at world (0.571546, -0.240961, 0.831898): pixel (466, 160.5) + - Monkey wrench at world (0.623673, 0.372909, 0.806652): pixel (150, 200) + - Table depth: 0.8 m from camera + - Coordinate mapping: + * u increases right → y DECREASES (u represents -y direction) + * v increases down → x DECREASES (v represents -x direction) + * x in world increases upward in image (opposite of v) + * y in world increases leftward in image (opposite of u) + +Setup: + 1. Build the custom_interfaces package: + cd ~/final_project_ws + colcon build --packages-select custom_interfaces + source install/setup.bash + + 2. Build the vision package: + colcon build --packages-select vision --symlink-install + source install/setup.bash + + 3. Start the pixel_to_real service node: + ros2 run vision pixel_to_real_service + +Usage: # Example: green box at pixel (320, 240) → world (0.5, 0.0, 0.8) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 320, v: 240}" - + # Example: gear part at pixel (305, 95) → world (0.83, 0.03, 0.8) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 305, v: 95}" - + # Example: drill at pixel (466, 160.5) → world (0.571546, -0.240961, 0.831898) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 466, v: 160}" - + # Example: monkey wrench at pixel (150, 200) → world (0.623673, 0.372909, 0.806652) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 150, v: 200}" - + # Example: origin at pixel (320, 500) → world (0.0, 0.0, 0.8) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 320, v: 500}" - + # View debug visualization: rqt_image_view /pixel_to_real/debug_image +How it works: + Uses calibration points to compute a linear pixel-to-world transformation. + Reads depth from /camera/depth/image_raw to determine z-coordinate. + Shows debug visualization with pixel location and computed world coordinates. +""" + + +""" +# linear approx. formula +# Given pixel coordinates (u, v) and depth d from sensor: + +# Step 1: Calculate pixel offset from origin +du = u - 320 # Origin u-coordinate +dv = v - 500 # Origin v-coordinate + +# Step 2: Apply scaling and sign conversion +x = -dv * 0.001923 # scale_x = 0.5/260 ≈ 0.001923 m/pixel +y = -du * 0.002 # scale_y = 0.03/15 = 0.002 m/pixel + +# Step 3: Convert depth to z-coordinate +z = 0.8 + (depth_reference - d) # depth_reference set on first call """ @@ -57,240 +85,413 @@ import rclpy from rclpy.node import Node from sensor_msgs.msg import Image, CameraInfo +from geometry_msgs.msg import PointStamped from cv_bridge import CvBridge import numpy as np import math +import tf2_ros import cv2 +import pyrealsense2 as rs -from image_geometry import PinholeCameraModel - +# Replace with your actual service type. The repo often imports custom interfaces +# from `custom_interfaces.srv`. Adjust the import if your package name differs. try: from custom_interfaces.srv import PixelToReal except Exception: + # The import may fail in editors; keep the name for when the package is built. PixelToReal = None class PixelToRealServer(Node): - - def __init__(self): + def __init__(self, + rgb_topic: str = '/camera/image_raw', + depth_topic: str = '/camera/depth/image_raw', + info_topic: str = '/camera/camera_info', + default_target_frame: str = 'world'): super().__init__('pixel_to_real_server') - self.declare_parameter('real_hardware', True) + # Parameter toggles simulated vs hardware camera topics + self.declare_parameter('real_hardware', False) self.real_hardware = bool(self.get_parameter('real_hardware').value) + # RealSense-specific variables for hardware mode + self.rs_pipeline = None + self.rs_align = None + self.rs_intrinsics = None + if self.real_hardware: - depth_topic = '/camera/depth/image_rect_raw' - info_topic = '/camera/color/camera_info' - rgb_topic = '/camera/color/image_raw' + self.rgb_topic = '/camera/color/image_raw' + self.depth_topic = '/camera/depth/image_rect_raw' + self.camera_info_topic = 'camera/color/camera_info' + self.color_encoding = 'passthrough' + self.depth_32_encoding = 'passthrough' + self.depth_16_encoding = 'passthrough' + + # Note: We DO NOT initialize RealSense pipeline here! + # The camera is published by a separate node (e.g., realsense-ros) + # We only subscribe to the published topics + self.get_logger().info('Real hardware mode: subscribing to RealSense topics') + self.get_logger().info(f' RGB: {self.rgb_topic}') + self.get_logger().info(f' Depth: {self.depth_topic}') + self.get_logger().info(f' Camera Info: {self.camera_info_topic}') else: - depth_topic = '/camera/depth/image_raw' - info_topic = '/camera/camera_info' - rgb_topic = '/camera/image_raw' - - self.bridge = CvBridge() - self.cam_model = PinholeCameraModel() - - self.latest_rgb = None + self.rgb_topic = rgb_topic or '/camera/image_raw' + self.depth_topic = depth_topic or '/camera/depth/image_raw' + self.camera_info_topic = info_topic or '/camera/camera_info' + self.color_encoding = 'bgr8' + self.depth_32_encoding = '32FC1' + self.depth_16_encoding = '16UC1' + + self.bridge = CvBridge() + self.latest_rgb = None + self.latest_rgb_header = None self.latest_depth = None - self.camera_ready = False + self.latest_depth_header = None + self.camera_info = None + self.default_target_frame = default_target_frame - self.depth_sub = self.create_subscription(Image, depth_topic, self.depth_cb, 10) - self.info_sub = self.create_subscription(CameraInfo, info_topic, self.info_cb, 10) - self.rgb_sub = self.create_subscription(Image, rgb_topic, self.rgb_cb, 10) + self.rgb_sub = self.create_subscription(Image, self.rgb_topic, self.rgb_cb, 10) + self.depth_sub = self.create_subscription(Image, self.depth_topic, self.depth_cb, 10) + self.info_sub = self.create_subscription(CameraInfo, self.camera_info_topic, self.info_cb, 10) + # Publisher for debug visualization self.debug_pub = self.create_publisher(Image, '/pixel_to_real/debug_image', 10) - # ── Depth scale (16UC1 → metres) ───────────────────────────────────── - self.depth_scale = 0.001 # mm → m - - # ── METHOD 1: Camera pose in base_link ─────────────────────────────── - # t_base_cam : camera origin expressed in base frame ← TUNE THIS - # R_base_cam : rotation that maps camera axes to base axes - # (camera Z forward → base -Z, camera Y down → base -Y) - self.t_base_cam = np.array([-0.146, 0.635, 0.8]) # <-- adjust here - - self.R_base_cam = np.array([ - [ 1.0, 0.0, 0.0], - [ 0.0, -1.0, 0.0], - [ 0.0, 0.0, -1.0], - ]) - - # ── METHOD 2: Empirical linear model ───────────────────────────────── - # Least-squares fit on 22 calibration points (5 measures × 2 colours - # × TR/BL), 2026-03-12. Model: x = a0 + a1*u + a2*v - # y = b0 + b1*u + b2*v - # 2D RMSE = 20.8 mm (x: 12.1 mm, y: 16.9 mm) - self._emp_ax = np.array([-0.56859693, +0.00130317, +0.00002114]) # [1,u,v]→x - self._emp_ay = np.array([+0.98011251, -0.00002728, -0.00133088]) # [1,u,v]→y - self._emp_z = -0.002 # table height assumption (m) - - # ── Hybrid blend parameters ─────────────────────────────────────────── - # Empirical is most accurate near image centre (calibration region). - # Gaussian decay controls how quickly we trust intrinsics at the edges. - self._image_center = (320, 240) # (u_c, v_c) - self._blend_sigma_frac = 0.6 # fraction of max diagonal for σ - + # Calibration data: pixel coordinates (u, v) -> world coordinates (x, y, z) + # Using calibration points: + # Origin: (u=320, v=500) -> (x=0, y=0, z=0.8) + # Green box: (u=320, v=240) -> (x=0.5, y=0, z=0.8) + # Gear: (u=305, v=95) -> (x=0.83, y=0.03, z=0.8) + + # Coordinate system mapping: + # u increases right -> y DECREASES (u represents -y direction) + # v increases down -> x DECREASES (v represents -x direction) + + self.u_origin = 320 # u=320 corresponds to y=0 + self.v_origin = 500 # v=500 corresponds to x=0 + + # Calculate scaling factors from calibration points: + # Green box: du=0, dv=-260 pixels -> dx=0.5, dy=0 meters + # Gear: du=-15, dv=-405 pixels -> dx=0.83, dy=0.03 meters + + # From green box (vertical movement in image): + # dv = 240 - 500 = -260 pixels (up in image) + # dx = 0.5 - 0 = 0.5 meters (positive x, which is up) + # scale_x = 0.5 / 260 = 0.00192 m/pixel + + # From gear (horizontal AND vertical movement): + # du = 305 - 320 = -15 pixels (left in image) + # dy = 0.03 - 0 = 0.03 meters (positive y, which is left) + # scale_y = 0.03 / 15 = 0.002 m/pixel + + dv_green = 240 - 500 # -260 pixels (up in image) + dx_green = 0.5 - 0 # 0.5 meters (positive x in world) + + du_gear = 305 - 320 # -15 pixels (left in image) + dy_gear = 0.03 - 0 # 0.03 meters (positive y in world) + + self.scale_x = abs(dx_green / dv_green) # 0.5/260 = 0.00192 m/pixel + self.scale_y = abs(dy_gear / du_gear) # 0.03/15 = 0.002 m/pixel + + # Depth calibration: Store reference depth for z-coordinate conversion + # At calibration points, z should be 0.8m (table height) + # We'll measure the actual depth sensor reading and use it as reference + self.z_table = 0.8 # World z-coordinate of table surface + self.depth_reference = None # Will be set from first depth reading at calibration point + + self.get_logger().info(f'Pixel-to-world calibration: scale_x={self.scale_x:.6f} m/px, scale_y={self.scale_y:.6f} m/px') + self.get_logger().info(f'Origin: pixel({self.u_origin}, {self.v_origin}) -> world(0, 0, 0.8)') + self.get_logger().info(f'Coordinate mapping: u right=-y, v down=-x') + self.get_logger().info(f'Calibrated from green box at (320,240)->(0.5,0,0.8) and gear at (305,95)->(0.83,0.03,0.8)') + self.get_logger().info(f'Validation point: drill at (466,160)->(0.572,-0.241,0.832)') + self.get_logger().info(f'Validation point: monkey_wrench at (150,200)->(0.624,0.373,0.807)') + self.get_logger().info(f'Depth calibration: Call service at (320,240) to set depth reference for z=0.8m') + self.get_logger().info(f'RGB topic: {self.rgb_topic}') + self.get_logger().info(f'Depth topic: {self.depth_topic}') + self.get_logger().info(f'Camera info topic: {self.camera_info_topic}') + self.get_logger().info(f'real_hardware parameter: {self.real_hardware}') + + # Store calibration validation points for accuracy checking + self.validation_points = [ + {"name": "green_box", "pixel": (320, 240), "world": (0.5, 0.0, 0.8)}, + {"name": "gear", "pixel": (305, 95), "world": (0.83, 0.03, 0.8)}, + {"name": "drill", "pixel": (466, 160), "world": (0.571546, -0.240961, 0.831898)}, + {"name": "monkey_wrench", "pixel": (150, 200), "world": (0.623673, 0.372909, 0.806652)}, + {"name": "origin", "pixel": (320, 500), "world": (0.0, 0.0, 0.8)} + ] + + # TF + self.tf_buffer = tf2_ros.Buffer() + self.tf_listener = tf2_ros.TransformListener(self.tf_buffer, self) + + # Create service if PixelToReal is not None: - self.srv = self.create_service( - PixelToReal, '/pixel_to_real', self.handle_pixel_to_real - ) - self.get_logger().info('Service /pixel_to_real ready') + self.srv = self.create_service(PixelToReal, 'pixel_to_real', self.handle_pixel_to_real) + self.get_logger().info('Service /pixel_to_real ready (custom_interfaces.srv.PixelToReal)') + self.get_logger().info('Usage: ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 320, v: 240}"') else: - self.get_logger().error( - 'PixelToReal srv not found – build custom_interfaces first.' - ) + self.get_logger().error('PixelToReal srv type not found!') + self.get_logger().error('Build custom_interfaces: colcon build --packages-select custom_interfaces') raise RuntimeError('custom_interfaces.srv.PixelToReal not available') - self.get_logger().info( - f't_base_cam = {self.t_base_cam.tolist()} ' - f'real_hardware={self.real_hardware}' - ) - - # ── Callbacks ───────────────────────────────────────────────────────────── - def rgb_cb(self, msg: Image): + """Store the latest RGB image for pixel coordinate validation.""" try: - self.latest_rgb = self.bridge.imgmsg_to_cv2(msg, 'bgr8') + rgb_img = self.bridge.imgmsg_to_cv2(msg, desired_encoding=self.color_encoding) + self.latest_rgb = rgb_img + self.latest_rgb_header = msg.header except Exception as e: - self.get_logger().error(f'RGB convert error: {e}') + self.get_logger().error(f"Failed to convert RGB image: {e}") def depth_cb(self, msg: Image): + # Support 32FC1 and 16UC1 encodings; convert to float32 meters. try: - if msg.encoding in ('16UC1', '16U'): - raw = self.bridge.imgmsg_to_cv2(msg, 'passthrough') - self.latest_depth = raw.astype(np.float32) * self.depth_scale - elif msg.encoding in ('32FC1', '32F'): - self.latest_depth = self.bridge.imgmsg_to_cv2( - msg, 'passthrough' - ).astype(np.float32) + if msg.encoding == '32FC1' or msg.encoding == '32F': + depth_img = self.bridge.imgmsg_to_cv2(msg, desired_encoding=self.depth_32_encoding) + depth = depth_img.astype(np.float32) + elif msg.encoding == '16UC1' or msg.encoding == '16U': + d16 = self.bridge.imgmsg_to_cv2(msg, desired_encoding=self.depth_16_encoding) + depth = d16.astype(np.float32) / 1000.0 # assume mm -> m else: - raw = self.bridge.imgmsg_to_cv2(msg, 'passthrough') - self.latest_depth = raw.astype(np.float32) + # Try a generic conversion to float32 + depth_img = self.bridge.imgmsg_to_cv2(msg, desired_encoding='passthrough') + depth = depth_img.astype(np.float32) + + self.latest_depth = depth + self.latest_depth_header = msg.header except Exception as e: - self.get_logger().error(f'Depth convert error: {e}') + self.get_logger().error(f"Failed to convert depth image: {e}") def info_cb(self, msg: CameraInfo): - self.cam_model.fromCameraInfo(msg) - self.camera_ready = True + self.camera_info = msg - # ── Depth reading ────────────────────────────────────────────────────────── + def pixel_to_world_calibrated(self, u: int, v: int, depth_m: float): + """Convert pixel (u,v) to world coordinates (x,y,z) using calibration. - def get_robust_depth(self, u: int, v: int, window: int = 5) -> float | None: - """Median depth over a small window; returns None if no valid pixels.""" - if self.latest_depth is None: - return None - h, w = self.latest_depth.shape - half = window // 2 - u0, u1 = max(u - half, 0), min(u + half + 1, w) - v0, v1 = max(v - half, 0), min(v + half + 1, h) - patch = self.latest_depth[v0:v1, u0:u1] - valid = patch[np.isfinite(patch) & (patch > 0.0)] - return float(np.median(valid)) if len(valid) > 0 else None - - # ── Conversion methods ───────────────────────────────────────────────────── - - def _intrinsic_estimate(self, u: int, v: int, Z: float) -> np.ndarray: - """METHOD 1: pinhole back-projection + rigid-body transform to base. - - ray = cam_model.projectPixelTo3dRay((u, v)) [unit vector in cam frame] - p_cam = ray * Z [scale to depth] - p_base = R @ p_cam + t [transform to base frame] + Coordinate system: + - u increases right -> y DECREASES (u represents -y direction) + - v increases down -> x DECREASES (v represents -x direction) + - Origin at pixel (320, 500) = world (0, 0, 0.8) + - Depth is inversely related to z: small depth = high z (near camera, far from ground) + + Args: + u: pixel column (positive right) + v: pixel row (positive down) + depth_m: depth in meters from camera (from depth sensor) + + Returns: + (x, y, z) in world coordinates (meters) """ - if not self.camera_ready: - # Fallback: hardcoded D435i intrinsics - fx, fy, cx, cy = 615.0, 615.0, 320.0, 240.0 - ray = np.array([(u - cx) / fx, (v - cy) / fy, 1.0]) + # Calculate pixel offset from origin + du = u - self.u_origin # positive = right in image + dv = v - self.v_origin # positive = down in image + + # Apply transformation based on coordinate mapping: + # v down (-dv up) -> x increases: x = -dv * scale_x + # u right (-du left) -> y increases: y = -du * scale_y + x = -dv * self.scale_x # Up in image -> positive x + y = -du * self.scale_y # Left in image -> positive y + + # Apply y-offset for robot reference frame + # y = y - 0.5442 # Shift y by -0.5442 meters + + # Convert depth to z-coordinate + # Depth is inversely related to z: smaller depth = further from ground = higher z + # At table (z=0.8), we need to calibrate based on actual depth reading + # If depth_reference is set, use it; otherwise estimate from depth + if self.real_hardware: + z = depth_m # Direct mapping for hardware + elif self.depth_reference is not None: + # z = z_table + (depth_reference - depth) + # When depth < depth_reference (closer to camera), z increases + # When depth > depth_reference (further from camera), z decreases + z = self.z_table + (self.depth_reference - depth_m) else: - ray = np.array(self.cam_model.projectPixelTo3dRay((u, v))) - - p_cam = ray * Z - p_base = self.R_base_cam @ p_cam + self.t_base_cam - return p_base - - def _empirical_estimate(self, u: int, v: int) -> np.ndarray: - """METHOD 2: simple linear model fitted from calibration data.""" - feat = np.array([1.0, float(u), float(v)]) - x = float(np.dot(self._emp_ax, feat)) - y = float(np.dot(self._emp_ay, feat)) - return np.array([x, y, self._emp_z]) - - def _hybrid_estimate( - self, - u: int, v: int, - p_intrinsic: np.ndarray, - p_empirical: np.ndarray, - ) -> tuple[np.ndarray, float, float]: - """HYBRID: Gaussian-weighted blend. - - Empirical weight is highest at the image centre (calibration region) - and decays towards the edges; intrinsic picks up the slack. + # First call: assume this is close to table depth, set reference + self.depth_reference = depth_m + self.get_logger().info(f'Set depth reference: {self.depth_reference:.3f}m at z={self.z_table}m') + z = self.z_table + + return (x, y, z) + + def read_depth_at(self, u: float, v: float, max_search: int = 5): + """Read depth with bilinear interpolation; if invalid, search a median window. + Returns depth in meters. Assumes table is at 0.8m depth. + + Handles invalid depth (NaN/zero/inf) common on reflective surfaces or sensor noise + by searching a 5-pixel neighborhood and taking the median of valid depth values. + + The depth sensor returns distance from camera. We assume the table surface + is at 0.8m from the camera, which should be the largest/most common depth value. """ - uc, vc = self._image_center - dist = math.sqrt((u - uc) ** 2 + (v - vc) ** 2) - sigma = math.sqrt(uc ** 2 + vc ** 2) * self._blend_sigma_frac # ~σ in pixels - - w_emp = math.exp(-dist / sigma) - w_int = 1.0 - w_emp - - p_hybrid = w_emp * p_empirical + w_int * p_intrinsic - return p_hybrid, w_emp, w_int - - # ── Service handler ──────────────────────────────────────────────────────── - - def handle_pixel_to_real(self, request, response): - u = int(request.u) - v = int(request.v) - - # ── Depth ───────────────────────────────────────────────────────────── - Z = self.get_robust_depth(u, v) - if Z is None: - self.get_logger().warn(f'No valid depth at ({u},{v}), using 0.8 m fallback.') - Z = 0.8 # fallback to estimated table depth - - # ── Compute both estimates ──────────────────────────────────────────── - p_int = self._intrinsic_estimate(u, v, Z) - p_emp = self._empirical_estimate(u, v) - - # ── Blend ───────────────────────────────────────────────────────────── - p_hybrid, w_emp, w_int = self._hybrid_estimate(u, v, p_int, p_emp) - - response.x = float(p_hybrid[0]) - response.y = float(p_hybrid[1]) - response.z = float(p_hybrid[2]) - - self.get_logger().info( - f'Pixel ({u},{v}) depth={Z:.3f}m | ' - f'Intrinsic=[{p_int[0]:.4f},{p_int[1]:.4f},{p_int[2]:.4f}] ' - f'Empirical=[{p_emp[0]:.4f},{p_emp[1]:.4f}] ' - f'Hybrid=[{p_hybrid[0]:.4f},{p_hybrid[1]:.4f},{p_hybrid[2]:.4f}] ' - f'(w_emp={w_emp:.3f} w_int={w_int:.3f})' - ) - - # ── Debug image ─────────────────────────────────────────────────────── - self._publish_debug(u, v, p_hybrid, Z) - - return response - - def _publish_debug(self, u: int, v: int, p: np.ndarray, depth: float): - if self.latest_rgb is None: - return - img = self.latest_rgb.copy() - cv2.drawMarker(img, (u, v), (0, 0, 255), cv2.MARKER_CROSS, 40, 3) - cv2.circle(img, (u, v), 20, (0, 255, 0), 2) - cv2.putText(img, f'Pixel: ({u},{v})', - (u + 25, v - 45), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2) - cv2.putText(img, f'World: ({p[0]:.3f},{p[1]:.3f},{p[2]:.3f})m', - (u + 25, v - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2) - cv2.putText(img, f'Depth: {depth:.3f}m', - (u + 25, v + 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) - uc, vc = self._image_center - cv2.drawMarker(img, (uc, vc), (255, 255, 0), cv2.MARKER_TILTED_CROSS, 30, 2) - try: - msg = self.bridge.cv2_to_imgmsg(img, encoding='bgr8') - msg.header.stamp = self.get_clock().now().to_msg() - self.debug_pub.publish(msg) - except Exception as e: - self.get_logger().error(f'Debug publish error: {e}') + if self.latest_depth is None: + self.get_logger().warn('No depth image available, using default table depth 0.8m') + return 0.8 # Default table depth + + depth = self.latest_depth + h, w = depth.shape[:2] + if not (0 <= int(v) < h and 0 <= int(u) < w): + return 0.8 # Default if out of bounds + + def bilinear(u_, v_): + x0 = int(math.floor(u_)); x1 = min(x0 + 1, w - 1) + y0 = int(math.floor(v_)); y1 = min(y0 + 1, h - 1) + wa = (x1 - u_) * (y1 - v_) + wb = (u_ - x0) * (y1 - v_) + wc = (x1 - u_) * (v_ - y0) + wd = (u_ - x0) * (v_ - y0) + d00 = float(depth[y0, x0]); d10 = float(depth[y0, x1]) + d01 = float(depth[y1, x0]); d11 = float(depth[y1, x1]) + d = wa * d00 + wb * d10 + wc * d01 + wd * d11 + if np.isnan(d) or d <= 0.0 or np.isinf(d): + return None + return float(d) + + + #ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 220, v: 220}" + # Use topic-based depth reading for both hardware and simulation + if self.latest_depth is not None: + d = self.latest_depth[int(v), int(u)] + self.get_logger().info(f'Read depth at ({u},{v}): {d:.3f}m from topic') + else: + self.get_logger().warn('No depth data available, using default 0.8m') + return 0.8 + + + + + + + + + # Fallback: collect valid depths in neighborhood and take median + valid_depths = [] + for du in range(-max_search, max_search + 1): + for dv in range(-max_search, max_search + 1): + uu = u + du + vv = v + dv + if 0 <= int(uu) < w and 0 <= int(vv) < h: + d_val = bilinear(uu, vv) + if d_val is not None: + valid_depths.append(d_val) + + if len(valid_depths) == 0: + self.get_logger().warn(f'No valid depth found within {max_search}px of ({u:.1f},{v:.1f}), using table depth 0.8m') + return 0.8 # Default table depth + + # Use median to be robust against outliers + median_depth = float(np.median(valid_depths)) + return median_depth + + def backproject(self, u: float, v: float, d: float): + # Use camera_info intrinsics + K = self.camera_info.k + fx = K[0]; fy = K[4]; cx = K[2]; cy = K[5] + x_c = (u - cx) * d / fx + y_c = (v - cy) * d / fy + z_c = d + return np.array([x_c, y_c, z_c], dtype=np.float64) + + def handle_pixel_to_real(self, req, resp): + """Handle pixel to real coordinate conversion service request.""" + # Get pixel coordinates from request (int32 fields) + u = int(req.u) + v = int(req.v) + + # Get depth at this pixel (default to table depth 0.8m) + depth_m = self.read_depth_at(float(u), float(v)) + + # Convert pixel to world coordinates using calibration + x_w, y_w, z_w = self.pixel_to_world_calibrated(u, v, depth_m) + + # DEBUG: Visualize the pixel location and world coordinates on the image + if self.latest_rgb is not None: + debug_img = self.latest_rgb.copy() + + # Draw a large crosshair at the requested pixel + cv2.drawMarker(debug_img, (u, v), (0, 0, 255), cv2.MARKER_CROSS, 40, 3) + + # Draw a circle around it + cv2.circle(debug_img, (u, v), 20, (0, 255, 0), 2) + + # Add text label with pixel coordinates + label_pixel = f"Pixel: ({u}, {v})" + cv2.putText(debug_img, label_pixel, (u + 25, v - 45), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2) + + # Add text label with world coordinates + label_world = f"World: ({x_w:.3f}, {y_w:.3f}, {z_w:.3f})m" + cv2.putText(debug_img, label_world, (u + 25, v - 20), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2) + + # Add depth label + label_depth = f"Depth: {depth_m:.3f}m" + cv2.putText(debug_img, label_depth, (u + 25, v + 5), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) + + # Draw origin marker at (320, 500) + origin_u, origin_v = self.u_origin, self.v_origin + if 0 <= origin_u < debug_img.shape[1] and 0 <= origin_v < debug_img.shape[0]: + cv2.drawMarker(debug_img, (origin_u, origin_v), (255, 255, 0), + cv2.MARKER_TILTED_CROSS, 30, 2) + cv2.putText(debug_img, "Origin (0,0)", (origin_u + 10, origin_v - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2) + + # Draw coordinate axes for reference + cv2.putText(debug_img, "u -> (right)", (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) + cv2.putText(debug_img, "v", (10, 55), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) + cv2.putText(debug_img, "|", (10, 68), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) + cv2.putText(debug_img, "v (down)", (10, 85), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) + + # Publish debug image + try: + debug_msg = self.bridge.cv2_to_imgmsg(debug_img, encoding='bgr8') + debug_msg.header.stamp = self.get_clock().now().to_msg() + debug_msg.header.frame_id = 'camera_link' + self.debug_pub.publish(debug_msg) + self.get_logger().info(f'Published debug image for pixel ({u}, {v})') + except Exception as e: + self.get_logger().error(f'Failed to publish debug image: {e}') + + # Validate pixel coordinates against RGB image dimensions + if self.latest_rgb is not None: + rgb_h, rgb_w = self.latest_rgb.shape[:2] + if not (0 <= u < rgb_w and 0 <= v < rgb_h): + self.get_logger().error(f'Pixel ({u},{v}) out of RGB image bounds ({rgb_w}x{rgb_h})') + # Return zero coordinates for out of bounds + resp.x = 0.0 + resp.y = 0.0 + resp.z = 0.0 + return resp + + # Fill response (float64 fields) + resp.x = float(x_w) + resp.y = float(y_w) + resp.z = float(z_w) + + # Check if this is a known calibration point and log the error + for calib_point in self.validation_points: + if abs(u - calib_point["pixel"][0]) <= 1 and abs(v - calib_point["pixel"][1]) <= 1: + expected = calib_point["world"] + error_x = x_w - expected[0] + error_y = y_w - expected[1] + error_z = z_w - expected[2] + error_dist = np.sqrt(error_x**2 + error_y**2 + error_z**2) + + self.get_logger().info( + f'Validation point "{calib_point["name"]}" at pixel ({u},{v}): ' + f'Calculated ({x_w:.3f}, {y_w:.3f}, {z_w:.3f}), ' + f'Expected ({expected[0]:.3f}, {expected[1]:.3f}, {expected[2]:.3f}), ' + f'Error: dx={error_x:.3f}m, dy={error_y:.3f}m, dz={error_z:.3f}m, dist={error_dist:.3f}m' + ) + break + + self.get_logger().info(f'Pixel ({u},{v}) -> World ({x_w:.3f}m, {y_w:.3f}m, {z_w:.3f}m)') + return resp def main(args=None): diff --git a/vision/pixel_to_real_world.py b/vision/pixel_to_real_world.py index 10dab48..747b10f 100644 --- a/vision/pixel_to_real_world.py +++ b/vision/pixel_to_real_world.py @@ -8,6 +8,7 @@ from cv_bridge import CvBridge from image_geometry import PinholeCameraModel import numpy as np +import math class PixelToRealNode(Node): @@ -64,7 +65,22 @@ def __init__(self): [0.0, 0.0, -1.0] ]) - self.get_logger().info("PixelToReal with TF ready.") + # ---- METHOD 2: Empirical linear model ---- + # Least-squares fit on 22 calibration points (5 measures × 2 colours + # × TR/BL), 2026-03-12. Model: x = a0 + a1*u + a2*v + # y = b0 + b1*u + b2*v + # 2D RMSE = 20.8 mm (x: 12.1 mm, y: 16.9 mm) + self._emp_ax = np.array([-0.56859693, +0.00130317, +0.00002114]) # [1,u,v]→x + self._emp_ay = np.array([+0.98011251, -0.00002728, -0.00133088]) # [1,u,v]→y + self._emp_z = 0 # table height (m) + + # ---- Hybrid blend parameters ---- + # Empirical is most accurate near image centre (calibration region). + # Gaussian decay controls how quickly we trust intrinsics at the edges. + self._image_center = (320, 240) # (u_c, v_c) + self._blend_sigma_frac = 0.6 # fraction of max diagonal for σ + + self.get_logger().info("PixelToReal with empirical + intrinsic hybrid ready.") # -------------------------------------------------- @@ -97,6 +113,49 @@ def get_robust_depth(self, u, v, window_size=5): # -------------------------------------------------- + def _intrinsic_estimate(self, u: int, v: int, Z: float) -> np.ndarray: + """METHOD 1: pinhole back-projection + rigid-body transform to base.""" + ray = self.cam_model.projectPixelTo3dRay((u, v)) + p_cam = np.array(ray) * Z + p_base = self.R_base_cam @ p_cam + self.t_base_cam + return p_base + + def _empirical_estimate(self, u: int, v: int) -> np.ndarray: + """METHOD 2: simple linear model fitted from calibration data. + + Least-squares fit on 22 calibration points, 2026-03-12: + x = -0.56859693 + 0.00130317*u + 0.00002114*v (RMS 12.1 mm) + y = 0.98011251 - 0.00002728*u - 0.00133088*v (RMS 16.9 mm) + 2D RMSE = 20.8 mm + """ + feat = np.array([1.0, float(u), float(v)]) + x = float(np.dot(self._emp_ax, feat)) + y = float(np.dot(self._emp_ay, feat)) + return np.array([x, y, self._emp_z]) + + def _hybrid_estimate( + self, + u: int, v: int, + p_intrinsic: np.ndarray, + p_empirical: np.ndarray, + ) -> tuple: + """HYBRID: Gaussian-weighted blend. + + Empirical weight is highest at the image centre (calibration region) + and decays towards the edges; intrinsic picks up the slack. + """ + uc, vc = self._image_center + dist = math.sqrt((u - uc) ** 2 + (v - vc) ** 2) + sigma = math.sqrt(uc ** 2 + vc ** 2) * self._blend_sigma_frac + + w_emp = math.exp(-dist / sigma) + w_int = 1.0 - w_emp + + p_hybrid = w_emp * p_empirical + w_int * p_intrinsic + return p_hybrid, w_emp, w_int + + # -------------------------------------------------- + def handle_pixel_to_real(self, request, response): if self.depth_image is None or not self.camera_ready: @@ -113,21 +172,26 @@ def handle_pixel_to_real(self, request, response): Z = float(depth_raw) * self.depth_scale - # Camera frame 3D - ray = self.cam_model.projectPixelTo3dRay((u, v)) + # METHOD 1: intrinsic back-projection + p_int = self._intrinsic_estimate(u, v, Z) - X_cam = ray[0] * Z - Y_cam = ray[1] * Z - Z_cam = Z + # METHOD 2: empirical linear model + p_emp = self._empirical_estimate(u, v) - p_cam = np.array([X_cam, Y_cam, Z_cam]) + # HYBRID: Gaussian-weighted blend + p_hybrid, w_emp, w_int = self._hybrid_estimate(u, v, p_int, p_emp) - # Transform to base frame - p_base = self.R_base_cam @ p_cam + self.t_base_cam + response.x = float(p_hybrid[0]) + response.y = float(p_hybrid[1]) + response.z = float(p_hybrid[2]) - response.x = float(p_base[0]) - response.y = float(p_base[1]) - response.z = float(p_base[2]) + self.get_logger().info( + f'Pixel ({u},{v}) depth={Z:.3f}m | ' + f'Intrinsic=[{p_int[0]:.4f},{p_int[1]:.4f},{p_int[2]:.4f}] ' + f'Empirical=[{p_emp[0]:.4f},{p_emp[1]:.4f}] ' + f'Hybrid=[{p_hybrid[0]:.4f},{p_hybrid[1]:.4f},{p_hybrid[2]:.4f}] ' + f'(w_emp={w_emp:.3f} w_int={w_int:.3f})' + ) return response @@ -141,4 +205,4 @@ def main(args=None): if __name__ == "__main__": - main() \ No newline at end of file + main() From f646fff6eb03316da233eb2501007e040aed8c3a Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Fri, 10 Apr 2026 02:15:32 +0700 Subject: [PATCH 11/16] feat: add benchmark branch --- vision/pixel_to_real.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision/pixel_to_real.py b/vision/pixel_to_real.py index dc5d75a..21eec56 100644 --- a/vision/pixel_to_real.py +++ b/vision/pixel_to_real.py @@ -508,3 +508,4 @@ def main(args=None): if __name__ == '__main__': main() + From e33e24483000b19fc594cab5a912d74cb424c05f Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Fri, 10 Apr 2026 05:59:23 +0700 Subject: [PATCH 12/16] feat: develop a working dashboard for benchmark SAM --- launch/vision.launch.py | 9 + setup.py | 2 +- vision/benchmark_dashboard.py | 64 ++-- vision/simple_sam_detector.py | 106 ++++- vision_scripts/test_dashboard_data_flow.py | 426 +++++++++++++++++++++ 5 files changed, 582 insertions(+), 25 deletions(-) create mode 100644 vision_scripts/test_dashboard_data_flow.py diff --git a/launch/vision.launch.py b/launch/vision.launch.py index 21011ad..6d6119c 100644 --- a/launch/vision.launch.py +++ b/launch/vision.launch.py @@ -42,4 +42,13 @@ def generate_launch_description(): ) ld.add_action(scene_understanding_node) + benchmark_dashboard_node = Node( + package='vision', + executable='benchmark_dashboard', + name='benchmark_dashboard_node', + output='screen', + emulate_tty=True + ) + ld.add_action(benchmark_dashboard_node) + return ld diff --git a/setup.py b/setup.py index 569de07..c221795 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ 'find_boundary_service = vision.find_boundary_service_node:main', 'find_multi_object = vision.find_multi_object:main', # 'calibrate = calibration.calibrate:main', - # 'benchmark_dashboard = vision.benchmark_dashboard:main', + 'benchmark_dashboard = vision.benchmark_dashboard:main', # 'show_rgb_image = vision.show_rgb_image_node:main', # 'show_depth_image = vision.show_depth_image_node:main', # 'sam_vision_pipeline = vision.sam_vision_pipeline_node:main', diff --git a/vision/benchmark_dashboard.py b/vision/benchmark_dashboard.py index 552365c..beca947 100644 --- a/vision/benchmark_dashboard.py +++ b/vision/benchmark_dashboard.py @@ -25,6 +25,7 @@ from std_srvs.srv import Trigger from std_msgs.msg import String import json +import numpy as np import time from datetime import datetime from http.server import HTTPServer, SimpleHTTPRequestHandler @@ -45,6 +46,18 @@ print("Custom interfaces not available. Build custom_interfaces package first.") +class _ROSJSONEncoder(json.JSONEncoder): + """JSON encoder that handles ROS/numpy integer and float types.""" + def default(self, obj): + if isinstance(obj, (np.integer,)): + return int(obj) + if isinstance(obj, (np.floating,)): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return super().default(obj) + + class BenchmarkDashboard(Node): """ Benchmark Dashboard - Monitors vision services and provides web interface @@ -152,20 +165,20 @@ def sam_detections_callback(self, msg): 'frame_id': msg.header.frame_id, 'obj_id': detection.object_id, 'bbox': { - 'x1': detection.bbox[0], - 'y1': detection.bbox[1], - 'x2': detection.bbox[2], - 'y2': detection.bbox[3] + 'x1': int(detection.bbox[0]), + 'y1': int(detection.bbox[1]), + 'x2': int(detection.bbox[2]), + 'y2': int(detection.bbox[3]) }, 'center': { - 'u': detection.center[0], - 'v': detection.center[1] + 'u': int(detection.center[0]), + 'v': int(detection.center[1]) }, 'confidence': float(detection.confidence), - 'area': detection.area, + 'area': int(detection.area), 'distance_cm': float(detection.distance_cm), 'iou_score': float(detection.iou_score), - 'is_stable': detection.is_stable_detection, + 'is_stable': bool(detection.is_stable_detection), 'ap_iou_threshold': 0.5 if detection.is_stable_detection else 0.0 } @@ -193,10 +206,10 @@ def scene_understanding_callback(self, msg): scene_data = { 'timestamp': timestamp, 'scene_id': msg.scene_id, - 'total_objects': msg.total_objects, + 'total_objects': int(msg.total_objects), 'relations': relations, 'object_labels': list(msg.object_labels), - 'object_counts': list(msg.object_counts), + 'object_counts': [int(c) for c in msg.object_counts], 'graspable_objects': msg.graspable_objects, 'average_distance_cm': float(msg.average_distance_cm), 'scene_description': msg.scene_description, @@ -276,10 +289,10 @@ def add_grasp_detection_record(self, grasp_pose): 'timestamp': timestamp, 'test_id': len(self.data['grasp_detections']) + 1, 'object_id': grasp_pose.object_id, - 'bbox': list(grasp_pose.bbox), + 'bbox': [int(v) for v in grasp_pose.bbox], 'pixel_position': { - 'u': grasp_pose.bbox[0] + (grasp_pose.bbox[2] - grasp_pose.bbox[0]) // 2, - 'v': grasp_pose.bbox[1] + (grasp_pose.bbox[3] - grasp_pose.bbox[1]) // 2 + 'u': int(grasp_pose.bbox[0]) + (int(grasp_pose.bbox[2]) - int(grasp_pose.bbox[0])) // 2, + 'v': int(grasp_pose.bbox[1]) + (int(grasp_pose.bbox[3]) - int(grasp_pose.bbox[1])) // 2 }, 'world_position': { 'x': float(grasp_pose.position.x), @@ -302,7 +315,7 @@ def add_grasp_detection_record(self, grasp_pose): def publish_data(self): """Publish benchmark data to topic""" msg = String() - msg.data = json.dumps(self.data) + msg.data = json.dumps(self.data, cls=_ROSJSONEncoder) self.data_publisher.publish(msg) def clear_data_callback(self, request, response): @@ -327,18 +340,25 @@ def clear_data_callback(self, request, response): def start_http_server(self): """Start HTTP server for dashboard""" - # Get the path to the HTML file + # Resolve dashboard HTML using ament share directory (correct for installed packages) + try: + from ament_index_python.packages import get_package_share_directory + share_dir = get_package_share_directory('vision') + html_dir = Path(share_dir) / 'dashboard' + except Exception: + html_dir = Path(__file__).parent.parent / 'dashboard' + + # History JSON is written by simple_sam_detector next to the installed module package_path = Path(__file__).parent.parent - html_dir = package_path / 'dashboard' - - # Create dashboard directory if it doesn't exist - html_dir.mkdir(exist_ok=True) - - # Create HTML file if it doesn't exist + + self.get_logger().info(f'Dashboard HTML dir: {html_dir}') + + # Fallback: create default HTML if share dir has no index.html html_file = html_dir / 'index.html' if not html_file.exists(): self.get_logger().warn(f'Dashboard HTML not found at {html_file}') self.get_logger().warn('Creating basic HTML file...') + html_dir.mkdir(exist_ok=True) self.create_default_html(html_file) # Custom handler that serves files from html_dir and provides data endpoint @@ -354,7 +374,7 @@ def do_GET(self): self.send_header('Content-type', 'application/json') self.send_header('Access-Control-Allow-Origin', '*') self.end_headers() - data_json = json.dumps(self.dashboard_node.data) + data_json = json.dumps(self.dashboard_node.data, cls=_ROSJSONEncoder) self.wfile.write(data_json.encode()) elif self.path == '/api/run-history': # Serve vision_runs_history.json from workspace root diff --git a/vision/simple_sam_detector.py b/vision/simple_sam_detector.py index 418fad7..b303d75 100755 --- a/vision/simple_sam_detector.py +++ b/vision/simple_sam_detector.py @@ -485,7 +485,15 @@ def detect_objects_callback(self, request, response): f"conf={confidences[i]:.2f}, dist={distances_cm[i]:.1f}cm" ) self.get_logger().info("=" * 80) - + + # Save results to vision_runs_history.json for real-time dashboard + end_inner = time.perf_counter() + self._save_detect_objects_run( + object_ids, bbox_x1, bbox_y1, bbox_x2, bbox_y2, + confidences, distances_cm, iou_scores, is_stable_array, + clip_classifications, end_inner - start + ) + except Exception as e: response.success = False response.total_detections = 0 @@ -509,9 +517,103 @@ def detect_objects_callback(self, request, response): end = time.perf_counter() latency = end - start self.get_logger().info(f"Total detection latency: {latency:.6f} seconds") - + return response + def _save_detect_objects_run(self, object_ids, bbox_x1, bbox_y1, bbox_x2, bbox_y2, + confidences, distances_cm, iou_scores, is_stable_array, + clip_classifications, latency_s): + """Save /vision/detect_objects results to vision_runs_history.json for the dashboard.""" + try: + from pathlib import Path + + # Same path used by benchmark_dashboard and collect_and_export + package_path = Path(__file__).parent.parent + history_file = package_path / 'vision_runs_history.json' + + # Load existing history + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + data = json.load(f) + if isinstance(data, list): + history = data + except Exception: + pass + + last_run_no = history[-1]['meta']['run_no'] if history else 0 + run_no = last_run_no + 1 + + num_dets = len(self.latest_detections) + total_sam_conf = 0.0 + objects = [] + for idx in range(num_dets): + det = self.latest_detections[idx] + clip_info = clip_classifications.get(idx, {}) + sam_conf = float(det.get('confidence', 0.0)) + total_sam_conf += sam_conf + objects.append({ + 'object_id': object_ids[idx] if idx < len(object_ids) else f'object_{idx}', + 'label': clip_info.get('label', '') if clip_info else '', + 'bbox_x1': bbox_x1[idx] if idx < len(bbox_x1) else 0, + 'bbox_y1': bbox_y1[idx] if idx < len(bbox_y1) else 0, + 'bbox_x2': bbox_x2[idx] if idx < len(bbox_x2) else 0, + 'bbox_y2': bbox_y2[idx] if idx < len(bbox_y2) else 0, + 'sam_confidence': round(sam_conf, 4), + 'clip_confidence': round(float(clip_info.get('confidence', 0.0)), 4) if clip_info else '', + 'distance_cm': distances_cm[idx] if idx < len(distances_cm) else '', + 'iou_score': iou_scores[idx] if idx < len(iou_scores) else '', + 'is_stable': is_stable_array[idx] if idx < len(is_stable_array) else '', + 'has_grasp': False, + 'grasp': {}, + 'obb_angle_deg': '', 'obb_theta_rad': '', + 'obb_width_px': '', 'obb_height_px': '', + 'obb_center_u': '', 'obb_center_v': '', + }) + + avg_sam_conf = total_sam_conf / num_dets if num_dets > 0 else 0.0 + avg_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0 + stability_rate = sum(1 for s in is_stable_array if s) / len(is_stable_array) if is_stable_array else 0.0 + + run = { + 'meta': { + 'run_no': run_no, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'latency_s': round(latency_s, 3), + 'source': 'detect_objects', + }, + 'sam': { + 'success': True, + 'latency_s': round(latency_s, 3), + 'total_detections': num_dets, + 'avg_confidence': round(avg_sam_conf, 4), + 'average_iou': round(avg_iou, 4), + 'stability_rate': round(stability_rate, 4), + }, + 'clip': { + 'success': bool(clip_classifications), + 'latency_s': 0.0, + 'filtered_regions': len(clip_classifications), + }, + 'scene': {'success': False, 'latency_s': 0.0}, + 'obb': {'success': False, 'latency_s': 0.0}, + 'objects': objects, + 'relations': [], + 'grasps': [], + } + + history.append(run) + history = history[-20:] # keep last 20 runs + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + + self.get_logger().info(f"Saved run #{run_no} to {history_file} ({num_dets} objects)") + + except Exception as e: + self.get_logger().warn(f"Failed to save run history: {e}") + def _startup_announce(self): """One-shot announce to make sure global topics appear after node startup.""" try: diff --git a/vision_scripts/test_dashboard_data_flow.py b/vision_scripts/test_dashboard_data_flow.py new file mode 100644 index 0000000..dfc754b --- /dev/null +++ b/vision_scripts/test_dashboard_data_flow.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +""" +Unit test: verify detect_objects → vision_runs_history.json → dashboard data flow. + +Runs WITHOUT ROS2. Tests: + 1. JSON file is written to the correct path + 2. JSON schema matches exactly what dashboard HTML expects + 3. /api/data live path (benchmark_dashboard SAM topic callback schema) + 4. /api/run-history path (run history table schema) + +Usage: + python3 vision_scripts/test_dashboard_data_flow.py +""" + +import json +import os +import sys +import tempfile +import traceback +from pathlib import Path +from datetime import datetime + +# ── Path setup ──────────────────────────────────────────────────────────────── +SCRIPT_DIR = Path(__file__).resolve().parent +PACKAGE_ROOT = SCRIPT_DIR.parent # src/vision/ +HISTORY_FILE = PACKAGE_ROOT / "vision_runs_history.json" + +PASS = "\033[92m[PASS]\033[0m" +FAIL = "\033[91m[FAIL]\033[0m" +INFO = "\033[94m[INFO]\033[0m" + +failures = [] + +def check(name, condition, detail=""): + if condition: + print(f" {PASS} {name}") + else: + print(f" {FAIL} {name}" + (f" — {detail}" if detail else "")) + failures.append(name) + + +# ── Fake detection data (mirrors what detect_objects_callback produces) ──────── + +FAKE_DETECTIONS = [ + { + "id": "object_0", + "class_name": "object_0", + "confidence": 0.82, + "bbox": [100, 150, 300, 400], + "center": [200, 275], + "area": 45000, + "distance_cm": 55.3, + "iou_score": 0.71, + "is_stable": True, + "matched_prev_id": "", + "mask": None, # would be np array in real code + }, + { + "id": "object_1", + "class_name": "object_1", + "confidence": 0.64, + "bbox": [400, 200, 580, 450], + "center": [490, 325], + "area": 34200, + "distance_cm": 72.1, + "iou_score": 0.0, + "is_stable": False, + "matched_prev_id": "", + "mask": None, + }, +] + +FAKE_CLIP = { + 0: {"label": "cup", "confidence": 0.91, "bbox": [100, 150, 300, 400]}, + 1: {"label": "bottle", "confidence": 0.78, "bbox": [400, 200, 580, 450]}, +} + + +# ── Reproduce _save_detect_objects_run logic verbatim ───────────────────────── + +def simulate_save(latest_detections, clip_classifications, latency_s, history_file): + """Exact copy of _save_detect_objects_run from simple_sam_detector.py.""" + history = [] + if history_file.exists(): + try: + with open(history_file, "r") as f: + data = json.load(f) + if isinstance(data, list): + history = data + except Exception: + pass + + last_run_no = history[-1]["meta"]["run_no"] if history else 0 + run_no = last_run_no + 1 + + # Build parallel arrays (same logic as detect_objects_callback) + object_ids = [] + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = [], [], [], [] + confidences = [] + distances_cm = [] + iou_scores = [] + is_stable_arr = [] + + for idx, det in enumerate(latest_detections): + clip_info = clip_classifications.get(idx) + if clip_info: + object_ids.append(f"{clip_info['label']}_{idx}") + confidences.append(float(clip_info["confidence"])) + else: + object_ids.append(det["id"]) + confidences.append(float(det["confidence"])) + + bbox = det["bbox"] + bbox_x1.append(bbox[0]); bbox_y1.append(bbox[1]) + bbox_x2.append(bbox[2]); bbox_y2.append(bbox[3]) + distances_cm.append(float(det.get("distance_cm", -1.0))) + iou_scores.append(float(det.get("iou_score", 0.0))) + is_stable_arr.append(bool(det.get("is_stable", False))) + + num_dets = len(latest_detections) + total_sam_conf = 0.0 + objects = [] + for idx in range(num_dets): + det = latest_detections[idx] + clip_info = clip_classifications.get(idx, {}) + sam_conf = float(det.get("confidence", 0.0)) + total_sam_conf += sam_conf + objects.append({ + "object_id": object_ids[idx] if idx < len(object_ids) else f"object_{idx}", + "label": clip_info.get("label", "") if clip_info else "", + "bbox_x1": bbox_x1[idx] if idx < len(bbox_x1) else 0, + "bbox_y1": bbox_y1[idx] if idx < len(bbox_y1) else 0, + "bbox_x2": bbox_x2[idx] if idx < len(bbox_x2) else 0, + "bbox_y2": bbox_y2[idx] if idx < len(bbox_y2) else 0, + "sam_confidence": round(sam_conf, 4), + "clip_confidence": round(float(clip_info.get("confidence", 0.0)), 4) if clip_info else "", + "distance_cm": distances_cm[idx] if idx < len(distances_cm) else "", + "iou_score": iou_scores[idx] if idx < len(iou_scores) else "", + "is_stable": is_stable_arr[idx] if idx < len(is_stable_arr) else "", + "has_grasp": False, + "grasp": {}, + "obb_angle_deg": "", "obb_theta_rad": "", + "obb_width_px": "", "obb_height_px": "", + "obb_center_u": "", "obb_center_v": "", + }) + + avg_sam_conf = total_sam_conf / num_dets if num_dets > 0 else 0.0 + avg_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0 + stability_rate = sum(1 for s in is_stable_arr if s) / len(is_stable_arr) if is_stable_arr else 0.0 + + run = { + "meta": { + "run_no": run_no, + "timestamp": datetime.utcnow().isoformat() + "Z", + "latency_s": round(latency_s, 3), + "source": "detect_objects", + }, + "sam": { + "success": True, + "latency_s": round(latency_s, 3), + "total_detections": num_dets, + "avg_confidence": round(avg_sam_conf, 4), + "average_iou": round(avg_iou, 4), + "stability_rate": round(stability_rate, 4), + }, + "clip": { + "success": bool(clip_classifications), + "latency_s": 0.0, + "filtered_regions": len(clip_classifications), + }, + "scene": {"success": False, "latency_s": 0.0}, + "obb": {"success": False, "latency_s": 0.0}, + "objects": objects, + "relations": [], + "grasps": [], + } + + history.append(run) + history = history[-20:] + + with open(history_file, "w") as f: + json.dump(history, f, indent=2) + + return run, history + + +# ── Test 1: File is written ──────────────────────────────────────────────────── + +def test_file_written(): + print("\n── Test 1: File write ───────────────────────────────────────────") + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tf: + tmp = Path(tf.name) + try: + run, history = simulate_save(FAKE_DETECTIONS, FAKE_CLIP, 1.23, tmp) + check("File created", tmp.exists()) + check("File is valid JSON", True) # would have thrown above + check("History is list", isinstance(history, list)) + check("History has 1 entry", len(history) == 1) + check("run_no == 1", run["meta"]["run_no"] == 1) + print(f" {INFO} Written to: {tmp}") + return tmp, run, history + except Exception as e: + check("No exception during save", False, str(e)) + traceback.print_exc() + return None, None, None + finally: + pass # keep file for next test + + +# ── Test 2: JSON schema matches /api/run-history dashboard expectations ──────── + +def test_run_history_schema(run): + print("\n── Test 2: /api/run-history schema ─────────────────────────────") + if run is None: + print(" Skipped (previous test failed)") + return + + meta = run.get("meta", {}) + sam = run.get("sam", {}) + clip = run.get("clip", {}) + objs = run.get("objects", []) + + # meta fields (dashboard uses: run_no, timestamp, latency_s) + check("meta.run_no present", "run_no" in meta) + check("meta.timestamp present", "timestamp" in meta) + check("meta.latency_s present", "latency_s" in meta) + + # sam fields (dashboard uses: total_detections, avg_confidence, average_iou, stability_rate, latency_s, success) + for field in ["total_detections", "avg_confidence", "average_iou", "stability_rate", "latency_s", "success"]: + check(f"sam.{field} present", field in sam) + + # clip fields (dashboard uses: filtered_regions, latency_s, success) + for field in ["filtered_regions", "latency_s", "success"]: + check(f"clip.{field} present", field in clip) + + # objects array + check("objects is list", isinstance(objs, list)) + check("objects not empty", len(objs) == len(FAKE_DETECTIONS)) + + # per-object fields (dashboard uses all of these) + obj_fields = [ + "object_id", "label", "bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", + "sam_confidence", "clip_confidence", "distance_cm", + "iou_score", "is_stable", "has_grasp", "grasp", + "obb_angle_deg", "obb_theta_rad", + ] + if objs: + for field in obj_fields: + check(f"objects[0].{field} present", field in objs[0]) + + # value spot-checks + check("sam.total_detections == 2", sam.get("total_detections") == 2) + check("clip.filtered_regions == 2", clip.get("filtered_regions") == 2) + check("object label 'cup' in object_id", "cup" in objs[0].get("object_id", "") or objs[0].get("label") == "cup") + check("bbox values correct", objs[0]["bbox_x1"] == 100 and objs[0]["bbox_y2"] == 400) + check("sam_confidence is float", isinstance(objs[0]["sam_confidence"], float)) + check("clip_confidence is float", isinstance(objs[0]["clip_confidence"], float)) + check("iou_score is float", isinstance(objs[0]["iou_score"], float)) + check("is_stable is bool", isinstance(objs[0]["is_stable"], bool)) + + +# ── Test 3: /api/data schema (benchmark_dashboard SAM topic callback) ────────── + +def test_api_data_schema(): + print("\n── Test 3: /api/data schema (sam_detections_callback output) ───") + # Replicate what benchmark_dashboard.sam_detections_callback produces + timestamp = datetime.now().isoformat() + sam_data_records = [] + for det in FAKE_DETECTIONS: + bbox = det["bbox"] + center = det["center"] + record = { + "timestamp": timestamp, + "frame_id": "camera_link", + "obj_id": det["id"], + "bbox": {"x1": bbox[0], "y1": bbox[1], "x2": bbox[2], "y2": bbox[3]}, + "center": {"u": center[0], "v": center[1]}, + "confidence": float(det["confidence"]), + "area": det["area"], + "distance_cm": float(det["distance_cm"]), + "iou_score": float(det["iou_score"]), + "is_stable": bool(det["is_stable"]), + "ap_iou_threshold": 0.5 if det["is_stable"] else 0.0, + } + sam_data_records.append(record) + + # Check each field that updateSAMDetections() accesses in the HTML + r = sam_data_records[0] + for field in ["obj_id", "bbox", "center", "confidence", "iou_score", "is_stable", "distance_cm", "timestamp"]: + check(f"sam record has '{field}'", field in r) + + check("bbox has x1/y1/x2/y2", all(k in r["bbox"] for k in ["x1","y1","x2","y2"])) + check("center has u/v", all(k in r["center"] for k in ["u","v"])) + check("confidence is numeric", isinstance(r["confidence"], float)) + check("distance_cm is numeric", isinstance(r["distance_cm"], float)) + + # Simulate full /api/data structure + api_data = { + "pixel_to_real": [], + "sam_detections": sam_data_records, + "clip_classifications":[], + "grasp_detections": [], + "scene_understanding": [], + "metadata": {"start_time": timestamp, "total_calls": len(sam_data_records)}, + } + # Check updateDashboard() fields + for key in ["pixel_to_real", "sam_detections", "clip_classifications", + "grasp_detections", "scene_understanding", "metadata"]: + check(f"/api/data has '{key}'", key in api_data) + check("metadata.total_calls present", "total_calls" in api_data["metadata"]) + + +# ── Test 4: Incremental append (run_no increments) ─────────────────────────── + +def test_incremental_append(tmp_file): + print("\n── Test 4: Incremental append (run numbers) ─────────────────────") + if tmp_file is None: + print(" Skipped") + return + simulate_save(FAKE_DETECTIONS, FAKE_CLIP, 0.9, tmp_file) + simulate_save(FAKE_DETECTIONS, FAKE_CLIP, 1.1, tmp_file) + + with open(tmp_file) as f: + history = json.load(f) + + check("3 runs in history", len(history) == 3) + check("run_no increments: 1,2,3", [r["meta"]["run_no"] for r in history] == [1, 2, 3]) + check("latest run_no is 3", history[-1]["meta"]["run_no"] == 3) + check("source == 'detect_objects'", all(r["meta"].get("source") == "detect_objects" for r in history)) + + +# ── Test 5: Actual HISTORY_FILE path resolution ─────────────────────────────── + +def test_path_resolution(): + print("\n── Test 5: Path resolution ──────────────────────────────────────") + # This is how simple_sam_detector.py resolves the path: + # Path(__file__).parent.parent where __file__ is vision/simple_sam_detector.py + sam_file = PACKAGE_ROOT / "vision" / "simple_sam_detector.py" + resolved = sam_file.parent.parent / "vision_runs_history.json" + + # And how benchmark_dashboard.py resolves it: + dash_file = PACKAGE_ROOT / "vision" / "benchmark_dashboard.py" + dash_resolved = dash_file.parent.parent / "vision_runs_history.json" + + check("simple_sam_detector path resolves correctly", + str(resolved) == str(HISTORY_FILE), + f"got {resolved}") + check("benchmark_dashboard path resolves correctly", + str(dash_resolved) == str(HISTORY_FILE), + f"got {dash_resolved}") + check("Both paths are identical", + resolved == dash_resolved) + check("simple_sam_detector.py exists", + sam_file.exists(), + f"missing: {sam_file}") + check("benchmark_dashboard.py exists", + dash_file.exists(), + f"missing: {dash_file}") + print(f" {INFO} History file path: {HISTORY_FILE}") + + +# ── Test 6: Dashboard HTML endpoints exist ──────────────────────────────────── + +def test_html_endpoints(): + print("\n── Test 6: Dashboard HTML references ────────────────────────────") + html_file = PACKAGE_ROOT / "dashboard" / "index.html" + check("index.html exists", html_file.exists()) + if html_file.exists(): + content = html_file.read_text() + check("fetches /api/data", "/api/data" in content) + check("fetches /api/run-history", "/api/run-history" in content) + check("polls every 2s", "2000" in content) + check("runHistoryBody table", "runHistoryBody" in content) + check("samBody table", "samBody" in content) + check("latestObjectsBody table", "latestObjectsBody" in content) + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main(): + print("=" * 65) + print(" Vision Dashboard Data-Flow Unit Test") + print("=" * 65) + + tmp_file, run, history = test_file_written() + test_run_history_schema(run) + test_api_data_schema() + test_incremental_append(tmp_file) + test_path_resolution() + test_html_endpoints() + + # Clean up temp file + if tmp_file and tmp_file.exists(): + tmp_file.unlink() + + print("\n" + "=" * 65) + if failures: + print(f"\033[91m {len(failures)} FAILED:\033[0m") + for f in failures: + print(f" • {f}") + print("=" * 65) + sys.exit(1) + else: + print(f"\033[92m All tests passed.\033[0m") + print("=" * 65) + + # ── Show current history file state ─────────────────────────────────────── + print(f"\n{INFO} Checking actual history file: {HISTORY_FILE}") + if HISTORY_FILE.exists(): + with open(HISTORY_FILE) as f: + data = json.load(f) + print(f" {PASS} File exists — {len(data)} run(s) stored") + if data: + latest = data[-1] + print(f" {INFO} Latest run: #{latest['meta']['run_no']} " + f"ts={latest['meta']['timestamp']} " + f"objects={latest['sam']['total_detections']}") + else: + print(f" \033[93m[WARN]\033[0m {HISTORY_FILE} does not exist yet.") + print(f" The file is created when you call:") + print(f" ros2 service call /vision/detect_objects custom_interfaces/srv/DetectObjects") + print(f" Make sure simple_sam_detector is running and a camera frame is available.") + + +if __name__ == "__main__": + main() From b328bbb0589f3db8482db30c36e4c5461653c007 Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Sat, 11 Apr 2026 04:51:13 +0700 Subject: [PATCH 13/16] feat: benchmark sam and clip and find object service --- dashboard/find_object.html | 817 +++++++++++++++++++++++++++++ dashboard/index.html | 30 +- vision/benchmark_dashboard.py | 346 +++++++++++- vision/clip_classifier.py | 375 ++++++------- vision/find_object_service_node.py | 45 +- vision/obb_angle_service_node.py | 256 ++++----- vision/simple_sam_detector.py | 614 +++++++++++++++------- 7 files changed, 1926 insertions(+), 557 deletions(-) create mode 100644 dashboard/find_object.html diff --git a/dashboard/find_object.html b/dashboard/find_object.html new file mode 100644 index 0000000..907379a --- /dev/null +++ b/dashboard/find_object.html @@ -0,0 +1,817 @@ + + + + + + Find Object — Vision Benchmark + + + + + + + + +
+ +
+
+
Find Object — Service Benchmark
+
/find_object · SAM + CLIP + OBB + Pixel-to-Real · Human-in-the-Loop accuracy
+
+
+
+
+ Auto-refresh 3 s +
+ +
+
+ +
+ + +
+
+
0
+
Total Calls
+
+
+
0
+
Succeeded
+
+
+
0
+
Failed
+
+
+
+
HITL Accuracy
+
+
+
0
+
Pending Review
+
+
+
+
Avg Confidence
+
+
+ + +
+
+
+
+
Trigger /find_object Service
+
ros2 service call /find_object custom_interfaces/srv/FindObjectReal "{label: '...'}"
+
+
+
+
+
+
Object Label
+ +
+ +
+
Ready — enter a label and press Call Service.
+ + + +
+
+ + +
+
+
+
+
Human-in-the-Loop Verification
+
Review latest detection result — mark as correct or incorrect
+
+ Waiting for call… +
+
+
+ No result to review yet. Trigger the service above first. +
+ +
+
+ + +
+
+
+
+
Call History
+
find_object_history.json · newest first · max 50 calls
+
+ 0 Calls +
+
+ + + + + + + + + + + + + + + + + + + + +
#TimestampLabelObject IDConfidenceBounding BoxX (m)Y (m)Z (m)θ (deg)Service OKHITL VerdictVerifyMessage
No calls yet — trigger the service above.
+
+
+ + +
+
+
+
+
Accuracy Report
+
Per-label breakdown from human-in-the-loop verdicts
+
+
+
+ + + + + + + + + + + + + + + +
LabelTotal CallsSucceededFailedReviewedCorrectIncorrectHITL AccuracyAvg Confidence
No data yet.
+
+
+ +
+ + +
+ + + + diff --git a/dashboard/index.html b/dashboard/index.html index a3bd85d..241e612 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -511,6 +511,10 @@
Object Explorer
+ @@ -539,7 +543,7 @@
0
Stored Runs
-
Rolling last 20
+
Rolling last 50
0
@@ -580,7 +584,7 @@
Vision Run Summary
-
vision_runs_history.json · newest first · max 20 runs
+
vision_runs_history.json · newest first · max 50 runs
0 Runs
@@ -1049,9 +1053,31 @@ updateCLIPClassifications(data.clip_classifications); updateGraspDetections(data.grasp_detections); updateSceneUnderstanding(data.scene_understanding); + + // OBB live records from /api/data (synced from vision_runs_history.json) + if (data.obb_detections && data.obb_detections.length) { + updateOBBFromLiveData(data.obb_detections); + } + document.getElementById('lastUpdated').textContent = new Date().toLocaleTimeString(); } + function updateOBBFromLiveData(records) { + // Populate OBB section metrics from the live /api/data feed + // (the table itself is rendered by updateOBBSection from run history) + if (!records.length) return; + const angles = records.map(r => r.angle_deg).filter(a => a != null); + if (!angles.length) return; + const avg = angles.reduce((s, v) => s + v, 0) / angles.length; + document.getElementById('obbAvgAngle').textContent = avg.toFixed(1) + '°'; + document.getElementById('obbMinAngle').textContent = Math.min(...angles).toFixed(1) + '°'; + document.getElementById('obbMaxAngle').textContent = Math.max(...angles).toFixed(1) + '°'; + document.getElementById('obbTotalObjects').textContent = records.length; + document.getElementById('obbBadge').textContent = `${records.length} Objects`; + setNav('nav-obb', records.length); + document.getElementById('obbCount').textContent = records.length; + } + // ── Pixel to Real ───────────────────────────────────────────────────────── function updatePixelToReal(records) { document.getElementById('pixelBadge').textContent = `${records.length} Records`; diff --git a/vision/benchmark_dashboard.py b/vision/benchmark_dashboard.py index beca947..67ef805 100644 --- a/vision/benchmark_dashboard.py +++ b/vision/benchmark_dashboard.py @@ -75,6 +75,7 @@ def __init__(self): 'sam_detections': [], 'clip_classifications': [], 'grasp_detections': [], + 'obb_detections': [], 'scene_understanding': [], 'metadata': { 'start_time': datetime.now().isoformat(), @@ -147,7 +148,10 @@ def __init__(self): # Timer to publish data periodically self.publish_timer = self.create_timer(1.0, self.publish_data) - + + # Timer to sync CLIP/OBB/GraspNet/Pixel-to-Real from vision_runs_history.json + self.sync_timer = self.create_timer(2.0, self.sync_from_run_history) + # Start HTTP server in separate thread self.start_http_server() @@ -312,12 +316,125 @@ def add_grasp_detection_record(self, grasp_pose): self.data['metadata']['total_calls'] += 1 + def sync_from_run_history(self): + """Sync CLIP, GraspNet, OBB, and Pixel-to-Real data from vision_runs_history.json + so the /api/data endpoint (CLIP / GraspNet / Pixel-to-Real sections) stays populated + even when those dedicated service nodes are not running.""" + try: + from pathlib import Path + history_file = Path(__file__).parent.parent / 'vision_runs_history.json' + if not history_file.exists(): + return + + with open(history_file, 'r') as f: + runs = json.load(f) + if not isinstance(runs, list) or not runs: + return + + timestamp = datetime.now().isoformat() + + # Rebuild lists from all stored runs (most recent first for display) + new_clip = [] + new_grasp = [] + new_pixel = [] + new_obb = [] + + for run in runs: + run_ts = run.get('meta', {}).get('timestamp', timestamp) + run_no = run.get('meta', {}).get('run_no', 0) + objects = run.get('objects', []) + + for obj in objects: + # ── CLIP ────────────────────────────────────────────────── + clip_conf = obj.get('clip_confidence', '') + label = obj.get('label', '') + if clip_conf != '' and label: + new_clip.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'bbox': { + 'x1': obj.get('bbox_x1', 0), + 'y1': obj.get('bbox_y1', 0), + 'x2': obj.get('bbox_x2', 0), + 'y2': obj.get('bbox_y2', 0), + }, + 'label': label, + 'confidence': float(clip_conf), + 'top1_accuracy': float(clip_conf) >= 0.5, + }) + + # ── GraspNet ────────────────────────────────────────────── + grasp = obj.get('grasp', {}) + if obj.get('has_grasp') and grasp: + pix = grasp.get('pixel', {}) + wld = grasp.get('world', {}) + new_grasp.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'object_id': obj.get('object_id', ''), + 'pixel_position': {'u': pix.get('u', 0), 'v': pix.get('v', 0)}, + 'world_position': { + 'x': wld.get('x', 0.0), + 'y': wld.get('y', 0.0), + 'z': wld.get('z', 0.0), + }, + 'quality_score': float(grasp.get('quality_score', 0.0)), + 'grasp_width': float(grasp.get('grasp_width', 0.0)), + 'approach_direction': grasp.get('approach_direction', ''), + 'bbox': [obj.get('bbox_x1', 0), obj.get('bbox_y1', 0), + obj.get('bbox_x2', 0), obj.get('bbox_y2', 0)], + }) + + # ── Pixel-to-Real ───────────────────────────────────────── + world = obj.get('world', {}) + if world and world.get('x') is not None: + new_pixel.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'input': {'u': world.get('u', 0), 'v': world.get('v', 0)}, + 'output': {'x': float(world.get('x', 0.0)), + 'y': float(world.get('y', 0.0)), + 'z': float(world.get('z', 0.0))}, + }) + + # ── OBB ─────────────────────────────────────────────────── + obb_angle = obj.get('obb_angle_deg', '') + if obb_angle != '': + new_obb.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'object_id': obj.get('object_id', ''), + 'label': obj.get('label', ''), + 'angle_deg': float(obb_angle), + 'theta_rad': float(obj.get('obb_theta_rad', 0.0)), + 'width_px': float(obj.get('obb_width_px', 0.0)), + 'height_px': float(obj.get('obb_height_px', 0.0)), + 'center_u': float(obj.get('obb_center_u', 0.0)), + 'center_v': float(obj.get('obb_center_v', 0.0)), + 'sam_confidence': float(obj.get('sam_confidence', 0.0)), + 'bbox': [obj.get('bbox_x1', 0), obj.get('bbox_y1', 0), + obj.get('bbox_x2', 0), obj.get('bbox_y2', 0)], + }) + + # Only update if we got new data (avoids overwriting live topic data with empty) + if new_clip: + self.data['clip_classifications'] = new_clip[-1000:] + if new_grasp: + self.data['grasp_detections'] = new_grasp[-1000:] + if new_pixel: + self.data['pixel_to_real'] = new_pixel[-1000:] + if new_obb: + self.data['obb_detections'] = new_obb[-1000:] + + except Exception as e: + self.get_logger().warn(f'sync_from_run_history failed: {e}') + def publish_data(self): """Publish benchmark data to topic""" msg = String() msg.data = json.dumps(self.data, cls=_ROSJSONEncoder) self.data_publisher.publish(msg) - + def clear_data_callback(self, request, response): """Clear all benchmark data""" self.data = { @@ -325,16 +442,17 @@ def clear_data_callback(self, request, response): 'sam_detections': [], 'clip_classifications': [], 'grasp_detections': [], + 'obb_detections': [], 'scene_understanding': [], 'metadata': { 'start_time': datetime.now().isoformat(), 'total_calls': 0 } } - + response.success = True response.message = "Benchmark data cleared" - + self.get_logger().info('Benchmark data cleared') return response @@ -348,7 +466,7 @@ def start_http_server(self): except Exception: html_dir = Path(__file__).parent.parent / 'dashboard' - # History JSON is written by simple_sam_detector next to the installed module + # History JSON files written next to the installed module package_path = Path(__file__).parent.parent self.get_logger().info(f'Dashboard HTML dir: {html_dir}') @@ -360,47 +478,219 @@ def start_http_server(self): self.get_logger().warn('Creating basic HTML file...') html_dir.mkdir(exist_ok=True) self.create_default_html(html_file) - - # Custom handler that serves files from html_dir and provides data endpoint + + node_logger = self.get_logger() + + # Custom handler that serves files and provides API endpoints class DashboardHandler(SimpleHTTPRequestHandler): def __init__(self, *args, dashboard_node=None, **kwargs): self.dashboard_node = dashboard_node super().__init__(*args, directory=str(html_dir), **kwargs) - + + # ── helpers ──────────────────────────────────────────────────── + def _json_response(self, data, status=200): + body = json.dumps(data, cls=_ROSJSONEncoder).encode() + self.send_response(status) + self.send_header('Content-type', 'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + self.wfile.write(body) + + def _read_json_file(self, path, default): + try: + if path.exists(): + with open(path, 'r') as f: + return json.load(f) + except Exception: + pass + return default + + def _write_json_file(self, path, data): + with open(path, 'w') as f: + json.dump(data, f, indent=2) + + # ── OPTIONS (CORS pre-flight) ─────────────────────────────────── + def do_OPTIONS(self): + self.send_response(204) + self.send_header('Access-Control-Allow-Origin', '*') + self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS') + self.send_header('Access-Control-Allow-Headers', 'Content-Type') + self.end_headers() + + # ── GET ──────────────────────────────────────────────────────── def do_GET(self): if self.path == '/api/data': - # Serve benchmark data as JSON - self.send_response(200) - self.send_header('Content-type', 'application/json') - self.send_header('Access-Control-Allow-Origin', '*') - self.end_headers() - data_json = json.dumps(self.dashboard_node.data, cls=_ROSJSONEncoder) - self.wfile.write(data_json.encode()) + self._json_response(self.dashboard_node.data) + elif self.path == '/api/run-history': - # Serve vision_runs_history.json from workspace root history_file = package_path / 'vision_runs_history.json' - self.send_response(200) - self.send_header('Content-type', 'application/json') - self.send_header('Access-Control-Allow-Origin', '*') - self.end_headers() - if history_file.exists(): - self.wfile.write(history_file.read_bytes()) - else: - self.wfile.write(b'[]') + data = self._read_json_file(history_file, []) + self._json_response(data) + + elif self.path == '/api/find-object-history': + fo_file = package_path / 'find_object_history.json' + data = self._read_json_file(fo_file, []) + self._json_response(data) + else: - # Serve static files super().do_GET() - + + # ── POST ─────────────────────────────────────────────────────── + def do_POST(self): + length = int(self.headers.get('Content-Length', 0)) + body = self.rfile.read(length) if length else b'{}' + try: + payload = json.loads(body) + except Exception: + payload = {} + + if self.path == '/api/find-object': + self._handle_find_object(payload) + + elif self.path == '/api/find-object-verdict': + self._handle_verdict(payload) + + elif self.path == '/api/find-object-clear': + fo_file = package_path / 'find_object_history.json' + self._write_json_file(fo_file, []) + self._json_response({'ok': True}) + + else: + self._json_response({'error': 'unknown endpoint'}, 404) + + # ── /api/find-object ────────────────────────────────────────── + def _handle_find_object(self, payload): + """Call /find_object ROS2 service and persist result.""" + import subprocess, shlex + label = payload.get('label', '').strip() + if not label: + self._json_response({'error': 'label is required'}, 400) + return + + fo_file = package_path / 'find_object_history.json' + history = self._read_json_file(fo_file, []) + call_id = len(history) + 1 + timestamp = datetime.now().isoformat() + + # Call the ROS2 service via subprocess + cmd = ( + f"ros2 service call /find_object " + f"custom_interfaces/srv/FindObjectReal " + f"\"{{label: '{label}'}}\"" + ) + try: + result = subprocess.run( + cmd, shell=True, capture_output=True, text=True, timeout=30 + ) + output = result.stdout + result.stderr + node_logger.info(f'find_object [{label}] stdout: {output[:300]}') + + # Parse the ROS2 CLI response format + entry = _parse_find_object_response(output, label, call_id, timestamp) + except subprocess.TimeoutExpired: + entry = { + 'call_id': call_id, 'timestamp': timestamp, + 'label_searched': label, 'success': False, + 'message': 'Service call timed out (30 s)', + 'object_id': '', 'bbox': [], 'confidence': 0.0, + 'x': 0.0, 'y': 0.0, 'z': 0.0, 'theta': 0.0, + 'verdict': None, + } + except Exception as e: + entry = { + 'call_id': call_id, 'timestamp': timestamp, + 'label_searched': label, 'success': False, + 'message': f'Error: {e}', + 'object_id': '', 'bbox': [], 'confidence': 0.0, + 'x': 0.0, 'y': 0.0, 'z': 0.0, 'theta': 0.0, + 'verdict': None, + } + + history.append(entry) + history = history[-50:] + self._write_json_file(fo_file, history) + self._json_response(entry) + + # ── /api/find-object-verdict ────────────────────────────────── + def _handle_verdict(self, payload): + call_id = payload.get('call_id') + verdict = payload.get('verdict') # true / false + if call_id is None or verdict is None: + self._json_response({'error': 'call_id and verdict required'}, 400) + return + + fo_file = package_path / 'find_object_history.json' + history = self._read_json_file(fo_file, []) + updated = False + for entry in history: + if entry.get('call_id') == call_id: + entry['verdict'] = bool(verdict) + updated = True + break + if updated: + self._write_json_file(fo_file, history) + self._json_response({'ok': True}) + else: + self._json_response({'error': f'call_id {call_id} not found'}, 404) + + def log_message(self, fmt, *args): + pass # suppress HTTP access log noise + + def _parse_find_object_response(output, label, call_id, timestamp): + """Parse ros2 service call CLI output into a dict.""" + import re + entry = { + 'call_id': call_id, 'timestamp': timestamp, + 'label_searched': label, 'success': False, + 'message': output.strip()[:500], + 'object_id': '', 'bbox': [], 'confidence': 0.0, + 'x': 0.0, 'y': 0.0, 'z': 0.0, 'theta': 0.0, + 'verdict': None, + } + try: + # success field + m = re.search(r'success=(\w+)', output) + if m: + entry['success'] = m.group(1).lower() == 'true' + # message field + m = re.search(r"message='([^']*)'", output) + if m: + entry['message'] = m.group(1) + # object_id + m = re.search(r"object_id='([^']*)'", output) + if m: + entry['object_id'] = m.group(1) + # bbox + m = re.search(r'bbox=\[([^\]]*)\]', output) + if m: + try: + entry['bbox'] = [int(x.strip()) for x in m.group(1).split(',') if x.strip()] + except Exception: + pass + # confidence + m = re.search(r'confidence=([\d.]+)', output) + if m: + entry['confidence'] = float(m.group(1)) + # x, y, z, theta + for field in ('x', 'y', 'z', 'theta'): + m = re.search(rf'{field}=([-\d.]+)', output) + if m: + entry[field] = float(m.group(1)) + except Exception as e: + node_logger.warn(f'Response parse error: {e}') + return entry + # Create handler with dashboard_node reference def handler_with_node(*args, **kwargs): return DashboardHandler(*args, dashboard_node=self, **kwargs) - + # Start server in separate thread server = HTTPServer(('0.0.0.0', 8080), handler_with_node) server_thread = threading.Thread(target=server.serve_forever, daemon=True) server_thread.start() - + self.get_logger().info(f'HTTP server started on http://localhost:8080') + self.get_logger().info(f'Find Object page: http://localhost:8080/find_object.html') self.get_logger().info(f'Serving files from: {html_dir}') def create_default_html(self, html_file): diff --git a/vision/clip_classifier.py b/vision/clip_classifier.py index 897094e..b44e029 100755 --- a/vision/clip_classifier.py +++ b/vision/clip_classifier.py @@ -1286,244 +1286,173 @@ def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> D def visualization_callback(self): """Display camera feed with classification in OpenCV window""" - # Use latest_rgb for real-time display, fallback to captured_frame frame_to_display = self.latest_rgb if self.latest_rgb is not None else self.captured_frame - + if frame_to_display is None: - # Show waiting message blank = np.zeros((480, 640, 3), dtype=np.uint8) - cv2.putText( - blank, - f"Waiting to capture frame from {self.rgb_topic}...", - (50, 240), - cv2.FONT_HERSHEY_SIMPLEX, - 0.8, - (255, 255, 255), - 2 - ) + cv2.putText(blank, f"Waiting for {self.rgb_topic}...", (60, 240), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (160, 160, 160), 1) cv2.imshow(self.window_name, blank) cv2.waitKey(1) return - - # Create visualization image from latest live frame + vis_image = frame_to_display.copy() h, w = vis_image.shape[:2] - - # Check if we have region classifications (from SAM auto-classification) + + # ── Helpers ────────────────────────────────────────────────────────── + COLORS = [ + (0, 200, 255), # amber + (80, 255, 80), # lime + (255, 80, 80), # blue + (255, 0, 200), # magenta + (0, 230, 230), # yellow + (200, 80, 255), # violet + (0, 255, 180), # spring green + (255, 180, 0), # sky blue + ] + FOUND_COLOR = (60, 230, 60) # bright green for "found" object + + def semi_rect(img, x1, y1, x2, y2, fill=(15, 15, 15), alpha=0.75): + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(img.shape[1]-1, x2), min(img.shape[0]-1, y2) + if x2 <= x1 or y2 <= y1: + return + roi = img[y1:y2, x1:x2] + img[y1:y2, x1:x2] = cv2.addWeighted(roi, 1-alpha, np.full_like(roi, fill), alpha, 0) + + def corner_bracket(img, x1, y1, x2, y2, color, lw=2): + clen = max(10, int(min(x2-x1, y2-y1) * 0.15)) + for (px, py, dx, dy) in [(x1,y1,1,1),(x2,y1,-1,1),(x1,y2,1,-1),(x2,y2,-1,-1)]: + cv2.line(img, (px, py), (px + dx*clen, py), color, lw) + cv2.line(img, (px, py), (px, py + dy*clen), color, lw) + + def draw_label(img, text, x, y, color, fs=0.36, ft=1, pad=4, accent=3): + """Draw a semi-transparent dark label with a color accent bar.""" + (lw_px, lh_px), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, fs, ft) + bx1, by1 = x, y - lh_px - pad + bx2, by2 = x + accent + pad + lw_px + pad, y + pad + semi_rect(img, bx1, by1, bx2, by2) + cv2.rectangle(img, (max(0,bx1), max(0,by1)), (max(0,bx1)+accent, max(0,by2)), color, -1) + cv2.putText(img, text, (bx1 + accent + pad, y), + cv2.FONT_HERSHEY_SIMPLEX, fs, (235, 235, 235), ft) + + # ── Region classifications (SAM + CLIP pipeline) ────────────────── if self.latest_region_classifications: - # Draw each classified region with bounding box and label - for region in self.latest_region_classifications: - bbox = region['bbox'] - top_pred = region['top_prediction'] + for ri, region in enumerate(self.latest_region_classifications): + bbox = region['bbox'] + top_pred = region['top_prediction'] region_id = region['region_id'] - - # Draw bounding box - cv2.rectangle( - vis_image, - (bbox[0], bbox[1]), - (bbox[2], bbox[3]), - (0, 255, 255), # Yellow for classified regions - 3 - ) - - # Prepare label text - label = f"#{region_id}: {top_pred['label']}" - conf = f"{top_pred['confidence']:.1%}" - - # Calculate label position (above bbox) - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2) - - # Draw label background - cv2.rectangle( - vis_image, - (bbox[0], bbox[1] - label_size[1] - 25), - (bbox[0] + max(label_size[0], 100), bbox[1]), - (0, 255, 255), - -1 - ) - - # Draw label text - cv2.putText( - vis_image, - label, - (bbox[0] + 5, bbox[1] - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), - 2 - ) - - # Draw confidence - cv2.putText( - vis_image, - conf, - (bbox[0] + 5, bbox[1] - 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 0, 0), - 2 - ) - - # Add info text - info_text = f"Classified Regions: {len(self.latest_region_classifications)}" - cv2.putText( - vis_image, - info_text, - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.8, - (0, 255, 255), - 2 - ) - - # Draw full image classification overlay (if available and no regions) + color = COLORS[ri % len(COLORS)] + x1, y1, x2, y2 = bbox + + # Corner-bracket bbox + thin outline + cv2.rectangle(vis_image, (x1, y1), (x2, y2), color, 1) + corner_bracket(vis_image, x1, y1, x2, y2, color, lw=2) + + # Label: "#id class conf%" + conf_pct = top_pred['confidence'] + label = f"#{region_id} {top_pred['label']} {conf_pct:.0%}" + + # Place above bbox, clamp to image top + ly = y1 - 5 + if ly < 16: + ly = y2 + 16 + draw_label(vis_image, label, x1, ly, color) + + # Confidence bar under bbox top edge + bar_w = x2 - x1 + filled = max(2, int(bar_w * conf_pct)) + cv2.rectangle(vis_image, (x1, y1), (x1 + bar_w, y1 + 3), (40, 40, 40), -1) + cv2.rectangle(vis_image, (x1, y1), (x1 + filled, y1 + 3), color, -1) + + # ── Full-image classification (no regions) ──────────────────────── elif self.latest_classification: - top_pred = self.latest_classification['output']['top_prediction'] - all_preds = self.latest_classification['output']['all_predictions'][:5] # Top 5 - - # Draw semi-transparent overlay at bottom - overlay = vis_image.copy() - cv2.rectangle(overlay, (0, h-150), (w, h), (0, 0, 0), -1) - vis_image = cv2.addWeighted(vis_image, 0.7, overlay, 0.3, 0) - - # Draw top prediction (large) - label_text = f"Top: {top_pred['label']}" - conf_text = f"{top_pred['confidence']:.1%}" - - cv2.putText( - vis_image, - label_text, - (20, h-100), - cv2.FONT_HERSHEY_SIMPLEX, - 1.2, - (0, 255, 0), - 3 - ) - - cv2.putText( - vis_image, - conf_text, - (20, h-60), - cv2.FONT_HERSHEY_SIMPLEX, - 1.0, - (0, 255, 0), - 2 - ) - - # Draw top 5 predictions (smaller, on right) - y_offset = h - 120 + top_pred = self.latest_classification['output']['top_prediction'] + all_preds = self.latest_classification['output']['all_predictions'][:5] + + # Bottom panel + panel_h = 110 + semi_rect(vis_image, 0, h - panel_h, w, h, fill=(12, 12, 12), alpha=0.80) + + # Top prediction + top_label = f"{top_pred['label']}" + top_conf = f"{top_pred['confidence']:.1%}" + cv2.putText(vis_image, top_label, (14, h - panel_h + 26), + cv2.FONT_HERSHEY_SIMPLEX, 0.65, (80, 255, 80), 1) + cv2.putText(vis_image, top_conf, (14, h - panel_h + 46), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (160, 255, 160), 1) + + # Confidence bar for top prediction + bar_max = min(w // 2 - 20, 260) + filled = int(bar_max * top_pred['confidence']) + cv2.rectangle(vis_image, (14, h - panel_h + 52), (14 + bar_max, h - panel_h + 56), (50, 50, 50), -1) + cv2.rectangle(vis_image, (14, h - panel_h + 52), (14 + filled, h - panel_h + 56), (80, 255, 80), -1) + + # Top-5 list on the right + col_x = w - 230 + cv2.putText(vis_image, "Top 5", (col_x, h - panel_h + 18), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (140, 140, 140), 1) for i, pred in enumerate(all_preds): - text = f"{i+1}. {pred['label']}: {pred['confidence']:.1%}" - cv2.putText( - vis_image, - text, - (w - 350, y_offset + i*30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (255, 255, 255), - 2 - ) + row_text = f"{i+1}. {pred['label']}" + row_conf = f"{pred['confidence']:.0%}" + row_y = h - panel_h + 34 + i * 16 + row_color = (200, 200, 200) if i > 0 else (80, 255, 80) + cv2.putText(vis_image, row_text, (col_x, row_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.34, row_color, 1) + cv2.putText(vis_image, row_conf, (w - 42, row_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.34, row_color, 1) + else: - # Show "Call service to classify" message - cv2.putText( - vis_image, - "Call /vision/classify_all or /vision/classify_bb", - (20, h-30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (255, 255, 255), - 2 - ) - - # Draw found object highlight (if available) + # Idle hint at bottom + hint = "Run /vision/classify_all or /vision/classify_bb" + semi_rect(vis_image, 0, h - 24, w, h, fill=(12, 12, 12), alpha=0.70) + cv2.putText(vis_image, hint, (8, h - 8), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (130, 130, 130), 1) + + # ── Found-object highlight ──────────────────────────────────────── if self.latest_found_object: found = self.latest_found_object - bbox = found['bbox'] - - # Draw thick green bounding box for found object - cv2.rectangle( - vis_image, - (bbox[0], bbox[1]), - (bbox[2], bbox[3]), - (0, 255, 0), # Green for found object - 5 - ) - - # Prepare label text - label = f"FOUND: {found['label']}" - conf = f"Conf: {found['confidence']:.2f}" - - # Calculate label position (above bbox) - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.9, 2) - - # Draw label background (green) - cv2.rectangle( - vis_image, - (bbox[0], bbox[1] - label_size[1] - 35), - (bbox[0] + max(label_size[0], 150), bbox[1]), - (0, 255, 0), - -1 - ) - - # Draw label text - cv2.putText( - vis_image, - label, - (bbox[0] + 5, bbox[1] - 15), - cv2.FONT_HERSHEY_SIMPLEX, - 0.9, - (0, 0, 0), - 2 - ) - - # Draw confidence - cv2.putText( - vis_image, - conf, - (bbox[0] + 5, bbox[1] - 40), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), - 2 - ) - - # Add corner markers - corner_size = 15 - # Top-left - cv2.line(vis_image, (bbox[0], bbox[1]), (bbox[0] + corner_size, bbox[1]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[0], bbox[1]), (bbox[0], bbox[1] + corner_size), (0, 255, 0), 5) - # Top-right - cv2.line(vis_image, (bbox[2], bbox[1]), (bbox[2] - corner_size, bbox[1]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[2], bbox[1]), (bbox[2], bbox[1] + corner_size), (0, 255, 0), 5) - # Bottom-left - cv2.line(vis_image, (bbox[0], bbox[3]), (bbox[0] + corner_size, bbox[3]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[0], bbox[3]), (bbox[0], bbox[3] - corner_size), (0, 255, 0), 5) - # Bottom-right - cv2.line(vis_image, (bbox[2], bbox[3]), (bbox[2] - corner_size, bbox[3]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[2], bbox[3]), (bbox[2], bbox[3] - corner_size), (0, 255, 0), 5) - - # Add title bar - cv2.putText( - vis_image, - f"CLIP Classifier | Frame: {self.frame_counter}", - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), - 4 - ) - - cv2.putText( - vis_image, - f"CLIP Classifier | Frame: {self.frame_counter}", - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (255, 255, 255), - 2 - ) - - # Show image + bbox = found['bbox'] + x1, y1, x2, y2 = bbox + conf = found['confidence'] + + # Pulsing-style: thin outline + thick corner brackets + cv2.rectangle(vis_image, (x1, y1), (x2, y2), FOUND_COLOR, 1) + clen = max(16, int(min(x2-x1, y2-y1) * 0.18)) + for (px, py, dx, dy) in [(x1,y1,1,1),(x2,y1,-1,1),(x1,y2,1,-1),(x2,y2,-1,-1)]: + cv2.line(vis_image, (px, py), (px + dx*clen, py), FOUND_COLOR, 3) + cv2.line(vis_image, (px, py), (px, py + dy*clen), FOUND_COLOR, 3) + + # Center dot + cx, cy = (x1+x2)//2, (y1+y2)//2 + cv2.circle(vis_image, (cx, cy), 5, (15, 15, 15), -1) + cv2.circle(vis_image, (cx, cy), 5, FOUND_COLOR, 2) + cv2.circle(vis_image, (cx, cy), 2, (240, 240, 240), -1) + + # Label + confidence bar + label = f"FOUND {found['label']} {conf:.0%}" + ly = y1 - 5 + if ly < 16: + ly = y2 + 16 + draw_label(vis_image, label, x1, ly, FOUND_COLOR, fs=0.40, pad=5, accent=4) + + bar_w = x2 - x1 + filled = max(2, int(bar_w * conf)) + cv2.rectangle(vis_image, (x1, y2 - 4), (x1 + bar_w, y2), (40, 40, 40), -1) + cv2.rectangle(vis_image, (x1, y2 - 4), (x1 + filled, y2), FOUND_COLOR, -1) + + # ── Top info bar ────────────────────────────────────────────────── + bar_h = 22 + semi_rect(vis_image, 0, 0, w, bar_h, fill=(12, 12, 12), alpha=0.78) + if self.latest_region_classifications: + status = f"CLIP | Regions: {len(self.latest_region_classifications)} | Frame: {self.frame_counter}" + elif self.latest_classification: + status = f"CLIP | Full-image mode | Frame: {self.frame_counter}" + else: + status = f"CLIP Classifier | Frame: {self.frame_counter} | Idle" + cv2.putText(vis_image, status, (8, 15), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (210, 210, 210), 1) + cv2.imshow(self.window_name, vis_image) cv2.waitKey(1) diff --git a/vision/find_object_service_node.py b/vision/find_object_service_node.py index ba31db3..31ec056 100644 --- a/vision/find_object_service_node.py +++ b/vision/find_object_service_node.py @@ -5,9 +5,6 @@ from rclpy.callback_groups import ReentrantCallbackGroup from rclpy.executors import MultiThreadedExecutor from custom_interfaces.srv import DetectObjects, FindObject, PixelToReal, FindObjectReal, FindObjectAngleBB, FindMultiObjectReal, FindMultiObject - - - """ ros2 service call /find_object custom_interfaces/srv/FindObjectReal "{label: 'bowl'}" @@ -77,7 +74,7 @@ def __init__(self): ) self.get_logger().info('Find Object Service Node initialized') - + # Wait for services to be available self.wait_for_services() @@ -102,6 +99,43 @@ def wait_for_services(self): self.get_logger().info('Service clients ready') + def _log_find_object_call(self, label, response): + """Append a /find_object call result to find_object_history.json for the dashboard.""" + try: + with self._history_lock: + history = [] + if self._history_file.exists(): + try: + with open(self._history_file, 'r') as f: + history = json.load(f) + if not isinstance(history, list): + history = [] + except Exception: + history = [] + + call_id = len(history) + 1 + entry = { + 'call_id': call_id, + 'timestamp': datetime.now().isoformat(), + 'label_searched': label, + 'success': bool(response.success), + 'message': response.message, + 'object_id': response.object_id, + 'bbox': [int(v) for v in response.bbox] if response.bbox else [], + 'confidence': float(response.confidence), + 'x': float(response.x), + 'y': float(response.y), + 'z': float(response.z), + 'theta': float(response.theta), + 'verdict': None, + } + history.append(entry) + history = history[-50:] # keep last 50 + with open(self._history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to log find_object call: {e}') + def find_object_callback(self, request, response): """ Main service callback for /find_object @@ -266,7 +300,8 @@ def find_object_callback(self, request, response): response.y = 0.0 response.z = 0.0 response.theta = 0.0 - + + self._log_find_object_call(label, response) return response diff --git a/vision/obb_angle_service_node.py b/vision/obb_angle_service_node.py index 41b36cd..649ee0b 100644 --- a/vision/obb_angle_service_node.py +++ b/vision/obb_angle_service_node.py @@ -401,149 +401,165 @@ def visualize_obb(self, results, mode="auto"): """ if self.latest_rgb_image is None: self.get_logger().warn('No RGB image available for visualization') - # Show a blank placeholder window - blank = np.zeros((800, 1200, 3), dtype=np.uint8) - cv2.putText(blank, "Waiting for RGB camera image...", - (300, 400), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 255, 255), 3) - cv2.putText(blank, f"RGB Topic: {self.rgb_topic}", - (350, 450), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (128, 128, 128), 2) + blank = np.zeros((480, 640, 3), dtype=np.uint8) + cv2.putText(blank, "Waiting for RGB camera...", (80, 230), + cv2.FONT_HERSHEY_SIMPLEX, 0.65, (160, 160, 160), 1) + cv2.putText(blank, self.rgb_topic, (80, 258), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (100, 100, 100), 1) cv2.imshow(self.window_name, blank) cv2.waitKey(1) return - + vis_image = self.latest_rgb_image.copy() - + img_h, img_w = vis_image.shape[:2] + # Auto-detect mode if mode == "auto": mode = "single" if len(results) == 1 else "multi" - - # Color palette for objects - colors = [ - (255, 255, 0), # Cyan - (255, 0, 255), # Magenta - (0, 255, 255), # Yellow - (255, 128, 0), # Orange - (128, 255, 0), # Lime - (0, 255, 128), # Spring Green - (255, 0, 128), # Pink - (128, 0, 255), # Purple + + # Distinct color palette (BGR) + COLORS = [ + (0, 200, 255), # amber + (80, 255, 80), # lime + (255, 80, 80), # blue + (255, 0, 200), # magenta + (0, 230, 230), # yellow + (200, 80, 255), # violet + (0, 255, 180), # spring green + (255, 180, 0), # sky blue ] - - # Process each OBB + + def draw_corner_bracket(img, pts, color, lw=2): + """Draw corner-bracket accents on an OBB polygon.""" + n = len(pts) + for i in range(n): + A = pts[(i - 1) % n].astype(float) + B = pts[i].astype(float) + C = pts[(i + 1) % n].astype(float) + ab = A - B; ab_len = np.linalg.norm(ab) + cb = C - B; cb_len = np.linalg.norm(cb) + if ab_len == 0 or cb_len == 0: + continue + clen = max(10, int(min(ab_len, cb_len) * 0.22)) + p1 = (B + (ab / ab_len) * clen).astype(int) + p2 = (B + (cb / cb_len) * clen).astype(int) + cv2.line(img, tuple(B.astype(int)), tuple(p1), color, lw) + cv2.line(img, tuple(B.astype(int)), tuple(p2), color, lw) + + def semi_transparent_rect(img, x1, y1, x2, y2, fill, alpha=0.72): + """Blend a dark rectangle over a sub-region.""" + x1 = max(0, x1); y1 = max(0, y1) + x2 = min(img.shape[1] - 1, x2); y2 = min(img.shape[0] - 1, y2) + if x2 <= x1 or y2 <= y1: + return + roi = img[y1:y2, x1:x2] + bg = np.full_like(roi, fill) + img[y1:y2, x1:x2] = cv2.addWeighted(roi, 1 - alpha, bg, alpha, 0) + + # ── Process each OBB ───────────────────────────────────────────────── for idx, result_tuple in enumerate(results): - # Unpack (handle optional bbox) if len(result_tuple) == 7: object_id, u, v, theta, width, height, bbox = result_tuple else: object_id, u, v, theta, width, height = result_tuple bbox = None - - color = colors[idx % len(colors)] - - # Draw AABB first if provided (for single object mode) - if bbox is not None and mode == "single": - x1, y1, x2, y2 = bbox - cv2.rectangle(vis_image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2) - cv2.putText(vis_image, "Input AABB", (int(x1), int(y1) - 5), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2) - - # Get OBB corner points - box_points = self.get_obb_corner_points(u, v, theta, width, height) - - # Draw OBB - cv2.drawContours(vis_image, [box_points], 0, color, 3) - - # Draw center point - if mode == "single": - cv2.circle(vis_image, (int(u), int(v)), 8, (0, 0, 255), -1) # Red center - cv2.circle(vis_image, (int(u), int(v)), 10, (255, 255, 255), 2) # White outline - else: - cv2.circle(vis_image, (int(u), int(v)), 6, (255, 255, 255), -1) - cv2.circle(vis_image, (int(u), int(v)), 8, color, 2) - - # Draw angle arrow perpendicular to WIDTH (shorter dimension), with -90° offset so 0° points UP - arrow_length = height # Use height (shorter dimension) for arrow length - # Arrow perpendicular to width = add 90° to theta, then -90° for visualization - visual_theta = theta + np.pi / 2 - np.pi / 2 # Perpendicular, then visualization offset - end_x = int(u + arrow_length * np.cos(visual_theta)) - end_y = int(v + arrow_length * np.sin(visual_theta)) - arrow_thickness = 3 if mode == "single" else 2 - cv2.arrowedLine(vis_image, (int(u), int(v)), (end_x, end_y), - (255, 0, 255) if mode == "single" else color, - arrow_thickness, tipLength=0.3) - - # Draw label (use remapped angle: 90deg - original geometry angle) + + color = COLORS[idx % len(COLORS)] + cx, cy = int(u), int(v) angle_geom_deg = np.rad2deg(theta) angle_deg = 90.0 - angle_geom_deg - + + # ── Input AABB (single mode only) ───────────────────────────── + if bbox is not None and mode == "single": + ax1, ay1, ax2, ay2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) + # Dashed-style thin rectangle (draw as corner brackets only) + aabb_pts = np.array([[ax1,ay1],[ax2,ay1],[ax2,ay2],[ax1,ay2]]) + draw_corner_bracket(vis_image, aabb_pts, (80, 200, 80), lw=1) + cv2.rectangle(vis_image, (ax1, ay1), (ax2, ay2), (80, 200, 80), 1) + # Small label + ts, _ = cv2.getTextSize("AABB", cv2.FONT_HERSHEY_SIMPLEX, 0.33, 1) + semi_transparent_rect(vis_image, ax1, ay1 - ts[1] - 6, ax1 + ts[0] + 4, ay1, (18, 18, 18)) + cv2.putText(vis_image, "AABB", (ax1 + 2, ay1 - 3), + cv2.FONT_HERSHEY_SIMPLEX, 0.33, (80, 200, 80), 1) + + # ── OBB outline ─────────────────────────────────────────────── + box_pts = self.get_obb_corner_points(u, v, theta, width, height) + # Thin full outline + cv2.drawContours(vis_image, [box_pts], 0, color, 1) + # Corner bracket accents + draw_corner_bracket(vis_image, box_pts, color, lw=2) + + # ── Center dot ──────────────────────────────────────────────── + cv2.circle(vis_image, (cx, cy), 5, (15, 15, 15), -1) # dark fill + cv2.circle(vis_image, (cx, cy), 5, color, 2) # color ring + cv2.circle(vis_image, (cx, cy), 2, (240, 240, 240), -1) # white center + + # ── Angle arrow ─────────────────────────────────────────────── + visual_theta = theta + np.pi / 2 - np.pi / 2 + arrow_len = int(max(height * 0.55, 20)) + end_x = int(cx + arrow_len * np.cos(visual_theta)) + end_y = int(cy + arrow_len * np.sin(visual_theta)) + cv2.arrowedLine(vis_image, (cx, cy), (end_x, end_y), + color, 2, tipLength=0.35) + + # ── Info panel (single) / compact label (multi) ─────────────── if mode == "single": - # Concise info box for single object info_lines = [ f"{object_id}", - f"Center: ({int(u)}, {int(v)})", - f"Angle: {angle_deg:.1f}deg", - f"Size: {width:.0f}x{height:.0f}" + f"Center ({cx}, {cy})", + f"Angle {angle_deg:.1f} deg", + f"Size {width:.0f} x {height:.0f} px", ] - - # Draw compact info box at top-right - font_scale = 0.6 - font_thickness = 2 - line_spacing = 25 - - max_width = 0 - for line in info_lines: - (text_w, text_h), _ = cv2.getTextSize(line, cv2.FONT_HERSHEY_SIMPLEX, - font_scale, font_thickness) - max_width = max(max_width, text_w) - - box_height = len(info_lines) * line_spacing + 15 - box_x = vis_image.shape[1] - max_width - 25 - box_y = 15 - - # Background with transparency effect - overlay = vis_image.copy() - cv2.rectangle(overlay, (box_x - 8, box_y - 8), - (box_x + max_width + 8, box_y + box_height), (0, 0, 0), -1) - cv2.addWeighted(overlay, 0.7, vis_image, 0.3, 0, vis_image) - + fs, ft = 0.42, 1 + pad = 8 + line_h = 20 + max_w = max(cv2.getTextSize(l, cv2.FONT_HERSHEY_SIMPLEX, fs, ft)[0][0] + for l in info_lines) + panel_w = max_w + pad * 2 + 4 # +4 for accent bar + panel_h = len(info_lines) * line_h + pad + px = img_w - panel_w - 10 + py = 28 # sits below top bar + + # Background + semi_transparent_rect(vis_image, px - 2, py, px + panel_w, py + panel_h, (15, 15, 15)) + # Left accent bar + cv2.rectangle(vis_image, (px - 2, py), (px + 2, py + panel_h), color, -1) # Border - cv2.rectangle(vis_image, (box_x - 8, box_y - 8), - (box_x + max_width + 8, box_y + box_height), color, 2) - - # Text lines + cv2.rectangle(vis_image, (px - 2, py), (px + panel_w, py + panel_h), color, 1) + for i, line in enumerate(info_lines): - y_pos = box_y + (i * line_spacing) + 18 - cv2.putText(vis_image, line, (box_x, y_pos), - cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, font_thickness) + ty = py + pad + i * line_h + line_h // 2 + # Dim the label for the first line (object id) — draw in color + text_color = color if i == 0 else (210, 210, 210) + cv2.putText(vis_image, line, (px + 6, ty), + cv2.FONT_HERSHEY_SIMPLEX, fs, text_color, ft) else: - # Compact label for multi-object - label = f"#{idx} {angle_deg:.1f}deg" - label_x = int(u + 15) - label_y = int(v) - - # Text with outline - cv2.putText(vis_image, label, (label_x, label_y), - cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 5) - cv2.putText(vis_image, label, (label_x, label_y), - cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2) - - # Add concise title - if mode == "single": - title = "OBB Detection" - else: - title = f"OBB Detection ({len(results)} objects)" - - cv2.putText(vis_image, title, (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 3) - cv2.putText(vis_image, title, (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 255), 2) - - # Add concise legend at bottom - legend_y = vis_image.shape[0] - 20 - legend_text = "0deg = Vertical | Range: -90deg to +90deg" - - cv2.putText(vis_image, legend_text, (10, legend_y), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2) - - # Display + # Compact floating label near center + label = f"#{idx} {angle_deg:.1f}°" + fs, ft = 0.38, 1 + (lw_px, lh_px), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, fs, ft) + pad = 4 + lx = cx + 12 + ly = cy - 6 + # Clamp + lx = min(lx, img_w - lw_px - pad * 2 - 5) + ly = max(ly, lh_px + pad + 2) + + semi_transparent_rect(vis_image, lx - 2, ly - lh_px - pad, + lx + lw_px + pad * 2 + 3, ly + pad, (15, 15, 15)) + cv2.rectangle(vis_image, (lx - 2, ly - lh_px - pad), + (lx + 3, ly + pad), color, -1) # accent bar + cv2.putText(vis_image, label, (lx + 5, ly), + cv2.FONT_HERSHEY_SIMPLEX, fs, (230, 230, 230), ft) + + # ── Top info bar ───────────────────────────────────────────────────── + bar_h = 24 + obj_count = len(results) + title = f"OBB Detection | Objects: {obj_count} | 0 deg = Vertical" + semi_transparent_rect(vis_image, 0, 0, img_w, bar_h, (12, 12, 12), alpha=0.78) + cv2.putText(vis_image, title, (8, 16), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (210, 210, 210), 1) + cv2.imshow(self.window_name, vis_image) cv2.waitKey(1) diff --git a/vision/simple_sam_detector.py b/vision/simple_sam_detector.py index b303d75..fdff486 100755 --- a/vision/simple_sam_detector.py +++ b/vision/simple_sam_detector.py @@ -29,7 +29,7 @@ from rclpy.node import Node from rclpy.qos import QoSProfile, QoSReliabilityPolicy, QoSHistoryPolicy from custom_interfaces.msg import SAMDetections, SAMDetection -from custom_interfaces.srv import DetectObjects +from custom_interfaces.srv import DetectObjects, FindObjectAngle, PixelToReal from sensor_msgs.msg import Image from std_srvs.srv import Trigger from std_msgs.msg import Header @@ -165,7 +165,25 @@ def __init__(self, single_shot_mode=False): Trigger, '/vision/classify_bbox_filtered' ) - + + # Service client for GraspNet detection + self.grasp_client = self.create_client( + Trigger, + '/vision/detect_grasp' + ) + + # Service client for OBB angle detection + self.obb_client = self.create_client( + FindObjectAngle, + '/obb/find_object_angle' + ) + + # Service client for pixel-to-real conversion + self.pixel_to_real_client = self.create_client( + PixelToReal, + '/pixel_to_real' + ) + # OpenCV window setup self.window_name = f"SAM Object Detection - {self.rgb_topic}" cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) @@ -240,11 +258,13 @@ def depth_callback(self, msg: Image): self.get_logger().error(f"Failed to convert depth image: {e}") def run_pipeline_callback(self, request, response): - """Service callback for /vision/run_pipeline - triggers detection and publishes to topic""" + """Service callback for /vision/run_pipeline - runs SAM+CLIP+GraspNet+OBB+Pixel-to-Real""" + import time + pipeline_start = time.perf_counter() + try: - # Use captured frame instead of latest_rgb for consistency frame_to_use = self.captured_frame if self.frame_captured else self.latest_rgb - + if frame_to_use is None: response.success = False response.message = json.dumps({ @@ -254,53 +274,207 @@ def run_pipeline_callback(self, request, response): }, indent=2) self.get_logger().warn("No image received yet") return response - + self.get_logger().info("=" * 80) - self.get_logger().info("Running SAM detection on captured frame...") - self.get_logger().info(f"Frame shape: {frame_to_use.shape}") + self.get_logger().info("Running full pipeline: SAM + CLIP + GraspNet + OBB + Pixel-to-Real") self.get_logger().info("=" * 80) - - # Run detection on captured frame (with IoU tracking from previous frame) + + # ── Step 1: SAM detection ────────────────────────────────────────── + sam_start = time.perf_counter() self.latest_detections = self._detect_objects(frame_to_use) - - # Store current detections as previous for next frame IoU calculation self.previous_detections = self.latest_detections.copy() - - # Build JSON response in the requested schema - detection_data = self._build_detection_schema() - - # Publish detections as ROS2 message self._publish_detections_ros() - - response.success = True - response.message = json.dumps(detection_data, indent=2) - - self.get_logger().info("=" * 80) - self.get_logger().info(f"Detection complete: {len(self.latest_detections)} objects found") - self.get_logger().info("=" * 80) - - # Print JSON output with bounding boxes - self.get_logger().info("JSON OUTPUT (with bounding boxes):") - self.get_logger().info("=" * 80) - self.get_logger().info(response.message) - self.get_logger().info("=" * 80) - - # Print detection details in readable format - self.get_logger().info("Bounding Boxes Summary:") - for i, det in enumerate(self.latest_detections): + sam_latency = time.perf_counter() - sam_start + self.get_logger().info(f"[1/5] SAM: {len(self.latest_detections)} objects in {sam_latency:.3f}s") + + # ── Step 2: CLIP classification ──────────────────────────────────── + time.sleep(0.3) # allow CLIP to process published detections + clip_start = time.perf_counter() + clip_classifications = {} + clip_success = False + if self.clip_filter_client.wait_for_service(timeout_sec=2.0): + try: + fut = self.clip_filter_client.call_async(Trigger.Request()) + rclpy.spin_until_future_complete(self, fut, timeout_sec=5.0) + clip_resp = fut.result() + if clip_resp and clip_resp.success: + clip_json = json.loads(clip_resp.message) + for region in clip_json.get('regions', []): + rid = region.get('region_id') + clip_classifications[rid] = { + 'label': region.get('label'), + 'confidence': float(region.get('confidence', 0.0)), + } + clip_success = True + except Exception as e: + self.get_logger().warn(f"CLIP call failed: {e}") + clip_latency = time.perf_counter() - clip_start + self.get_logger().info(f"[2/5] CLIP: {len(clip_classifications)} labelled in {clip_latency:.3f}s") + + # ── Step 3: GraspNet detection ───────────────────────────────────── + grasp_start = time.perf_counter() + grasp_map = {} + grasp_success = False + if self.grasp_client.wait_for_service(timeout_sec=2.0): + try: + fut = self.grasp_client.call_async(Trigger.Request()) + rclpy.spin_until_future_complete(self, fut, timeout_sec=10.0) + grasp_resp = fut.result() + if grasp_resp and grasp_resp.success: + grasp_json = json.loads(grasp_resp.message) + for g in grasp_json.get('grasps', []): + obj_id_str = g.get('object_id', '') + try: + oid = int(obj_id_str.split('_')[1]) if '_' in obj_id_str else int(obj_id_str) + except (ValueError, IndexError): + continue + if oid in grasp_map: + continue + pix = g.get('pixel_location', [0, 0]) + pos = g.get('position', {}) + grasp_map[oid] = { + 'pixel': {'u': int(pix[0]) if len(pix) > 0 else 0, + 'v': int(pix[1]) if len(pix) > 1 else 0}, + 'world': {'x': float(pos.get('x', 0.0)), + 'y': float(pos.get('y', 0.0)), + 'z': float(pos.get('z', 0.0))}, + 'quality_score': float(g.get('quality_score', 0.0)), + 'grasp_width': float(g.get('grasp_width', 0.0)), + 'approach_direction': g.get('approach_direction', ''), + } + grasp_success = True + except Exception as e: + self.get_logger().warn(f"GraspNet call failed: {e}") + grasp_latency = time.perf_counter() - grasp_start + self.get_logger().info(f"[3/5] GraspNet: {len(grasp_map)} grasps in {grasp_latency:.3f}s") + + # ── Step 4: OBB angle detection ──────────────────────────────────── + obb_start = time.perf_counter() + obb_map = {} + obb_success = False + if self.obb_client.wait_for_service(timeout_sec=2.0): + try: + fut = self.obb_client.call_async(FindObjectAngle.Request()) + rclpy.spin_until_future_complete(self, fut, timeout_sec=5.0) + obb_resp = fut.result() + if obb_resp and obb_resp.success: + n_obb = obb_resp.total_objects + bboxes_flat = list(obb_resp.bboxes) + for i in range(n_obb): + bx1 = bboxes_flat[i * 4] if len(bboxes_flat) > i * 4 else 0 + by1 = bboxes_flat[i * 4 + 1] if len(bboxes_flat) > i * 4 + 1 else 0 + bx2 = bboxes_flat[i * 4 + 2] if len(bboxes_flat) > i * 4 + 2 else 0 + by2 = bboxes_flat[i * 4 + 3] if len(bboxes_flat) > i * 4 + 3 else 0 + theta = float(obb_resp.thetas[i]) if i < len(obb_resp.thetas) else 0.0 + obb_entry = { + 'theta_rad': theta, + 'angle_deg': float(theta * 180.0 / 3.141592653589793), + 'width_px': float(obb_resp.widths[i]) if i < len(obb_resp.widths) else 0.0, + 'height_px': float(obb_resp.heights[i]) if i < len(obb_resp.heights) else 0.0, + 'center_u': float(obb_resp.centers_u[i]) if i < len(obb_resp.centers_u) else 0.0, + 'center_v': float(obb_resp.centers_v[i]) if i < len(obb_resp.centers_v) else 0.0, + } + # Match OBB result to SAM detection by IoU + obb_bbox = [bx1, by1, bx2, by2] + best_iou, best_idx = 0.0, i + for sam_idx, det in enumerate(self.latest_detections): + iou = self._calculate_iou(obb_bbox, det['bbox']) + if iou > best_iou: + best_iou, best_idx = iou, sam_idx + obb_map[best_idx] = obb_entry + obb_success = True + except Exception as e: + self.get_logger().warn(f"OBB call failed: {e}") + obb_latency = time.perf_counter() - obb_start + self.get_logger().info(f"[4/5] OBB: {len(obb_map)} objects in {obb_latency:.3f}s") + + # ── Step 5: Pixel-to-Real for each detected object center ────────── + p2r_start = time.perf_counter() + p2r_map = {} + p2r_success = False + if self.pixel_to_real_client.wait_for_service(timeout_sec=2.0): + p2r_success = True + for idx, det in enumerate(self.latest_detections): + cx, cy = det['center'] + try: + req = PixelToReal.Request() + req.u = int(cx) + req.v = int(cy) + fut = self.pixel_to_real_client.call_async(req) + rclpy.spin_until_future_complete(self, fut, timeout_sec=2.0) + p2r_resp = fut.result() + if p2r_resp: + p2r_map[idx] = { + 'u': int(cx), 'v': int(cy), + 'x': float(p2r_resp.x), + 'y': float(p2r_resp.y), + 'z': float(p2r_resp.z), + } + except Exception as e: + self.get_logger().warn(f"Pixel-to-real idx={idx} failed: {e}") + p2r_latency = time.perf_counter() - p2r_start + self.get_logger().info(f"[5/5] Pixel-to-Real: {len(p2r_map)} objects in {p2r_latency:.3f}s") + + # ── Build unified JSON response ──────────────────────────────────── + total_latency = time.perf_counter() - pipeline_start + objects_out = [] + for idx, det in enumerate(self.latest_detections): + clip_info = clip_classifications.get(idx, {}) + grasp_info = grasp_map.get(idx, {}) + obb_info = obb_map.get(idx, {}) + p2r_info = p2r_map.get(idx, {}) bbox = det['bbox'] - distance = det.get('distance_cm', 'N/A') - self.get_logger().info( - f" [{i}] {det['class_name']}: bbox={bbox}, " - f"confidence={det['confidence']:.2f}, distance={distance}" - ) + objects_out.append({ + 'object_id': det['id'], + 'label': clip_info.get('label', '') if clip_info else '', + 'bbox': {'x1': bbox[0], 'y1': bbox[1], 'x2': bbox[2], 'y2': bbox[3]}, + 'center': {'u': det['center'][0], 'v': det['center'][1]}, + 'sam_confidence': round(float(det['confidence']), 4), + 'clip_confidence': round(float(clip_info.get('confidence', 0.0)), 4) if clip_info else '', + 'iou_score': round(float(det.get('iou_score', 0.0)), 4), + 'is_stable': bool(det.get('is_stable', False)), + 'distance_cm': float(det['distance_cm']) if det.get('distance_cm') is not None else '', + 'world': p2r_info if p2r_info else {}, + 'has_grasp': bool(grasp_info), + 'grasp': grasp_info if grasp_info else {}, + 'obb': { + 'angle_deg': round(obb_info['angle_deg'], 2), + 'theta_rad': round(obb_info['theta_rad'], 4), + 'width_px': round(obb_info['width_px'], 1), + 'height_px': round(obb_info['height_px'], 1), + 'center_u': round(obb_info['center_u'], 1), + 'center_v': round(obb_info['center_v'], 1), + } if obb_info else {}, + }) + + pipeline_result = { + 'pipeline': 'run_pipeline', + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'total_objects': len(objects_out), + 'latency_s': round(total_latency, 3), + 'services': { + 'sam': {'success': True, 'latency_s': round(sam_latency, 3), 'total_detections': len(self.latest_detections)}, + 'clip': {'success': clip_success, 'latency_s': round(clip_latency, 3), 'filtered_regions': len(clip_classifications)}, + 'graspnet': {'success': grasp_success, 'latency_s': round(grasp_latency, 3), 'total_grasps': len(grasp_map)}, + 'obb': {'success': obb_success, 'latency_s': round(obb_latency, 3), 'total_objects': len(obb_map)}, + 'pixel_to_real': {'success': p2r_success, 'latency_s': round(p2r_latency, 3), 'total_converted': len(p2r_map)}, + }, + 'objects': objects_out, + } + + # Save to vision_runs_history.json (dashboard reads this) + self._save_run_pipeline_result( + pipeline_result, clip_classifications, grasp_map, obb_map, p2r_map, + sam_latency, clip_latency, grasp_latency, obb_latency + ) + + response.success = True + response.message = json.dumps(pipeline_result, indent=2) + self.get_logger().info("=" * 80) - - # Verify bounding boxes are in output - bbox_count = len([d for d in detection_data.get('detections', [{}])[0].get('detections', []) if 'bbox' in d]) - self.get_logger().info(f"Verified: {bbox_count} bounding boxes included in JSON output") + self.get_logger().info(f"Full pipeline done: {len(objects_out)} objects, {total_latency:.2f}s total") self.get_logger().info("=" * 80) - + except Exception as e: response.success = False response.message = json.dumps({ @@ -308,12 +482,119 @@ def run_pipeline_callback(self, request, response): "error": str(e), "timestamp": datetime.utcnow().isoformat() + "Z" }, indent=2) - self.get_logger().error(f"Detection error: {e}") + self.get_logger().error(f"Pipeline error: {e}") import traceback self.get_logger().error(traceback.format_exc()) - + return response + def _save_run_pipeline_result(self, pipeline_result, clip_classifications, grasp_map, obb_map, p2r_map, + sam_latency, clip_latency, grasp_latency, obb_latency): + """Save /vision/run_pipeline results to vision_runs_history.json for the dashboard.""" + try: + from pathlib import Path + package_path = Path(__file__).parent.parent + history_file = package_path / 'vision_runs_history.json' + + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + data = json.load(f) + if isinstance(data, list): + history = data + except Exception: + pass + + last_run_no = history[-1]['meta']['run_no'] if history else 0 + run_no = last_run_no + 1 + + num_dets = len(self.latest_detections) + iou_scores = [float(d.get('iou_score', 0.0)) for d in self.latest_detections] + is_stable_array = [bool(d.get('is_stable', False)) for d in self.latest_detections] + avg_sam_conf = (sum(float(d.get('confidence', 0.0)) for d in self.latest_detections) / num_dets + if num_dets > 0 else 0.0) + avg_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0 + stability_rate = (sum(1 for s in is_stable_array if s) / len(is_stable_array) + if is_stable_array else 0.0) + + objects = [] + for idx, det in enumerate(self.latest_detections): + clip_info = clip_classifications.get(idx, {}) + grasp_info = grasp_map.get(idx, {}) + obb_info = obb_map.get(idx, {}) + p2r_info = p2r_map.get(idx, {}) + bbox = det['bbox'] + objects.append({ + 'object_id': det['id'], + 'label': clip_info.get('label', '') if clip_info else '', + 'bbox_x1': bbox[0], + 'bbox_y1': bbox[1], + 'bbox_x2': bbox[2], + 'bbox_y2': bbox[3], + 'sam_confidence': round(float(det.get('confidence', 0.0)), 4), + 'clip_confidence': round(float(clip_info.get('confidence', 0.0)), 4) if clip_info else '', + 'distance_cm': float(det['distance_cm']) if det.get('distance_cm') is not None else '', + 'iou_score': iou_scores[idx], + 'is_stable': is_stable_array[idx], + 'has_grasp': bool(grasp_info), + 'grasp': grasp_info if grasp_info else {}, + 'world': p2r_info if p2r_info else {}, + 'obb_angle_deg': round(obb_info['angle_deg'], 2) if obb_info else '', + 'obb_theta_rad': round(obb_info['theta_rad'], 4) if obb_info else '', + 'obb_width_px': round(obb_info['width_px'], 1) if obb_info else '', + 'obb_height_px': round(obb_info['height_px'], 1) if obb_info else '', + 'obb_center_u': round(obb_info['center_u'], 1) if obb_info else '', + 'obb_center_v': round(obb_info['center_v'], 1) if obb_info else '', + }) + + run = { + 'meta': { + 'run_no': run_no, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'latency_s': pipeline_result['latency_s'], + 'source': 'run_pipeline', + }, + 'sam': { + 'success': True, + 'latency_s': round(sam_latency, 3), + 'total_detections': num_dets, + 'avg_confidence': round(avg_sam_conf, 4), + 'average_iou': round(avg_iou, 4), + 'stability_rate': round(stability_rate, 4), + }, + 'clip': { + 'success': bool(clip_classifications), + 'latency_s': round(clip_latency, 3), + 'filtered_regions': len(clip_classifications), + }, + 'scene': {'success': False, 'latency_s': 0.0}, + 'obb': { + 'success': bool(obb_map), + 'latency_s': round(obb_latency, 3), + 'total_objects': len(obb_map), + }, + 'graspnet': { + 'success': bool(grasp_map), + 'latency_s': round(grasp_latency, 3), + 'total_grasps': len(grasp_map), + }, + 'objects': objects, + 'relations': [], + 'grasps': list(grasp_map.values()), + } + + history.append(run) + history = history[-50:] # keep last 50 runs + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + + self.get_logger().info(f"Saved run #{run_no} to {history_file} ({num_dets} objects)") + + except Exception as e: + self.get_logger().warn(f"Failed to save run pipeline history: {e}") + def detect_objects_callback(self, request, response): """Service callback for /vision/detect_objects - returns detection results directly""" try: @@ -604,7 +885,7 @@ def _save_detect_objects_run(self, object_ids, bbox_x1, bbox_y1, bbox_x2, bbox_y } history.append(run) - history = history[-20:] # keep last 20 runs + history = history[-50:] # keep last 50 runs with open(history_file, 'w') as f: json.dump(history, f, indent=2) @@ -1090,157 +1371,132 @@ def _publish_detections_ros(self): def visualization_callback(self): """Display camera feed with detections in OpenCV window""" - # Use captured frame if available, otherwise latest_rgb frame_to_display = self.captured_frame if self.frame_captured else self.latest_rgb - + if frame_to_display is None: - # Show waiting message blank = np.zeros((480, 640, 3), dtype=np.uint8) - cv2.putText( - blank, - f"Waiting for {self.rgb_topic}...", - (100, 240), - cv2.FONT_HERSHEY_SIMPLEX, - 1.0, - (255, 255, 255), - 2 - ) + cv2.putText(blank, f"Waiting for {self.rgb_topic}...", (80, 240), + cv2.FONT_HERSHEY_SIMPLEX, 0.65, (160, 160, 160), 1) cv2.imshow(self.window_name, blank) cv2.waitKey(1) return - - # Create visualization image + vis_image = frame_to_display.copy() - - # Determine if we should show corner points (when detections < 3) - # For debug and finding u,v in image + + # Distinct color palette per object (BGR) + COLORS = [ + (0, 200, 255), # amber + (80, 255, 80), # lime + (255, 80, 80), # blue + (255, 0, 200), # magenta + (0, 230, 230), # yellow + (200, 80, 255), # violet + (0, 255, 180), # spring green + (255, 180, 0), # sky blue + ] + show_corner_points = len(self.latest_detections) < 5 - - # Draw detections + + # ── Pass 1: draw all masks first (underneath boxes) ────────────────── + mask_overlay = vis_image.copy() + for idx, det in enumerate(self.latest_detections): + color = COLORS[idx % len(COLORS)] + mask = det['mask'] + mask_overlay[mask > 0] = ( + int(color[0] * 0.55), + int(color[1] * 0.55), + int(color[2] * 0.55), + ) + vis_image = cv2.addWeighted(vis_image, 0.6, mask_overlay, 0.4, 0) + + # ── Pass 2: boxes, labels, debug corners ───────────────────────────── for idx, det in enumerate(self.latest_detections): bbox = det['bbox'] confidence = det['confidence'] distance = det.get('distance_cm') - obj_no = idx # Object number - - # Draw bounding box - cv2.rectangle( - vis_image, - (bbox[0], bbox[1]), - (bbox[2], bbox[3]), - (0, 255, 0), # Green - 2 - ) - - # Display 4 corner coordinates if detections < 3 - # Debug: Show corner coordinates when detections < 5 (for u,v verification) + color = COLORS[idx % len(COLORS)] + x1, y1, x2, y2 = bbox + + # Corner-bracket bounding box (professional look) + clen = max(14, int(min(x2 - x1, y2 - y1) * 0.15)) + lw = 2 + for (px, py, dx, dy) in [ + (x1, y1, 1, 1), (x2, y1, -1, 1), + (x1, y2, 1, -1), (x2, y2, -1, -1), + ]: + cv2.line(vis_image, (px, py), (px + dx * clen, py), color, lw) + cv2.line(vis_image, (px, py), (px, py + dy * clen), color, lw) + # Thin dim full rectangle + cv2.rectangle(vis_image, (x1, y1), (x2, y2), color, 1) + + # ── Debug corner coordinates ────────────────────────────────── if show_corner_points: - font_scale = 0.4 - font_thickness = 1 - text_color = (0, 255, 255) # Yellow text for visibility - bg_color = (0, 0, 0) # Black background - padding = 2 - - # Draw all 4 corners in multi-line format for better readability corners_text = [ - f"TL:({bbox[0]},{bbox[1]})", - f"TR:({bbox[2]},{bbox[1]})", - f"BL:({bbox[0]},{bbox[3]})", - f"BR:({bbox[2]},{bbox[3]})" + f"TL:({x1},{y1})", f"TR:({x2},{y1})", + f"BL:({x1},{y2})", f"BR:({x2},{y2})", ] - - # Position text block above bbox - start_y = max(bbox[1] - 80, 20) # Ensure it stays on screen - line_height = 18 - + start_y = max(y1 - 68, 26) for i, line_text in enumerate(corners_text): - text_size, _ = cv2.getTextSize(line_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness) - y_pos = start_y + (i * line_height) - - # Draw background - cv2.rectangle( - vis_image, - (bbox[0] - padding, y_pos - text_size[1] - padding), - (bbox[0] + text_size[0] + padding, y_pos + padding), - bg_color, - -1 - ) - - # Draw text - cv2.putText( - vis_image, - line_text, - (bbox[0], y_pos), - cv2.FONT_HERSHEY_SIMPLEX, - font_scale, - text_color, - font_thickness - ) - - # Draw filled mask with transparency - mask = det['mask'] - colored_mask = np.zeros_like(vis_image) - colored_mask[:, :] = (0, 255, 0) # Green overlay - vis_image = np.where( - mask[..., None] > 0, - cv2.addWeighted(vis_image, 0.7, colored_mask, 0.3, 0), - vis_image - ) - - # Draw label with object number and distance + ts, _ = cv2.getTextSize(line_text, cv2.FONT_HERSHEY_SIMPLEX, 0.33, 1) + ty = start_y + i * 16 + cv2.rectangle(vis_image, (x1 - 2, ty - ts[1] - 2), + (x1 + ts[0] + 2, ty + 2), (15, 15, 15), -1) + cv2.putText(vis_image, line_text, (x1, ty), + cv2.FONT_HERSHEY_SIMPLEX, 0.33, color, 1) + + # ── Label ───────────────────────────────────────────────────── if distance is not None: - label = f"#{obj_no} {det.get('class_name', det['id'])}: {confidence:.2f} ({distance:.1f}cm)" + label = f"#{idx} {det.get('class_name', det['id'])} {confidence:.2f} {distance:.1f}cm" else: - label = f"#{obj_no} {det.get('class_name', det['id'])}: {confidence:.2f}" - - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2) - - # Label background - cv2.rectangle( - vis_image, - (bbox[0], bbox[1] - label_size[1] - 10), - (bbox[0] + label_size[0], bbox[1]), - (0, 255, 0), - -1 - ) - - # Label text - cv2.putText( - vis_image, - label, - (bbox[0], bbox[1] - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 0, 0), # Black text - 2 - ) - - # Add info overlay + label = f"#{idx} {det.get('class_name', det['id'])} {confidence:.2f}" + + fs, ft = 0.38, 1 + (lw_px, lh_px), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, fs, ft) + pad = 4 + accent = 3 # color accent bar width + + # Place label above bbox; clamp to image top + ly = y1 - pad - 2 + if ly - lh_px - pad < 0: + ly = y1 + lh_px + pad + 2 + + bx1 = x1 + bx2 = x1 + accent + pad + lw_px + pad + by1 = ly - lh_px - pad + by2 = ly + pad + + # Clamp to image bounds + bx1 = max(0, bx1); by1 = max(0, by1) + bx2 = min(vis_image.shape[1] - 1, bx2) + by2 = min(vis_image.shape[0] - 1, by2) + + # Semi-transparent dark background + roi = vis_image[by1:by2, bx1:bx2] + if roi.size > 0: + dark = np.full_like(roi, (18, 18, 18)) + vis_image[by1:by2, bx1:bx2] = cv2.addWeighted(roi, 0.25, dark, 0.75, 0) + + # Color accent bar on left + cv2.rectangle(vis_image, (bx1, by1), (bx1 + accent, by2), color, -1) + + # White label text + cv2.putText(vis_image, label, (bx1 + accent + pad, ly), + cv2.FONT_HERSHEY_SIMPLEX, fs, (240, 240, 240), ft) + + # ── Top info bar ───────────────────────────────────────────────────── + h, w = vis_image.shape[:2] + bar_h = 22 mode_text = "CONTINUOUS" if self.continuous_detection else "SINGLE SHOT" - corner_indicator = " [CORNER POINTS ON]" if show_corner_points else "" - info_text = f"Mode: {mode_text} | Objects: {len(self.latest_detections)}{corner_indicator}" - - cv2.putText( - vis_image, - info_text, - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (255, 255, 255), # White - 2 - ) - - cv2.putText( - vis_image, - info_text, - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), # Black outline - 4 - ) - - # Show image + obj_count = len(self.latest_detections) + debug_tag = " [DEBUG]" if (show_corner_points and obj_count > 0) else "" + info_text = f"Mode: {mode_text} | Objects detected: {obj_count}{debug_tag}" + + roi = vis_image[0:bar_h, 0:w] + dark_bar = np.full_like(roi, (12, 12, 12)) + vis_image[0:bar_h, 0:w] = cv2.addWeighted(roi, 0.25, dark_bar, 0.75, 0) + cv2.putText(vis_image, info_text, (8, 15), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (210, 210, 210), 1) + cv2.imshow(self.window_name, vis_image) cv2.waitKey(1) From cb8d46dce3dd5b5596627353fe490843f6da7e87 Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Sat, 11 Apr 2026 08:56:12 +0700 Subject: [PATCH 14/16] feat: find object dsahboard fix --- vision/clip_classifier.py | 276 ++++++++++++++++++----------- vision/find_object_service_node.py | 28 +++ vision/simple_sam_detector.py | 20 ++- vision/unified_pipeline.py | 5 +- 4 files changed, 222 insertions(+), 107 deletions(-) diff --git a/vision/clip_classifier.py b/vision/clip_classifier.py index b44e029..312e3db 100755 --- a/vision/clip_classifier.py +++ b/vision/clip_classifier.py @@ -112,6 +112,13 @@ def __init__(self, candidate_labels: List[str] = None): self.declare_parameter('real_hardware', False) self.real_hardware = bool(self.get_parameter('real_hardware').value) + # Minimum raw cosine similarity to even consider a region a candidate. + # Applied before inter-region softmax normalisation; regions below this + # threshold are discarded entirely. Default 0.1 rejects strongly + # mismatched regions while keeping borderline ones in the softmax pool. + self.declare_parameter('clip_min_confidence', 0.1) + self.clip_min_confidence = float(self.get_parameter('clip_min_confidence').value) + self.rgb_topic = '/camera/color/image_raw' if self.real_hardware else '/camera/image_raw' self.depth_topic = '/camera/depth/image_rect_raw' if self.real_hardware else '/camera/depth/image_raw' self.camera_info_topic = 'camera/color/camera_info' if self.real_hardware else '/camera/camera_info' @@ -716,11 +723,17 @@ def _call_detect_objects(self) -> Tuple[List[List[int]], Optional[str]]: return bboxes, None def _compute_similarity(self, image_bgr: np.ndarray, label: str) -> float: - """Compute CLIP cosine similarity between crop and label text.""" + """Compute raw CLIP cosine similarity between a single crop and label text. + + Returns a value roughly in [-1, 1]. Prefer + _compute_inter_region_confidences() when you have multiple regions, + as that method normalises across regions via softmax to produce proper + probabilities in [0, 1]. + """ region_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB) pil_image = PILImage.fromarray(region_rgb) image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize([label]).to(self.device) + text_tokens = clip.tokenize([f"a photo of a {label}"]).to(self.device) with torch.no_grad(): image_features = self.model.encode_image(image_input) @@ -731,6 +744,55 @@ def _compute_similarity(self, image_bgr: np.ndarray, label: str) -> float: return float(similarity) + def _compute_inter_region_confidences( + self, + crops_bgr: List[np.ndarray], + label: str, + ) -> List[float]: + """Return per-region confidence values in [0, 1] for a query label. + + Rather than returning raw cosine similarity (which lives in roughly + [0.15, 0.35] for typical queries and is hard to threshold), this method + applies a temperature-scaled softmax *across all candidate regions*. + The result is a proper probability distribution: the region most likely + containing ``label`` gets a high score, the rest share the remainder. + + Args: + crops_bgr: List of BGR image crops (one per detected region). + label: Text query, e.g. ``"bowl"`` or ``"green_cube"``. + + Returns: + List of float probabilities, same length as ``crops_bgr``, + summing to 1.0. An empty list is returned when ``crops_bgr`` + is empty. + """ + if not crops_bgr: + return [] + + # Use the same prompt template as _classify_regions / _classify_image + text_tokens = clip.tokenize([f"a photo of a {label}"]).to(self.device) + with torch.no_grad(): + text_features = self.model.encode_text(text_tokens) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + image_features_list = [] + for crop_bgr in crops_bgr: + region_rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB) + pil_image = PILImage.fromarray(region_rgb) + image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) + img_feat = self.model.encode_image(image_input) + img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True) + image_features_list.append(img_feat) + + # Stack → (N, D), then cosine similarities → (N,) + all_image_features = torch.cat(image_features_list, dim=0) # (N, D) + raw_sims = (all_image_features @ text_features.T).squeeze(1) # (N,) + + # Temperature-scaled softmax over regions: a factor of 100 matches the + # scale used in standard CLIP logits and keeps the distribution sharp. + probs = (raw_sims * 100.0).softmax(dim=0) + return [float(p.item()) for p in probs] + def find_multi_object_callback(self, request, response): """ Service callback for /vision/find_multi_object. @@ -815,8 +877,10 @@ def find_multi_object_callback(self, request, response): frame = self.captured_frame frame_h, frame_w = frame.shape[:2] - matches: List[Dict] = [] + # --- Pass 1: collect valid crops pre-filtered by raw cosine --- + candidates: List[Dict] = [] + crops: List[np.ndarray] = [] for region_id, bbox in enumerate(bboxes): x1, y1, x2, y2 = bbox x1 = max(0, min(x1, frame_w)) @@ -828,12 +892,25 @@ def find_multi_object_callback(self, request, response): continue region_bgr = frame[y1:y2, x1:x2] - similarity = self._compute_similarity(region_bgr, target_label) - matches.append({ + raw_sim = self._compute_similarity(region_bgr, target_label) + if raw_sim < self.clip_min_confidence: + continue + candidates.append({ 'region_id': region_id, 'bbox': [int(x1), int(y1), int(x2), int(y2)], - 'confidence': round(float(similarity), 4), }) + crops.append(region_bgr) + + matches: List[Dict] = [] + if candidates: + # --- Pass 2: inter-region softmax → proper [0,1] probabilities --- + probs = self._compute_inter_region_confidences(crops, target_label) + for i, cand in enumerate(candidates): + matches.append({ + 'region_id': cand['region_id'], + 'bbox': cand['bbox'], + 'confidence': round(probs[i], 4), + }) matches.sort(key=lambda item: item['confidence'], reverse=True) top_matches = matches[:top_k] @@ -996,80 +1073,83 @@ def find_object_callback(self, request, response): self.get_logger().info(f"Classification complete for {len(self.latest_region_classifications)} regions") - self.get_logger().info(f"Computing CLIP embeddings for '{target_label}' and {len(self.latest_region_classifications)} regions") - - # Compute image-text similarity for each bounding box using CLIP's high-level API - best_match = None - best_similarity = -1.0 # Cosine similarity ranges from -1 to 1 - + self.get_logger().info(f"Computing CLIP confidences for '{target_label}' across {len(self.latest_region_classifications)} regions") + + # --- Pass 1: collect valid crops and their metadata --------- + h, w = self.captured_frame.shape[:2] + valid_regions = [] + crops = [] for region in self.latest_region_classifications: bbox = region['bbox'] x1, y1, x2, y2 = bbox - - # Clamp bbox to image bounds - h, w = self.captured_frame.shape[:2] x1 = max(0, min(x1, w)) x2 = max(0, min(x2, w)) y1 = max(0, min(y1, h)) y2 = max(0, min(y2, h)) - - # Skip invalid boxes if x2 <= x1 or y2 <= y1: self.get_logger().warn(f"Skipping invalid bbox: {bbox}") continue - - # Crop and convert region - region_bgr = self.captured_frame[y1:y2, x1:x2] - region_rgb = cv2.cvtColor(region_bgr, cv2.COLOR_BGR2RGB) - pil_image = PILImage.fromarray(region_rgb) - - # Use OpenAI CLIP for similarity computation - image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize([target_label]).to(self.device) - - with torch.no_grad(): - image_features = self.model.encode_image(image_input) - text_features = self.model.encode_text(text_tokens) - - # Normalize features - image_features = image_features / image_features.norm(dim=-1, keepdim=True) - text_features = text_features / text_features.norm(dim=-1, keepdim=True) - - # Compute cosine similarity (normalized dot product) - similarity = (image_features @ text_features.T)[0, 0].item() - - self.get_logger().debug(f"Region {region['region_id']}: similarity = {similarity:.4f}") - - # Track best match - if similarity > best_similarity: - best_similarity = similarity - best_match = { - 'bbox': bbox, - 'confidence': similarity, - 'region_id': region['region_id'] - } - - # Check if any match was found - if best_match is None: + # Pre-filter by raw cosine similarity so obviously wrong regions + # don't pollute the softmax pool. + raw_sim = self._compute_similarity( + self.captured_frame[y1:y2, x1:x2], target_label + ) + if raw_sim < self.clip_min_confidence: + self.get_logger().debug( + f"Region {region['region_id']} discarded: raw_sim={raw_sim:.4f} " + f"< clip_min_confidence={self.clip_min_confidence}" + ) + continue + valid_regions.append({ + 'bbox': [x1, y1, x2, y2], + 'region_id': region['region_id'], + }) + crops.append(self.captured_frame[y1:y2, x1:x2]) + + # Check if any valid region survived the pre-filter + if not valid_regions: response.success = False response.message = f"No valid regions found to compare with '{target_label}'" response.bbox = [] response.confidence = 0.0 self.latest_found_object = None - self.get_logger().info(f"No valid regions to compare") + self.get_logger().info("No valid regions to compare") return response - - # Optional: Set a minimum similarity threshold - min_similarity_threshold = 0.2 # Adjust based on your needs - if best_match['confidence'] < min_similarity_threshold: + + # --- Pass 2: inter-region softmax → proper probabilities ---- + # Each crop gets a probability in [0, 1] that sums to 1 across + # all regions. A high score means "this region most likely + # contains the queried label", which is a meaningful confidence. + probs = self._compute_inter_region_confidences(crops, target_label) + + for i, region_info in enumerate(valid_regions): + region_info['confidence'] = probs[i] + self.get_logger().debug( + f"Region {region_info['region_id']}: confidence={probs[i]:.4f}" + ) + + # Best region = highest softmax probability + best_match = max(valid_regions, key=lambda r: r['confidence']) + + # Minimum confidence guard (now in softmax probability space). + # With clip_min_confidence already filtering raw similarities, + # this catches the degenerate single-region case where probability + # is trivially 1.0 but the raw match was borderline. + min_prob_threshold = self.clip_min_confidence + if best_match['confidence'] < min_prob_threshold: response.success = False - response.message = f"Label '{target_label}' found but similarity too low ({best_match['confidence']:.3f} < {min_similarity_threshold})" + response.message = ( + f"Label '{target_label}' found but confidence too low " + f"({best_match['confidence']:.3f} < {min_prob_threshold})" + ) response.bbox = [] response.confidence = float(best_match['confidence']) self.latest_found_object = None - self.get_logger().info(f"Label '{target_label}' similarity too low: {best_match['confidence']:.3f}") + self.get_logger().info( + f"Label '{target_label}' confidence too low: {best_match['confidence']:.3f}" + ) return response - + # Store for visualization self.latest_found_object = { 'label': target_label, @@ -1077,17 +1157,17 @@ def find_object_callback(self, request, response): 'confidence': best_match['confidence'], 'region_id': best_match['region_id'] } - + # Return success with bbox response.success = True - response.message = f"Found '{target_label}' with similarity {best_match['confidence']:.3f}" + response.message = f"Found '{target_label}' with confidence {best_match['confidence']:.3f}" response.bbox = best_match['bbox'] response.confidence = float(best_match['confidence']) # Note: FindObject.srv doesn't have object_id field (only FindObjectReal.srv does) - + self.get_logger().info( f"Found '{target_label}': bbox={best_match['bbox']}, " - f"similarity={best_match['confidence']:.3f}, region_id={best_match['region_id']}" + f"confidence={best_match['confidence']:.3f}, region_id={best_match['region_id']}" ) except Exception as e: @@ -1117,21 +1197,18 @@ def _classify_image(self, rgb_image: np.ndarray) -> Dict: # Convert BGR to RGB rgb = cv2.cvtColor(rgb_image, cv2.COLOR_BGR2RGB) pil_image = PILImage.fromarray(rgb) - - # Prepare inputs using OpenAI CLIP + + # Prompt template improves accuracy over bare label names (CLIP paper) + prompted_labels = [f"a photo of a {lbl}" for lbl in self.candidate_labels] + image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize(self.candidate_labels).to(self.device) - - # Get predictions + text_tokens = clip.tokenize(prompted_labels).to(self.device) + with torch.no_grad(): image_features = self.model.encode_image(image_input) text_features = self.model.encode_text(text_tokens) - - # Normalize features image_features = image_features / image_features.norm(dim=-1, keepdim=True) text_features = text_features / text_features.norm(dim=-1, keepdim=True) - - # Calculate similarity (logits) logits_per_image = (100.0 * image_features @ text_features.T) probs = logits_per_image.softmax(dim=-1)[0] @@ -1190,61 +1267,60 @@ def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> D start_time = time.time() classified_regions = [] - + + # Encode text once for all regions. The candidate labels never change + # within a call, so there is no reason to re-tokenize per region. + # Prompt template "a photo of a {label}" matches CLIP's training + # distribution far better than bare label names and measurably improves + # zero-shot accuracy (CLIP paper: +~13 pp on ImageNet). + prompted_labels = [f"a photo of a {lbl}" for lbl in self.candidate_labels] + with torch.no_grad(): + text_tokens = clip.tokenize(prompted_labels).to(self.device) + text_features = self.model.encode_text(text_tokens) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + h, w = rgb_image.shape[:2] + for region_id, bbox in enumerate(bboxes): x1, y1, x2, y2 = bbox - + # Clamp bbox to image bounds - h, w = rgb_image.shape[:2] x1 = max(0, min(x1, w)) x2 = max(0, min(x2, w)) y1 = max(0, min(y1, h)) y2 = max(0, min(y2, h)) - + # Skip invalid boxes if x2 <= x1 or y2 <= y1: self.get_logger().warn(f"Skipping invalid bbox: {bbox}") continue - + # Crop region region_bgr = rgb_image[y1:y2, x1:x2] - - # Convert BGR to RGB region_rgb = cv2.cvtColor(region_bgr, cv2.COLOR_BGR2RGB) pil_image = PILImage.fromarray(region_rgb) - - # Prepare inputs using OpenAI CLIP + image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize(self.candidate_labels).to(self.device) - - # Get predictions + with torch.no_grad(): image_features = self.model.encode_image(image_input) - text_features = self.model.encode_text(text_tokens) - - # Normalize features image_features = image_features / image_features.norm(dim=-1, keepdim=True) - text_features = text_features / text_features.norm(dim=-1, keepdim=True) - - # Calculate similarity (logits) + + # Intra-label softmax: probability over the closed label set. + # Range [0, 1], sums to 1 across labels for this region. logits_per_image = (100.0 * image_features @ text_features.T) probs = logits_per_image.softmax(dim=-1)[0] - - # Convert to numpy + probs_np = probs.cpu().numpy() - - # Sort predictions by confidence sorted_indices = np.argsort(probs_np)[::-1] - - # Build predictions list + all_predictions = [] for idx in sorted_indices: all_predictions.append({ "label": self.candidate_labels[idx], "confidence": round(float(probs_np[idx]), 2) }) - - # Build region result + region_result = { "region_id": region_id, "bbox": [int(x1), int(y1), int(x2), int(y2)], @@ -1252,9 +1328,9 @@ def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> D "label": all_predictions[0]["label"], "confidence": all_predictions[0]["confidence"] }, - "all_predictions": all_predictions[:10] # Top 10 + "all_predictions": all_predictions[:10] } - + classified_regions.append(region_result) # Calculate processing time diff --git a/vision/find_object_service_node.py b/vision/find_object_service_node.py index 31ec056..aca35af 100644 --- a/vision/find_object_service_node.py +++ b/vision/find_object_service_node.py @@ -9,6 +9,10 @@ ros2 service call /find_object custom_interfaces/srv/FindObjectReal "{label: 'bowl'}" confidence calculate by SAM detection based + +json file kept at +/home/group11/final_project_ws/install/vision/lib/python3.12/site-packages/find_object_history.json + """ # TCP_OFFSET = 0.157 # Actual TCP_OFFSET value from teach pendant @@ -23,6 +27,10 @@ def __init__(self): self.declare_parameter('tcp_offset', False) self.tcp_offset = bool(self.get_parameter('tcp_offset').value) + # Minimum confidence threshold for find_object responses + self.declare_parameter('find_object_min_confidence', 0.255) + self.find_object_min_confidence = float(self.get_parameter('find_object_min_confidence').value) + # Use reentrant callback group to allow nested service calls self.callback_group = ReentrantCallbackGroup() @@ -207,6 +215,26 @@ def find_object_callback(self, request, response): response.z = 0.0 response.theta = 0.0 return response + + if find_response.confidence < self.find_object_min_confidence: + response.success = False + response.message = ( + f'Object confidence too low ({find_response.confidence:.3f} < ' + f'{self.find_object_min_confidence:.3f}); cannot reliably find object in camera' + ) + response.object_id = '' + response.bbox = [] + response.confidence = float(find_response.confidence) + response.x = 0.0 + response.y = 0.0 + response.z = 0.0 + response.theta = 0.0 + self.get_logger().warn( + f'find_object confidence below threshold: {find_response.confidence:.3f} ' + f'(threshold={self.find_object_min_confidence:.3f})' + ) + self._log_find_object_call(label, response) + return response # Extract object_id from find_response object_id = find_response.object_id if hasattr(find_response, 'object_id') else '' diff --git a/vision/simple_sam_detector.py b/vision/simple_sam_detector.py index fdff486..ca05e1d 100755 --- a/vision/simple_sam_detector.py +++ b/vision/simple_sam_detector.py @@ -689,14 +689,17 @@ def detect_objects_callback(self, request, response): clip_info = clip_classifications.get(idx) if clip_info: - # Use CLIP label and confidence + # Confidence = CLIP softmax probability over candidate labels [0, 1] object_ids.append(f"{clip_info['label']}_{idx}") confidences.append(float(clip_info['confidence'])) self.get_logger().info(f" Region {idx}: {clip_info['label']} (CLIP confidence: {clip_info['confidence']:.2f})") else: - # Use SAM generic label + # No CLIP result — fall back to SAM shape score (circularity-based, + # range [0.50, 0.95]). This is NOT a semantic confidence; it only + # reflects contour regularity. object_ids.append(det['id']) confidences.append(float(det['confidence'])) + self.get_logger().debug(f" Region {idx}: no CLIP result, using SAM shape score={det['confidence']:.2f}") bbox = det['bbox'] bbox_x1.append(bbox[0]) @@ -1104,12 +1107,17 @@ def _detect_objects(self, rgb_image: np.ndarray) -> List[Dict]: mask = np.zeros((h, w), dtype=np.uint8) cv2.drawContours(mask, [contour], -1, 255, -1) - # Calculate confidence based on contour properties + # Shape-based segmentation score (NOT a model prediction probability). + # Circularity (4π·area / perimeter²) measures how "blob-like" the + # contour is: 1.0 = perfect circle, <1.0 = irregular shape. + # Score range: [0.50, 0.95]. This is used as a proxy for + # segmentation quality when no model IoU score is available. + # Downstream consumers should treat this as `sam_shape_score`, + # NOT as a semantic classification confidence. perimeter = cv2.arcLength(contour, True) circularity = 4 * np.pi * area / (perimeter * perimeter) if perimeter > 0 else 0 - confidence = min(0.95, 0.50 + circularity * 0.45) # More lenient baseline - - # Relaxed confidence threshold (was 0.4, now 0.3) + confidence = min(0.95, 0.50 + circularity * 0.45) + if confidence <= 0.3: continue diff --git a/vision/unified_pipeline.py b/vision/unified_pipeline.py index e47cefa..f3a9111 100644 --- a/vision/unified_pipeline.py +++ b/vision/unified_pipeline.py @@ -287,7 +287,10 @@ def _call_clip_classification(self) -> Dict[int, Dict]: clip_map[region_id] = { "label": region.get('label'), "confidence": float(region.get('confidence', 0.0)), - "is_top1_accurate": region.get('confidence', 0.0) >= 0.5 # Top-1 if confidence >= 0.5 + # confidence here is a softmax probability over candidate labels. + # >= 0.5 means the model assigns more probability to this label + # than to all other candidates combined — a high-precision bar. + "is_top1_accurate": region.get('confidence', 0.0) >= 0.5 } return clip_map From f903d3441381aa1b10c3edf939f1cb21fd85d6ec Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Sat, 11 Apr 2026 21:42:58 +0700 Subject: [PATCH 15/16] feat: add clip and obb to dashboard --- dashboard/index.html | 267 ++++++++++++++++++++++++++++--- vision/benchmark_dashboard.py | 104 ++++++++++-- vision/clip_classifier.py | 138 +++++++++++++--- vision/obb_angle_service_node.py | 66 +++++++- 4 files changed, 514 insertions(+), 61 deletions(-) diff --git a/dashboard/index.html b/dashboard/index.html index 241e612..65cfee0 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -764,6 +764,46 @@
+ + +
+
+ /obb/find_object_angle_bb — Direct Call History + 0 +
+
+
+
+
Latest Angle (°)
+
+
+
+
Avg Angle (°)
+
+
+
+
Latest Object
+
+
+
+ + + + + + + + + + + + + + + +
#Object IDAngle (°)Theta (rad)Center (u, v)Size (w × h px)Input BBoxIoUTimestamp
No data — call ros2 service call /obb/find_object_angle_bb ...
+
+
@@ -834,17 +874,59 @@ - + + - +
Test IDLabelCLIP Label Confidence Top-1 AccuracyHuman Verdict Bounding Box Timestamp
No CLIP data — start /vision/classify_bbox_filtered
No CLIP data — call ros2 service call /vision/classify_bbox_filtered std_srvs/srv/Trigger
+ + +
+
+ /vision/classify_all — Direct Call History + 0 +
+
+
+
+
Latest Top Label
+
+
+
+
Latest Confidence
+
+
+
+
Avg Confidence
+
+
+
+
Avg Latency (s)
+
+
+
+ + + + + + + + + + + + +
#Top LabelConfidenceTop-5 PredictionsLatency (s)Timestamp
No data — call ros2 service call /vision/classify_all std_srvs/srv/Trigger
+
+
@@ -1024,10 +1106,18 @@ // ── /api/data ───────────────────────────────────────────────────────────── let lastData = null; + // Holds latest filtered records so updateDashboard can merge them + let _latestFilteredRecords = []; + async function fetchData() { try { const res = await fetch('/api/data'); lastData = await res.json(); + // Also refresh filtered history in the same tick + try { + const fr = await fetch('/api/classify-filtered-history'); + _latestFilteredRecords = await fr.json(); + } catch (_) { _latestFilteredRecords = []; } updateDashboard(lastData); } catch (e) { console.warn('Error fetching /api/data:', e); @@ -1035,22 +1125,26 @@ } function updateDashboard(data) { + // Merge /vision/classify_bbox_filtered records directly into clip table + const clipRecords = (data.clip_classifications || []).concat(_latestFilteredRecords); + const clipCount = clipRecords.length; + document.getElementById('totalCalls').textContent = data.metadata.total_calls; document.getElementById('pixelCount').textContent = data.pixel_to_real.length; document.getElementById('samCount').textContent = data.sam_detections.length; - document.getElementById('clipCount').textContent = data.clip_classifications.length; + document.getElementById('clipCount').textContent = clipCount; document.getElementById('graspCount').textContent = data.grasp_detections.length; document.getElementById('sceneCount').textContent = data.scene_understanding.length; setNav('nav-sam', data.sam_detections.length); - setNav('nav-clip', data.clip_classifications.length); + setNav('nav-clip', clipCount); setNav('nav-grasp', data.grasp_detections.length); setNav('nav-scene', data.scene_understanding.length); setNav('nav-pixel', data.pixel_to_real.length); updatePixelToReal(data.pixel_to_real); updateSAMDetections(data.sam_detections); - updateCLIPClassifications(data.clip_classifications); + updateCLIPClassifications(clipRecords); updateGraspDetections(data.grasp_detections); updateSceneUnderstanding(data.scene_understanding); @@ -1127,32 +1221,71 @@ `).join(''); } + // ── CLIP verdict buttons ────────────────────────────────────────────────── + async function clipVerdict(testId, verdict) { + try { + await fetch('/api/clip-verdict', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({test_id: testId, verdict}) + }); + // Refresh immediately so button state updates + _latestFilteredRecords = []; + try { + const fr = await fetch('/api/classify-filtered-history'); + _latestFilteredRecords = await fr.json(); + } catch (_) {} + if (lastData) updateCLIPClassifications( + (lastData.clip_classifications || []).concat(_latestFilteredRecords) + ); + } catch (e) { console.warn('clip verdict error:', e); } + } + // ── CLIP Classifications ────────────────────────────────────────────────── function updateCLIPClassifications(records) { document.getElementById('clipBadge').textContent = `${records.length} Records`; const tbody = document.getElementById('clipBody'); if (!records.length) { - tbody.innerHTML = 'No CLIP data — start /vision/classify_bbox_filtered'; + tbody.innerHTML = 'No CLIP data — call ros2 service call /vision/classify_bbox_filtered std_srvs/srv/Trigger'; ['clipTop1','clipAvgConf'].forEach(id => document.getElementById(id).textContent = '—'); return; } - const withAcc = records.filter(r => r.top1_accuracy !== null); - const top1Acc = withAcc.length - ? (withAcc.filter(r => r.top1_accuracy).length / withAcc.length * 100).toFixed(1) + '%' - : 'N/A'; + // Top-1 accuracy computed only from records where human has given verdict + const withVerdict = records.filter(r => r.top1_accuracy !== null && r.top1_accuracy !== undefined); + const top1Acc = withVerdict.length + ? (withVerdict.filter(r => r.top1_accuracy).length / withVerdict.length * 100).toFixed(1) + '%' + : 'Pending'; const avgConf = records.reduce((s, r) => s + r.confidence, 0) / records.length; - document.getElementById('clipTop1').textContent = top1Acc; + document.getElementById('clipTop1').textContent = top1Acc; document.getElementById('clipAvgConf').textContent = avgConf.toFixed(3); - tbody.innerHTML = records.slice(-50).reverse().map(r => ` - ${r.test_id} - ${r.label} - ${confBar(r.confidence)} - ${formatAccuracy(r.top1_accuracy)} - ${JSON.stringify(r.bbox)} - ${tsShort(r.timestamp)} - `).join(''); + tbody.innerHTML = records.slice(-50).reverse().map(r => { + // Verdict display + let verdictCell; + if (r.top1_accuracy === true) { + verdictCell = `✓ Correct + `; + } else if (r.top1_accuracy === false) { + verdictCell = `✗ Wrong + `; + } else { + verdictCell = ` + `; + } + const bboxStr = r.bbox ? (typeof r.bbox === 'object' && !Array.isArray(r.bbox) + ? `(${r.bbox.x1},${r.bbox.y1})→(${r.bbox.x2},${r.bbox.y2})` + : JSON.stringify(r.bbox)) : '—'; + return ` + ${r.test_id} + ${r.label} + ${confBar(r.confidence)} + ${r.top1_accuracy === true ? 'TRUE' : r.top1_accuracy === false ? 'FALSE' : ''} + ${verdictCell} + ${bboxStr} + ${tsShort(r.timestamp)} + `; + }).join(''); } // ── Grasp Detections ────────────────────────────────────────────────────── @@ -1386,6 +1519,94 @@ }).join(''); } + // ── /vision/classify_all history ───────────────────────────────────────── + async function fetchClassifyAllHistory() { + try { + const res = await fetch('/api/classify-all-history'); + if (!res.ok) return; + const records = await res.json(); + updateClassifyAllHistory(records); + } catch (_) {} + } + + function updateClassifyAllHistory(records) { + document.getElementById('caCount').textContent = records.length; + const tbody = document.getElementById('caBody'); + if (!records.length) { + tbody.innerHTML = 'No data — call ros2 service call /vision/classify_all std_srvs/srv/Trigger'; + ['caLatestLabel','caLatestConf','caAvgConf','caAvgLatency'].forEach(id => document.getElementById(id).textContent = '—'); + return; + } + const latest = records[records.length - 1]; + document.getElementById('caLatestLabel').textContent = latest.top_label || '—'; + document.getElementById('caLatestConf').textContent = latest.top_confidence != null ? (latest.top_confidence * 100).toFixed(1) + '%' : '—'; + const avgConf = records.reduce((s, r) => s + (r.top_confidence || 0), 0) / records.length; + document.getElementById('caAvgConf').textContent = (avgConf * 100).toFixed(1) + '%'; + const avgLat = records.reduce((s, r) => s + (r.latency_s || 0), 0) / records.length; + document.getElementById('caAvgLatency').textContent = avgLat.toFixed(3) + ' s'; + + tbody.innerHTML = records.slice(-50).reverse().map(r => { + const top5 = (r.all_predictions || []).slice(0, 5).map(p => + `${p.label} ${(p.confidence*100).toFixed(0)}%` + ).join(' '); + return ` + ${r.call_id} + ${r.top_label} + ${confBar(r.top_confidence)} + ${top5} + ${r.latency_s != null ? r.latency_s.toFixed(3) : '—'} + ${tsShort(r.timestamp)} + `; + }).join(''); + } + + // ── /obb/find_object_angle_bb history ──────────────────────────────────── + async function fetchObbBBHistory() { + try { + const res = await fetch('/api/obb-bb-history'); + if (!res.ok) return; + const records = await res.json(); + updateObbBBHistory(records); + } catch (_) {} + } + + function updateObbBBHistory(records) { + document.getElementById('obbBBCount').textContent = records.length; + const tbody = document.getElementById('obbBBBody'); + if (!records.length) { + tbody.innerHTML = 'No data — call ros2 service call /obb/find_object_angle_bb ...'; + ['obbBBLatestAngle','obbBBAvgAngle','obbBBLatestObject'].forEach(id => document.getElementById(id).textContent = '—'); + return; + } + const latest = records[records.length - 1]; + document.getElementById('obbBBLatestAngle').textContent = latest.angle_deg != null ? latest.angle_deg.toFixed(1) + '°' : '—'; + document.getElementById('obbBBLatestObject').textContent = latest.object_id || '—'; + const avgAngle = records.reduce((s, r) => s + (r.angle_deg || 0), 0) / records.length; + document.getElementById('obbBBAvgAngle').textContent = avgAngle.toFixed(1) + '°'; + + tbody.innerHTML = records.slice(-50).reverse().map(r => { + const pct = Math.round(((r.angle_deg + 90) / 180) * 100); + const barColor = Math.abs(r.angle_deg) < 20 ? 'var(--ok)' : Math.abs(r.angle_deg) < 60 ? 'var(--warn)' : 'var(--fail)'; + const bb = r.input_bbox || []; + return ` + ${r.call_id} + ${r.object_id ?? '—'} + +
+
+ ${r.angle_deg.toFixed(1)}° +
+ + ${r.theta_rad != null ? r.theta_rad.toFixed(4) : '—'} + ${r.center_u != null ? Math.round(r.center_u) : '—'}, ${r.center_v != null ? Math.round(r.center_v) : '—'} + ${r.width_px != null ? Math.round(r.width_px) : '—'} × ${r.height_px != null ? Math.round(r.height_px) : '—'} + ${bb.length === 4 ? `(${bb[0]},${bb[1]})→(${bb[2]},${bb[3]})` : '—'} + ${r.iou != null ? r.iou.toFixed(3) : '—'} + ${tsShort(r.timestamp)} + `; + }).join(''); + } + // ── Clear data (legacy) ─────────────────────────────────────────────────── async function clearData() { if (confirm('Clear all in-session benchmark data?')) { @@ -1394,10 +1615,14 @@ } // ── Polling ─────────────────────────────────────────────────────────────── - setInterval(fetchData, 2000); - setInterval(fetchRunHistory, 2000); + setInterval(fetchData, 2000); + setInterval(fetchRunHistory, 2000); + setInterval(fetchClassifyAllHistory, 3000); + setInterval(fetchObbBBHistory, 3000); fetchData(); fetchRunHistory(); + fetchClassifyAllHistory(); + fetchObbBBHistory(); diff --git a/vision/benchmark_dashboard.py b/vision/benchmark_dashboard.py index 67ef805..4958021 100644 --- a/vision/benchmark_dashboard.py +++ b/vision/benchmark_dashboard.py @@ -16,6 +16,10 @@ Usage: ros2 run vision benchmark_dashboard + + ros2 service call /benchmark/clear_data std_srvs/srv/Trigger + + Then open browser: http://localhost:8080 """ @@ -317,19 +321,11 @@ def add_grasp_detection_record(self, grasp_pose): self.data['metadata']['total_calls'] += 1 def sync_from_run_history(self): - """Sync CLIP, GraspNet, OBB, and Pixel-to-Real data from vision_runs_history.json - so the /api/data endpoint (CLIP / GraspNet / Pixel-to-Real sections) stays populated - even when those dedicated service nodes are not running.""" + """Sync CLIP, GraspNet, OBB, and Pixel-to-Real data from history files + so the /api/data endpoint stays populated even when service nodes are not running.""" try: from pathlib import Path - history_file = Path(__file__).parent.parent / 'vision_runs_history.json' - if not history_file.exists(): - return - - with open(history_file, 'r') as f: - runs = json.load(f) - if not isinstance(runs, list) or not runs: - return + package_path = Path(__file__).parent.parent timestamp = datetime.now().isoformat() @@ -339,6 +335,17 @@ def sync_from_run_history(self): new_pixel = [] new_obb = [] + history_file = package_path / 'vision_runs_history.json' + runs = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + data = json.load(f) + if isinstance(data, list): + runs = data + except Exception: + runs = [] + for run in runs: run_ts = run.get('meta', {}).get('timestamp', timestamp) run_no = run.get('meta', {}).get('run_no', 0) @@ -416,6 +423,17 @@ def sync_from_run_history(self): obj.get('bbox_x2', 0), obj.get('bbox_y2', 0)], }) + # Also merge records from /vision/classify_bbox_filtered direct calls + filtered_file = Path(__file__).parent.parent / 'classify_filtered_history.json' + if filtered_file.exists(): + try: + with open(filtered_file, 'r') as f: + filtered_records = json.load(f) + if filtered_records: + new_clip = (new_clip + filtered_records)[-1000:] + except Exception: + pass + # Only update if we got new data (avoids overwriting live topic data with empty) if new_clip: self.data['clip_classifications'] = new_clip[-1000:] @@ -436,7 +454,7 @@ def publish_data(self): self.data_publisher.publish(msg) def clear_data_callback(self, request, response): - """Clear all benchmark data""" + """Clear all benchmark data (in-memory + persistent JSON files)""" self.data = { 'pixel_to_real': [], 'sam_detections': [], @@ -450,10 +468,27 @@ def clear_data_callback(self, request, response): } } + # Also wipe the persistent history files so sync_from_run_history + # doesn't immediately repopulate from stale data + package_path = Path(__file__).parent.parent + files_to_clear = [ + 'vision_runs_history.json', + 'classify_filtered_history.json', + 'classify_all_history.json', + 'obb_bb_history.json', + ] + for fname in files_to_clear: + fpath = package_path / fname + if fpath.exists(): + try: + with open(fpath, 'w') as f: + json.dump([], f) + except Exception as e: + self.get_logger().warn(f'Could not clear {fname}: {e}') + response.success = True response.message = "Benchmark data cleared" - - self.get_logger().info('Benchmark data cleared') + self.get_logger().info('Benchmark data cleared (memory + files)') return response def start_http_server(self): @@ -532,6 +567,21 @@ def do_GET(self): data = self._read_json_file(fo_file, []) self._json_response(data) + elif self.path == '/api/classify-all-history': + ca_file = package_path / 'classify_all_history.json' + data = self._read_json_file(ca_file, []) + self._json_response(data) + + elif self.path == '/api/classify-filtered-history': + cf_file = package_path / 'classify_filtered_history.json' + data = self._read_json_file(cf_file, []) + self._json_response(data) + + elif self.path == '/api/obb-bb-history': + obb_file = package_path / 'obb_bb_history.json' + data = self._read_json_file(obb_file, []) + self._json_response(data) + else: super().do_GET() @@ -555,9 +605,35 @@ def do_POST(self): self._write_json_file(fo_file, []) self._json_response({'ok': True}) + elif self.path == '/api/clip-verdict': + self._handle_clip_verdict(payload) + else: self._json_response({'error': 'unknown endpoint'}, 404) + # ── /api/clip-verdict ───────────────────────────────────────── + def _handle_clip_verdict(self, payload): + """Set human-in-the-loop top1_accuracy verdict for a CLIP record.""" + test_id = payload.get('test_id') + verdict = payload.get('verdict') # True / False + if test_id is None or verdict is None: + self._json_response({'error': 'test_id and verdict required'}, 400) + return + + cf_file = package_path / 'classify_filtered_history.json' + history = self._read_json_file(cf_file, []) + updated = False + for entry in history: + if entry.get('test_id') == test_id: + entry['top1_accuracy'] = bool(verdict) + updated = True + break + if updated: + self._write_json_file(cf_file, history) + self._json_response({'ok': True}) + else: + self._json_response({'error': f'test_id {test_id} not found'}, 404) + # ── /api/find-object ────────────────────────────────────────── def _handle_find_object(self, payload): """Call /find_object ROS2 service and persist result.""" diff --git a/vision/clip_classifier.py b/vision/clip_classifier.py index 312e3db..9004e42 100755 --- a/vision/clip_classifier.py +++ b/vision/clip_classifier.py @@ -129,6 +129,14 @@ def __init__(self, candidate_labels: List[str] = None): # "cobot", "green_cube", "drill", + "remote_control", + "orange_cube", + "orange_cylinder", + "arduino_board", + "mouse", + "light_blue_cube", + "blue_star", + "purple_triangle", "pink_cube", "measuring_tape", "screwdriver", @@ -141,11 +149,11 @@ def __init__(self, candidate_labels: List[str] = None): # "door_handle", # "red_ball", # "gasket_part", - "beer_can", + # "beer_can", "bowl", - "cinder_block", - "coke_can", - "roomba", + # "cinder_block", + # "coke_can", + # "roomba", # "plastic_cup", # "hammer", # "robotic_arm", @@ -387,13 +395,16 @@ def classify_all_callback(self, request, response): response.success = True response.message = json.dumps(classification_data, indent=2) - + top_pred = classification_data['output']['top_prediction'] self.get_logger().info( f"Classification complete: {top_pred['label']} " f"(confidence: {top_pred['confidence']:.2f})" ) - + + # Persist result for dashboard + self._save_classify_all_record(classification_data, latency_s=(time.perf_counter() - start)) + except Exception as e: response.success = False response.message = json.dumps({ @@ -563,17 +574,31 @@ def classify_bbox_filtered_callback(self, request, response): self.get_logger().error("CLIP model not available") return response - # Check if we have classified regions from SAM subscription + # If no SAM regions cached, trigger a fresh detection + classification now if not self.latest_region_classifications: - response.success = False - response.message = json.dumps({ - "error": "No classified regions available. Call '/vision/run_pipeline' first to trigger SAM detection.", - "hint": "ros2 service call /vision/run_pipeline std_srvs/srv/Trigger", - "timestamp": datetime.utcnow().isoformat() + "Z" - }, indent=2) - self.get_logger().warn("No classified regions. Run SAM pipeline first.") - return response - + self.get_logger().info("No cached regions — calling /vision/detect_objects automatically...") + bboxes, err = self._call_detect_objects() + if err: + response.success = False + response.message = json.dumps({ + "error": err, + "hint": "Ensure simple_sam_detector is running", + "timestamp": datetime.utcnow().isoformat() + "Z" + }, indent=2) + self.get_logger().warn(f"Detection failed: {err}") + return response + if not bboxes: + response.success = False + response.message = json.dumps({ + "error": "No objects detected in the scene", + "timestamp": datetime.utcnow().isoformat() + "Z" + }, indent=2) + self.get_logger().warn("No bboxes returned from detection") + return response + self.get_logger().info(f"Got {len(bboxes)} bboxes from detection — classifying with CLIP...") + classification_data = self._classify_regions(self.captured_frame, bboxes) + self.latest_region_classifications = classification_data['output']['classified_regions'] + self.get_logger().info(f"Filtering {len(self.latest_region_classifications)} classified regions by confidence > 0.5") # Filter regions by confidence >= 0.5 @@ -602,18 +627,21 @@ def classify_bbox_filtered_callback(self, request, response): response.success = True response.message = json.dumps(result, indent=2) - + self.get_logger().info( f"Filtered classification complete: {len(filtered_regions)}/{len(self.latest_region_classifications)} " f"regions passed confidence threshold" ) - + # Log each filtered region for region in filtered_regions: self.get_logger().info( f"Region #{region['region_id']}: {region['label']} " f"(confidence: {region['confidence']:.2f})" ) + + # Persist to file so benchmark_dashboard can show results in CLIP table + self._save_filtered_records(filtered_regions) except Exception as e: response.success = False @@ -1252,7 +1280,79 @@ def _classify_image(self, rgb_image: np.ndarray) -> Dict: } return schema - + + def _save_classify_all_record(self, classification_data: Dict, latency_s: float = 0.0): + """Append a /vision/classify_all result to classify_all_history.json for the dashboard.""" + try: + from pathlib import Path + history_file = Path(__file__).parent.parent / 'classify_all_history.json' + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + history = json.load(f) + except Exception: + history = [] + + top_pred = classification_data.get('output', {}).get('top_prediction', {}) + all_preds = classification_data.get('output', {}).get('all_predictions', []) + meta = classification_data.get('output', {}).get('metadata', {}) + + record = { + 'call_id': len(history) + 1, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'top_label': top_pred.get('label', ''), + 'top_confidence': float(top_pred.get('confidence', 0.0)), + 'all_predictions': all_preds[:10], # store top-10 + 'processing_time_ms': meta.get('processing_time_ms', 0), + 'latency_s': round(latency_s, 4), + 'device': meta.get('device', ''), + } + + history.append(record) + # Keep only the last 500 records + if len(history) > 500: + history = history[-500:] + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to save classify_all record: {e}') + + def _save_filtered_records(self, filtered_regions: list): + """Append /vision/classify_bbox_filtered results to classify_filtered_history.json. + Each region becomes one record in the format the dashboard CLIP table expects.""" + try: + from pathlib import Path + history_file = Path(__file__).parent.parent / 'classify_filtered_history.json' + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + history = json.load(f) + except Exception: + history = [] + + timestamp = datetime.utcnow().isoformat() + 'Z' + call_id_base = len(history) + 1 + for i, region in enumerate(filtered_regions): + history.append({ + 'test_id': call_id_base + i, + 'timestamp': timestamp, + 'label': region['label'], + 'confidence': float(region['confidence']), + 'top1_accuracy': None, # set by human-in-the-loop verdict + 'bbox': region.get('bbox', {}), + }) + + if len(history) > 1000: + history = history[-1000:] + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to save filtered records: {e}') + def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> Dict: """ Classify multiple image regions using CLIP model diff --git a/vision/obb_angle_service_node.py b/vision/obb_angle_service_node.py index 649ee0b..0518fa8 100644 --- a/vision/obb_angle_service_node.py +++ b/vision/obb_angle_service_node.py @@ -42,10 +42,13 @@ from custom_interfaces.msg import SAMDetections from sensor_msgs.msg import Image, CameraInfo from cv_bridge import CvBridge +import json import numpy as np import cv2 import time import threading +from datetime import datetime +from pathlib import Path class OBBAngleServiceNode(Node): @@ -715,20 +718,34 @@ def find_object_angle_bb_callback(self, request, response): theta_result = np.deg2rad(angle_result_deg) # Populate response (use remapped theta, keep geometry unchanged) + angle_deg = angle_result_deg response.success = True - response.message = f'OBB calculated for {best_detection.object_id} within bbox [{request.x1}, {request.y1}, {request.x2}, {request.y2}]' + response.message = json.dumps({ + "success": True, + "object_id": best_detection.object_id, + "input_bbox": [request.x1, request.y1, request.x2, request.y2], + "center": {"u": round(u, 2), "v": round(v, 2)}, + "theta_rad": round(float(theta_result), 6), + "angle_deg": round(float(angle_deg), 2), + "width_px": round(float(width), 2), + "height_px": round(float(height), 2), + "iou_with_request": round(float(best_iou), 4), + "timestamp": datetime.utcnow().isoformat() + "Z", + }) response.u = u response.v = v response.theta = theta_result response.width = width response.height = height - + # Log results concisely - angle_deg = angle_result_deg self.get_logger().info('=' * 60) self.get_logger().info(f'OBB Result ({best_detection.object_id}): center=({u:.1f},{v:.1f}), angle={angle_deg:.1f}deg, size={width:.0f}x{height:.0f}') self.get_logger().info('=' * 60) - + + # Persist to history file for dashboard + self._save_obb_bb_record(best_detection.object_id, request, u, v, theta_result, angle_deg, width, height, best_iou) + # Queue visualization; do not block service response path. viz_data = [(best_detection.object_id, u, v, theta_geom, width, height, [request.x1, request.y1, request.x2, request.y2])] self.queue_visualization(viz_data, mode="single") @@ -738,15 +755,50 @@ def find_object_angle_bb_callback(self, request, response): import traceback self.get_logger().error(traceback.format_exc()) response.success = False - response.message = f'Internal error: {str(e)}' + response.message = json.dumps({"success": False, "error": str(e)}) response.u = 0.0 response.v = 0.0 response.theta = 0.0 response.width = 0.0 response.height = 0.0 - + return response - + + def _save_obb_bb_record(self, object_id, request, u, v, theta_rad, angle_deg, width, height, iou): + """Persist /obb/find_object_angle_bb result to obb_bb_history.json for the dashboard.""" + try: + history_file = Path(__file__).parent.parent / 'obb_bb_history.json' + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + history = json.load(f) + except Exception: + history = [] + + record = { + 'call_id': len(history) + 1, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'object_id': object_id, + 'input_bbox': [request.x1, request.y1, request.x2, request.y2], + 'center_u': round(float(u), 2), + 'center_v': round(float(v), 2), + 'theta_rad': round(float(theta_rad), 6), + 'angle_deg': round(float(angle_deg), 2), + 'width_px': round(float(width), 2), + 'height_px': round(float(height), 2), + 'iou': round(float(iou), 4), + } + + history.append(record) + if len(history) > 500: + history = history[-500:] + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to save obb_bb record: {e}') + def find_object_angle_callback(self, request, response): """ Service callback for /obb/find_object_angle From 3fdf50c936dfe69c86836f99a20859d8fdd9acb0 Mon Sep 17 00:00:00 2001 From: Kanisorn-S Date: Sat, 11 Apr 2026 21:57:07 +0700 Subject: [PATCH 16/16] feat: add test for clip in dashboard and fix path clip issue dashboard --- test/test_clip_filtered_dashboard.py | 184 +++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 test/test_clip_filtered_dashboard.py diff --git a/test/test_clip_filtered_dashboard.py b/test/test_clip_filtered_dashboard.py new file mode 100644 index 0000000..995b5a0 --- /dev/null +++ b/test/test_clip_filtered_dashboard.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Unit tests: verify classify_filtered_history.json path, format, and HTTP endpoint. + +Run directly (no ROS needed): + python3 test/test_clip_filtered_dashboard.py +""" + +import json +import unittest +import urllib.request +import urllib.error +from pathlib import Path + +# ── Path constants ──────────────────────────────────────────────────────────── +INSTALL_VISION = Path( + '/home/group11/final_project_ws/install/vision' + '/lib/python3.12/site-packages/vision' +) +INSTALL_PKG = INSTALL_VISION.parent # site-packages/ +SRC_VISION = Path( + '/home/group11/final_project_ws/src/vision/vision' +) +SRC_PKG = SRC_VISION.parent # src/vision/ + +DASHBOARD_URL = 'http://localhost:8080' + + +def _resolve_package_path(py_file: Path) -> Path: + """Replicate Path(__file__).parent.parent used in the nodes.""" + return py_file.parent.parent + + +class TestPathConsistency(unittest.TestCase): + """Writer and reader must resolve to the same JSON file.""" + + def test_install_clip_and_dashboard_same_path(self): + clip_path = _resolve_package_path(INSTALL_VISION / 'clip_classifier.py') \ + / 'classify_filtered_history.json' + dash_path = _resolve_package_path(INSTALL_VISION / 'benchmark_dashboard.py') \ + / 'classify_filtered_history.json' + self.assertEqual(clip_path, dash_path, + f"clip writes to {clip_path} but dashboard reads from {dash_path}") + + def test_src_and_install_expected_dir(self): + """Install package_path == site-packages, NOT src/vision.""" + install_pkg_path = _resolve_package_path(INSTALL_VISION / 'clip_classifier.py') + self.assertIn('site-packages', str(install_pkg_path), + f"install path looks wrong: {install_pkg_path}") + + def test_filtered_file_exists(self): + fpath = INSTALL_PKG / 'classify_filtered_history.json' + self.assertTrue(fpath.exists(), + f"classify_filtered_history.json not found at {fpath}") + + +class TestFilteredFileFormat(unittest.TestCase): + """Each record must have the fields the dashboard expects.""" + + @classmethod + def setUpClass(cls): + fpath = INSTALL_PKG / 'classify_filtered_history.json' + if fpath.exists(): + with open(fpath) as f: + cls.records = json.load(f) + else: + cls.records = [] + + def test_file_not_empty(self): + self.assertGreater(len(self.records), 0, + "classify_filtered_history.json is empty – call " + "/vision/classify_bbox_filtered first") + + def test_required_fields_present(self): + required = {'test_id', 'timestamp', 'label', 'confidence', 'top1_accuracy', 'bbox'} + for i, rec in enumerate(self.records[:5]): + missing = required - rec.keys() + self.assertFalse(missing, + f"Record #{i} missing fields: {missing}\nRecord: {rec}") + + def test_confidence_is_float_in_range(self): + for i, rec in enumerate(self.records[:20]): + c = rec.get('confidence') + self.assertIsInstance(c, float, + f"Record #{i}: confidence should be float, got {type(c)}") + self.assertGreaterEqual(c, 0.0) + self.assertLessEqual(c, 1.0) + + def test_top1_accuracy_is_null_or_bool(self): + for i, rec in enumerate(self.records): + acc = rec.get('top1_accuracy') + self.assertIn(type(acc), (type(None), bool), + f"Record #{i}: top1_accuracy should be None or bool, got {type(acc)}: {acc}") + + def test_bbox_is_list_or_dict(self): + for i, rec in enumerate(self.records[:10]): + bbox = rec.get('bbox') + self.assertIsNotNone(bbox, f"Record #{i}: bbox is None") + self.assertIn(type(bbox), (list, dict), + f"Record #{i}: bbox type {type(bbox)}") + + def test_no_auto_true_verdicts_in_new_records(self): + """Records saved after the human-in-the-loop fix should have top1_accuracy=None.""" + # Find the most recent record – should be null, not auto-True + if not self.records: + self.skipTest("No records to check") + last = self.records[-1] + self.assertIsNone(last.get('top1_accuracy'), + f"Latest record has auto-verdict instead of null: {last}") + + +class TestHTTPEndpoint(unittest.TestCase): + """Verify the dashboard HTTP server exposes /api/classify-filtered-history.""" + + def _get(self, path): + try: + with urllib.request.urlopen(DASHBOARD_URL + path, timeout=5) as r: + return r.status, json.loads(r.read()) + except urllib.error.HTTPError as e: + return e.code, None + except Exception as e: + self.skipTest(f"Dashboard not reachable ({e})") + + def test_endpoint_exists_returns_200(self): + status, data = self._get('/api/classify-filtered-history') + self.assertEqual(status, 200, + f"/api/classify-filtered-history returned {status} – endpoint missing or not deployed") + + def test_endpoint_returns_list(self): + status, data = self._get('/api/classify-filtered-history') + if status != 200: + self.skipTest(f"Endpoint returned {status}") + self.assertIsInstance(data, list, + f"Expected list, got {type(data)}") + + def test_endpoint_data_matches_file(self): + fpath = INSTALL_PKG / 'classify_filtered_history.json' + if not fpath.exists(): + self.skipTest("classify_filtered_history.json not found") + with open(fpath) as f: + file_records = json.load(f) + + status, http_records = self._get('/api/classify-filtered-history') + if status != 200: + self.skipTest(f"Endpoint returned {status}") + + self.assertEqual(len(file_records), len(http_records), + f"File has {len(file_records)} records but endpoint returned {len(http_records)}") + + def test_api_data_merges_filtered(self): + """fetchData() merges clip_classifications + filtered; verify /api/data exists.""" + status, data = self._get('/api/data') + if status != 200: + self.skipTest("/api/data not reachable") + self.assertIn('clip_classifications', data, + "clip_classifications key missing from /api/data") + + def test_clip_verdict_endpoint_accessible(self): + """POST /api/clip-verdict must return 4xx for bad payload, not 404.""" + try: + req = urllib.request.Request( + DASHBOARD_URL + '/api/clip-verdict', + data=b'{}', + headers={'Content-Type': 'application/json'}, + method='POST' + ) + with urllib.request.urlopen(req, timeout=5) as r: + status = r.status + except urllib.error.HTTPError as e: + status = e.code + except Exception as e: + self.skipTest(f"Dashboard not reachable ({e})") + + self.assertNotEqual(status, 404, + "/api/clip-verdict returned 404 – endpoint not deployed in running process") + self.assertIn(status, (200, 400), + f"Unexpected status {status} for /api/clip-verdict") + + +if __name__ == '__main__': + print("=" * 60) + print("CLIP Filtered Dashboard – Path & Endpoint Tests") + print("=" * 60) + unittest.main(verbosity=2)