diff --git a/dashboard/find_object.html b/dashboard/find_object.html new file mode 100644 index 0000000..907379a --- /dev/null +++ b/dashboard/find_object.html @@ -0,0 +1,817 @@ + + + + + + Find Object — Vision Benchmark + + + + + + + + +
+ +
+
+
Find Object — Service Benchmark
+
/find_object · SAM + CLIP + OBB + Pixel-to-Real · Human-in-the-Loop accuracy
+
+
+
+
+ Auto-refresh 3 s +
+ +
+
+ +
+ + +
+
+
0
+
Total Calls
+
+
+
0
+
Succeeded
+
+
+
0
+
Failed
+
+
+
+
HITL Accuracy
+
+
+
0
+
Pending Review
+
+
+
+
Avg Confidence
+
+
+ + +
+
+
+
+
Trigger /find_object Service
+
ros2 service call /find_object custom_interfaces/srv/FindObjectReal "{label: '...'}"
+
+
+
+
+
+
Object Label
+ +
+ +
+
Ready — enter a label and press Call Service.
+ + + +
+
+ + +
+
+
+
+
Human-in-the-Loop Verification
+
Review latest detection result — mark as correct or incorrect
+
+ Waiting for call… +
+
+
+ No result to review yet. Trigger the service above first. +
+ +
+
+ + +
+
+
+
+
Call History
+
find_object_history.json · newest first · max 50 calls
+
+ 0 Calls +
+
+ + + + + + + + + + + + + + + + + + + + +
#TimestampLabelObject IDConfidenceBounding BoxX (m)Y (m)Z (m)θ (deg)Service OKHITL VerdictVerifyMessage
No calls yet — trigger the service above.
+
+
+ + +
+
+
+
+
Accuracy Report
+
Per-label breakdown from human-in-the-loop verdicts
+
+
+
+ + + + + + + + + + + + + + + +
LabelTotal CallsSucceededFailedReviewedCorrectIncorrectHITL AccuracyAvg Confidence
No data yet.
+
+
+ +
+ + +
+ + + + diff --git a/dashboard/index.html b/dashboard/index.html index f8d2cfa..65cfee0 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -3,745 +3,1626 @@ - Vision Benchmark Dashboard + Vision Pipeline Research Dashboard -
-
-

🎯 Vision Benchmark Dashboard

-

Real-time monitoring of ROS2 Vision Pipeline Services

-
- -
-
- Auto-refreshing every 2 seconds - + + + + + +
+ + +
+
+
Vision Pipeline Research Dashboard
+
Segment Anything Model · CLIP · GraspNet · Scene Understanding · ROS 2
+
+
+
+
+ Auto-refresh every 2 s +
+
+
+ +
-
-
-
0
-
Total Service Calls
+ + +
+
+
0
+
Stored Runs
+
Rolling last 50
+
+
+
0
+
SAM Records
+
+
+
0
+
CLIP Records
-
-
0
-
Pixel to Real
+
+
0
+
Grasp Records
-
-
0
-
SAM Detections
+
+
0
+
Scene Records
-
-
0
-
CLIP Classifications
+
+
0
+
Pixel→Real
-
-
0
-
Grasp Detections
+
+
0
+
OBB Objects
+
Latest run
-
-
0
-
Scene Analysis
+
+
0
+
Total Svc Calls
+
This session
- -
-
-

📍 Pixel to Real Conversion

- 0 Records -
-
- - - - - - - - - - - - - - + + +
+
+
+
+
Vision Run Summary
+
vision_runs_history.json · newest first · max 50 runs
+
+ 0 Runs +
+
+
+
+
Total Runs
+
+
+
+
Last Objects
+
+
+
+
Last Relations
+
+
+
+
Total Latency
+
+
+
+
SAM Latency
+
+
+
+
CLIP Latency
+
+
+
+
Scene Latency
+
+
+
+
OBB Latency
+
+
+
+
Test IDTimestampInput U (px)Input V (px)Output X (m)Output Y (m)Output Z (m)
No data available
+ + + + + + + + + + + + + + + + + + + + + +
RunTimestampTotal (s)SAM (s)CLIP (s)Scene (s)OBB (s)SAM ObjectsSAM Avg ConfCLIP FilteredScene ObjectsRelationsGraspableOBB ObjectsSAMCLIPSceneOBB
No run history yet — run collect_and_export.py
- -
-
-

🎯 SAM Object Detection

- 0 Records + + +
+
+
+
+
Detected Objects — Latest Run
+
Merged SAM · CLIP · GraspNet per-object data
+
+ 0 Objects
-
-
-
0.00
-
Avg IoU
+
+ + + + + + + + + + + + + + + + + + + +
Object IDLabelBounding BoxSAM ConfidenceCLIP ConfidenceDistance (cm)IoUStableHas GraspGrasp QualityOBB Angle (°)OBB Width (px)OBB Height (px)
No data yet
+
+
+ + +
+
+
+
+
Spatial Relations — Latest Run
+
Object-level spatial relation graph · Scene Understanding output
-
-
0%
-
Stability Rate
+ 0 Relations +
+
+ + + + + + + + + + + + +
SubjectRelationTargetConfidenceDistance 2DDescription
No data yet
+
+
+ + +
+
+
+
+
OBB — Oriented Bounding Box Angle Benchmark
+
/obb/find_object_angle · angle parallel to width · 0° = vertical · range −90° to +90°
+
+ 0 Objects +
+
+
+
+
Objects (Latest)
+
+
+
+
Avg Angle (°)
-
-
0.00
-
Avg Confidence
+
+
+
Min Angle (°)
+
+
+
+
Max Angle (°)
+
+
+
+
OBB Latency (s)
-
- - - +
+
+ + + + + + + + + + + + + + + +
Object IDLabelOBB Angle (°)OBB Theta (rad)OBB Width (px)OBB Height (px)Center (u, v)Bounding BoxSAM ConfidenceDistance (cm)
No OBB data — ensure obb_angle_service_node is running
+
+ + +
+
+ /obb/find_object_angle_bb — Direct Call History + 0 +
+
+
+
+
Latest Angle (°)
+
+
+
+
Avg Angle (°)
+
+
+
+
Latest Object
+
+
+
+ + + - - - - - - + + + + + + - - + + + + +
# Object IDBounding BoxCenter PointConfidenceIoU ScoreAP (IoU≥0.5)Distance (cm)Angle (°)Theta (rad)Center (u, v)Size (w × h px)Input BBoxIoU Timestamp
No data — call ros2 service call /obb/find_object_angle_bb ...
+
+
+
+ + + +
+
+
+
+
SAM — Segment Anything Model
+
/vision/run_pipeline · /vision/detect_objects
+
+ 0 Records +
+
+
+
+
Avg Confidence
+
+
+
+
Avg IoU
+
+
+
+
Stability Rate
+
+
+
+ + + + + + + + + + + - +
Object IDBounding BoxCenter (u, v)ConfidenceIoU ScoreAP ≥ 0.5Distance (cm)Timestamp
No data available
No SAM data — start /vision/run_pipeline
- -
-
-

🏷️ CLIP Classification

- 0 Records + +
+
+
+
+
CLIP — Contrastive Language–Image Pre-training
+
/vision/classify_bbox_filtered · /vision/classify_all
+
+ 0 Records
-
-
-
N/A
-
Top-1 Accuracy
+
+
+
+
Top-1 Accuracy
-
-
0.00
-
Avg Confidence
+
+
+
Avg Confidence
-
- - - - - - - - - - - +
+
Test IDLabelConfidenceTop-1 AccuracyBounding BoxTimestamp
+ + + + + + + + + - +
Test IDCLIP LabelConfidenceTop-1 AccuracyHuman VerdictBounding BoxTimestamp
No data available
No CLIP data — call ros2 service call /vision/classify_bbox_filtered std_srvs/srv/Trigger
+ + +
+
+ /vision/classify_all — Direct Call History + 0 +
+
+
+
+
Latest Top Label
+
+
+
+
Latest Confidence
+
+
+
+
Avg Confidence
+
+
+
+
Avg Latency (s)
+
+
+
+ + + + + + + + + + + + +
#Top LabelConfidenceTop-5 PredictionsLatency (s)Timestamp
No data — call ros2 service call /vision/classify_all std_srvs/srv/Trigger
+
+
- -
-
-

🤖 Grasp Detection

- 0 Records + +
+
+
+
+
GraspNet — 6-DoF Grasp Pose Estimation
+
/vision/detect_grasp · /vision/detect_grasp_bbox
+
+ 0 Records
-
-
-
0.00
-
Avg Quality Score
+
+
+
+
Avg Quality Score
-
-
0.00m
-
Avg Grasp Width
+
+
+
Avg Grasp Width
-
- - - - - - - - - - - - - +
+
Test IDObject IDPosition (u,v)Position (x,y,z)Quality ScoreGrasp WidthApproachTimestamp
+ + + + + + + + + + - +
Test IDObject IDPixel (u, v)World (x, y, z) mQuality ScoreWidth (m)ApproachTimestamp
No data available
No grasp data — start /vision/detect_grasp
- -
-
-

🌐 Scene Understanding

- 0 Records -
-
-
-
0
-
Total Objects
-
-
-
0
-
Relations
-
-
-
0%
-
Spatial Accuracy
-
-
-
0%
-
Adjacency Accuracy
-
-
-
- - - - - - - - - - - - + +
+
+
+
+
Scene Understanding — Holistic Analysis
+
/vision/understand_scene · spatial reasoning + object relations
+
+ 0 Records +
+
+
+
+
Total Objects
+
+
+
+
Relations
+
+
+
+
Spatial Accuracy
+
+
+
+
Adjacency Accuracy
+
+
+
+
Scene IDObjectsRelationsDescriptionSpatial AccAdjacency AccTimestamp
+ + + + + + + + + - + + +
Scene IDObjectsRelationsScene DescriptionSpatial AccAdjacency AccTimestamp
No data available
No scene data — start /vision/understand_scene
+
+
+ + +
+
+
+
+
Pixel → Real World Conversion
+
/pixel_to_real · camera-to-world coordinate projection
+
+ 0 Records +
+
+ + + + + + + + + + + +
Test IDTimestampInput U (px)Input V (px)Output X (m)Output Y (m)Output Z (m)
No data — call /pixel_to_real
-
- + const angles = objects.map(o => o.obb_angle_deg); + const avg = angles.reduce((s, v) => s + v, 0) / angles.length; + document.getElementById('obbAvgAngle').textContent = avg.toFixed(1) + '°'; + document.getElementById('obbMinAngle').textContent = Math.min(...angles).toFixed(1) + '°'; + document.getElementById('obbMaxAngle').textContent = Math.max(...angles).toFixed(1) + '°'; + + tbody.innerHTML = objects.map(obj => { + const angleDeg = obj.obb_angle_deg; + // Angle bar: map -90..+90 to 0..100% + const pct = Math.round(((angleDeg + 90) / 180) * 100); + const barColor = Math.abs(angleDeg) < 20 ? 'var(--ok)' : Math.abs(angleDeg) < 60 ? 'var(--warn)' : 'var(--fail)'; + return ` + ${obj.object_id ?? '—'} + ${obj.label ?? 'unknown'} + +
+
+ ${angleDeg.toFixed(1)}° +
+ + ${obj.obb_theta_rad != null ? Number(obj.obb_theta_rad).toFixed(4) : '—'} + ${obj.obb_width_px != null ? Math.round(obj.obb_width_px) : '—'} + ${obj.obb_height_px != null ? Math.round(obj.obb_height_px) : '—'} + ${obj.obb_center_u != null ? Math.round(obj.obb_center_u) : '—'}, ${obj.obb_center_v != null ? Math.round(obj.obb_center_v) : '—'} + (${obj.bbox_x1??0},${obj.bbox_y1??0})→(${obj.bbox_x2??0},${obj.bbox_y2??0}) + ${confBar(obj.sam_confidence)} + ${obj.distance_cm != null ? obj.distance_cm.toFixed(1) : '—'} + `; + }).join(''); + } + + // ── /vision/classify_all history ───────────────────────────────────────── + async function fetchClassifyAllHistory() { + try { + const res = await fetch('/api/classify-all-history'); + if (!res.ok) return; + const records = await res.json(); + updateClassifyAllHistory(records); + } catch (_) {} + } + + function updateClassifyAllHistory(records) { + document.getElementById('caCount').textContent = records.length; + const tbody = document.getElementById('caBody'); + if (!records.length) { + tbody.innerHTML = 'No data — call ros2 service call /vision/classify_all std_srvs/srv/Trigger'; + ['caLatestLabel','caLatestConf','caAvgConf','caAvgLatency'].forEach(id => document.getElementById(id).textContent = '—'); + return; + } + const latest = records[records.length - 1]; + document.getElementById('caLatestLabel').textContent = latest.top_label || '—'; + document.getElementById('caLatestConf').textContent = latest.top_confidence != null ? (latest.top_confidence * 100).toFixed(1) + '%' : '—'; + const avgConf = records.reduce((s, r) => s + (r.top_confidence || 0), 0) / records.length; + document.getElementById('caAvgConf').textContent = (avgConf * 100).toFixed(1) + '%'; + const avgLat = records.reduce((s, r) => s + (r.latency_s || 0), 0) / records.length; + document.getElementById('caAvgLatency').textContent = avgLat.toFixed(3) + ' s'; + + tbody.innerHTML = records.slice(-50).reverse().map(r => { + const top5 = (r.all_predictions || []).slice(0, 5).map(p => + `${p.label} ${(p.confidence*100).toFixed(0)}%` + ).join(' '); + return ` + ${r.call_id} + ${r.top_label} + ${confBar(r.top_confidence)} + ${top5} + ${r.latency_s != null ? r.latency_s.toFixed(3) : '—'} + ${tsShort(r.timestamp)} + `; + }).join(''); + } + + // ── /obb/find_object_angle_bb history ──────────────────────────────────── + async function fetchObbBBHistory() { + try { + const res = await fetch('/api/obb-bb-history'); + if (!res.ok) return; + const records = await res.json(); + updateObbBBHistory(records); + } catch (_) {} + } + + function updateObbBBHistory(records) { + document.getElementById('obbBBCount').textContent = records.length; + const tbody = document.getElementById('obbBBBody'); + if (!records.length) { + tbody.innerHTML = 'No data — call ros2 service call /obb/find_object_angle_bb ...'; + ['obbBBLatestAngle','obbBBAvgAngle','obbBBLatestObject'].forEach(id => document.getElementById(id).textContent = '—'); + return; + } + const latest = records[records.length - 1]; + document.getElementById('obbBBLatestAngle').textContent = latest.angle_deg != null ? latest.angle_deg.toFixed(1) + '°' : '—'; + document.getElementById('obbBBLatestObject').textContent = latest.object_id || '—'; + const avgAngle = records.reduce((s, r) => s + (r.angle_deg || 0), 0) / records.length; + document.getElementById('obbBBAvgAngle').textContent = avgAngle.toFixed(1) + '°'; + + tbody.innerHTML = records.slice(-50).reverse().map(r => { + const pct = Math.round(((r.angle_deg + 90) / 180) * 100); + const barColor = Math.abs(r.angle_deg) < 20 ? 'var(--ok)' : Math.abs(r.angle_deg) < 60 ? 'var(--warn)' : 'var(--fail)'; + const bb = r.input_bbox || []; + return ` + ${r.call_id} + ${r.object_id ?? '—'} + +
+
+ ${r.angle_deg.toFixed(1)}° +
+ + ${r.theta_rad != null ? r.theta_rad.toFixed(4) : '—'} + ${r.center_u != null ? Math.round(r.center_u) : '—'}, ${r.center_v != null ? Math.round(r.center_v) : '—'} + ${r.width_px != null ? Math.round(r.width_px) : '—'} × ${r.height_px != null ? Math.round(r.height_px) : '—'} + ${bb.length === 4 ? `(${bb[0]},${bb[1]})→(${bb[2]},${bb[3]})` : '—'} + ${r.iou != null ? r.iou.toFixed(3) : '—'} + ${tsShort(r.timestamp)} + `; + }).join(''); + } + + // ── Clear data (legacy) ─────────────────────────────────────────────────── + async function clearData() { + if (confirm('Clear all in-session benchmark data?')) { + alert('Run:\nros2 service call /benchmark/clear_data std_srvs/srv/Trigger'); + } + } + + // ── Polling ─────────────────────────────────────────────────────────────── + setInterval(fetchData, 2000); + setInterval(fetchRunHistory, 2000); + setInterval(fetchClassifyAllHistory, 3000); + setInterval(fetchObbBBHistory, 3000); + fetchData(); + fetchRunHistory(); + fetchClassifyAllHistory(); + fetchObbBBHistory(); + diff --git a/dashboard/objects.html b/dashboard/objects.html new file mode 100644 index 0000000..7af533a --- /dev/null +++ b/dashboard/objects.html @@ -0,0 +1,1049 @@ + + + + + + Object Explorer — Vision Research Dashboard + + + + + + + + +
+ + +
+
+
Object Explorer
+
Sort · Group · Assign Names · Export · data pulled from vision_runs_history.json
+
+
+ Run + + +
+ Group + + +
+ + +
+ + +
+
+ +
+ + +
+
+
+ 0 + Objects +
+
+
+ + Avg SAM Conf +
+
+
+ 0 + Unique Labels +
+
+
+ 0 + Graspable +
+
+
+ + Avg OBB Angle +
+
+
+ 0 + Stable (AP≥0.5) +
+
+
+ 0 + Relations +
+
+
+ 0 + Named +
+
+ + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ObjectSAM — Segment Anything ModelCLIP ClassificationGraspNetOBB — Oriented BBoxScene Understanding
+ Assigned Name + + Obj ID + + Run # + BBox (x1,y1 → x2,y2) + Middle Point + + z / Depth (cm) + + Confidence + + IoU Score + + AP (IoU ≥ 0.5) + + Label + + Confidence + Top-1 Acc + Has Grasp + + Quality (Q) + + Width (m) + + Angle from X (°) + + θ (rad) + W × H (px)OBB Center + Relations + Relation Detail
Loading…
+
+
+ +
+ + +
+ + + + diff --git a/docs/COLLECT_AND_EXPORT.md b/docs/COLLECT_AND_EXPORT.md new file mode 100644 index 0000000..beb6287 --- /dev/null +++ b/docs/COLLECT_AND_EXPORT.md @@ -0,0 +1,264 @@ +# Vision Data Collector & Excel Exporter + +Runs the full SAM → CLIP → Scene Understanding pipeline **once**, stores the result in a rolling +JSON history (last 20 runs), and writes a ready-to-open Excel workbook. + +--- + +## Output Files + +| File | Location | Description | +|---|---|---| +| `vision_runs_history.json` | workspace root (next to `README.md`) | Rolling database of last 20 runs | +| `vision_runs_export.xlsx` | workspace root | Excel workbook, 4 sheets | + +### Excel Sheets + +| Sheet | One row per | Key columns | +|---|---|---| +| **Runs** | Each script invocation | timestamp, total objects, graspable, avg confidence, stability %, scene description, latency | +| **Objects** | Each detected object | CLIP label, bbox coords, SAM + CLIP confidence, distance cm, IoU score, grasp quality | +| **Relations** | Each spatial relation | subject → relation → target, confidence, 2D distance | +| **Grasps** | Each grasp pose | x/y/z position (m), quality score, gripper width (m) | + +--- + +## Prerequisites + +### 1. Install openpyxl (one-time) + +```bash +pip install openpyxl +``` + +### 2. ROS2 workspace must be built + +```bash +cd ~/ros2_ws # or wherever your workspace is +colcon build --packages-select vision +source install/setup.bash +``` + +--- + +## How to Run + +### Step 1 — Start the vision nodes (separate terminals) + +```bash +# Terminal 1: SAM detector (required) +ros2 run vision simple_sam_detector + +# Terminal 2: CLIP classifier (required for object labels) +ros2 run vision clip_classifier + +# Terminal 3: Scene understanding (required for relations + grasp data) +ros2 run vision scene_understanding + +# Terminal 4: GraspNet detector (optional — needed for grasp positions) +ros2 run vision graspnet_detector +``` + +Wait until you see the nodes announce their services in the terminal output before continuing. + +### Step 2 — Make sure a camera is publishing + +```bash +# Simulation (Gazebo): +ros2 launch ur_yt_sim spawn_ur5_camera_gripper_moveit.launch.py + +# Real RealSense camera: +ros2 launch realsense2_camera rs_launch.py + +# Static image (for testing without hardware): +ros2 launch vision vision_with_camera.launch.py camera_type:=file image_file:=path/to/image.jpg +``` + +### Step 3 — Run the collector + +```bash +# Option A: directly with python3 +python3 vision_scripts/collect_and_export.py + +# Option B: as a ROS2 entry point (after colcon build) +ros2 run vision collect_and_export +``` + +### Expected terminal output + +``` +============================================================ + Vision Pipeline Data Collector & Excel Exporter +============================================================ + History file : /path/to/vision/vision_runs_history.json + Excel file : /path/to/vision/vision_runs_export.xlsx + Max runs kept: 20 +============================================================ +Waiting for /vision/run_pipeline ... +RUN #1 — Step 1: SAM /vision/run_pipeline + SAM: 3 objects detected +RUN #1 — Step 2: CLIP /vision/classify_bbox_filtered + CLIP: 2 regions classified +RUN #1 — Step 3: Scene /vision/understand_scene + Scene: 3 objects, 4 relations +RUN #1 complete in 2.341s — 3 objects, 4 relations +============================================================ +[OK] History saved → .../vision_runs_history.json (1 runs stored) +[OK] Excel exported → .../vision_runs_export.xlsx + +Done. Open vision_runs_export.xlsx to view results. +``` + +--- + +## Run it multiple times + +Each time you run the script it **appends** one new entry to the JSON and **overwrites** the Excel. +After 20 runs the oldest entry is automatically dropped. + +```bash +# Run 5 times in a loop (bash) +for i in {1..5}; do + python3 vision_scripts/collect_and_export.py + sleep 2 +done +``` + +--- + +## Debugging + +### Problem: "service not available — skipping" + +The script will skip any service that does not respond within 5 seconds and +continue with the remaining services without crashing. + +**Fix:** Check that the required node is running: + +```bash +ros2 node list # should show /simple_sam_detector, /clip_classifier, etc. +ros2 service list # should show /vision/run_pipeline, /vision/classify_bbox_filtered, etc. +``` + +If a node is missing, start it (see Step 1 above). + +--- + +### Problem: No camera image / SAM returns 0 objects + +```bash +# Check camera topics are publishing +ros2 topic list | grep camera +ros2 topic hz /camera/image_raw # should show ~30 Hz + +# Check SAM subscriber got a frame +ros2 topic echo /vision/status # shows uptime and detection count +``` + +SAM only detects when a frame has been received. Wait a second after starting the camera before calling the collector. + +--- + +### Problem: CLIP returns "No classified regions" + +CLIP auto-classifies when it receives a message on `/vision/sam_detections`. +The collector calls `/vision/run_pipeline` first (which publishes to that topic), then +waits 500 ms before calling CLIP. If CLIP is slow on first run (model loading): + +```bash +# Call run_pipeline manually first to warm up CLIP +ros2 service call /vision/run_pipeline std_srvs/srv/Trigger + +# Wait a few seconds, then run the collector +python3 vision_scripts/collect_and_export.py +``` + +--- + +### Problem: Scene understanding times out + +Scene understanding calls `/vision/detect_objects` internally which can take a few +seconds. The collector allows **15 seconds** for it. If it still times out: + +```bash +# Test scene understanding manually +ros2 service call /vision/understand_scene std_srvs/srv/Trigger +``` + +If that also hangs, check that `simple_sam_detector` is running (scene understanding +calls it internally). + +--- + +### Problem: "openpyxl not installed" + +```bash +pip install openpyxl + +# Verify: +python3 -c "import openpyxl; print(openpyxl.__version__)" +``` + +The script will still save `vision_runs_history.json` even without openpyxl — only +the Excel export is skipped. + +--- + +### Problem: Excel file is open in Excel and the script fails to overwrite it + +Close the Excel file first, then re-run the collector. + +--- + +### Inspect the raw JSON history + +```bash +# Pretty-print the history file +python3 -c "import json; print(json.dumps(json.load(open('vision_runs_history.json')), indent=2))" + +# Show just the run summaries +python3 -c " +import json +runs = json.load(open('vision_runs_history.json')) +for r in runs: + m = r['meta']; s = r['sam']; sc = r['scene'] + print(f\"Run #{m['run_no']} | {m['timestamp']} | objects={s.get('total_detections','?')} | latency={m['latency_s']}s\") +" +``` + +--- + +### Manually clear the history + +```bash +rm vision_runs_history.json +``` + +The next run will start fresh from run #1. + +--- + +## File Structure + +``` +vision/ +├── vision_scripts/ +│ └── collect_and_export.py ← the collector script +├── vision_runs_history.json ← created on first run +├── vision_runs_export.xlsx ← created on first run +└── docs/ + └── COLLECT_AND_EXPORT.md ← this file +``` + +--- + +## Services Called (in order) + +| # | Service | Type | Purpose | +|---|---|---|---| +| 1 | `/vision/run_pipeline` | `std_srvs/Trigger` | SAM detects objects, publishes to topic | +| 2 | `/vision/classify_bbox_filtered` | `std_srvs/Trigger` | CLIP labels each region (conf > 0.5) | +| 3 | `/vision/understand_scene` | `std_srvs/Trigger` | Full scene: objects + relations + grasps | + +If a service is not running the step is skipped and that section of the run record will show `"success": false`. +The JSON and Excel are still saved with whatever data was collected. diff --git a/launch/vision.launch.py b/launch/vision.launch.py index 21011ad..6d6119c 100644 --- a/launch/vision.launch.py +++ b/launch/vision.launch.py @@ -42,4 +42,13 @@ def generate_launch_description(): ) ld.add_action(scene_understanding_node) + benchmark_dashboard_node = Node( + package='vision', + executable='benchmark_dashboard', + name='benchmark_dashboard_node', + output='screen', + emulate_tty=True + ) + ld.add_action(benchmark_dashboard_node) + return ld diff --git a/setup.cfg b/setup.cfg index 8a56203..ba5dc02 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [build_scripts] -executable=/usr/bin/env python3 +executable=/home/group11/final_project_ws/real_vision_venv/bin/python3 [develop] script_dir=$base/lib/vision [install] diff --git a/setup.py b/setup.py index f82099a..c221795 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ 'scipy>=1.11.0', 'scikit-learn>=1.3.0', 'pandas>=2.0.0', + 'openpyxl>=3.1.0', 'tqdm>=4.66.0', 'jsonschema>=4.19.0', ], @@ -58,6 +59,7 @@ tests_require=['pytest'], entry_points={ 'console_scripts': [ + 'collect_and_export = vision_scripts.collect_and_export:main', 'simple_sam_detector = vision.simple_sam_detector:main', 'clip_classifier = vision.clip_classifier:main', 'sam_clip_pipeline = vision.sam_clip_pipeline:main', @@ -74,7 +76,7 @@ 'find_boundary_service = vision.find_boundary_service_node:main', 'find_multi_object = vision.find_multi_object:main', # 'calibrate = calibration.calibrate:main', - # 'benchmark_dashboard = vision.benchmark_dashboard:main', + 'benchmark_dashboard = vision.benchmark_dashboard:main', # 'show_rgb_image = vision.show_rgb_image_node:main', # 'show_depth_image = vision.show_depth_image_node:main', # 'sam_vision_pipeline = vision.sam_vision_pipeline_node:main', diff --git a/test/test_clip_filtered_dashboard.py b/test/test_clip_filtered_dashboard.py new file mode 100644 index 0000000..995b5a0 --- /dev/null +++ b/test/test_clip_filtered_dashboard.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Unit tests: verify classify_filtered_history.json path, format, and HTTP endpoint. + +Run directly (no ROS needed): + python3 test/test_clip_filtered_dashboard.py +""" + +import json +import unittest +import urllib.request +import urllib.error +from pathlib import Path + +# ── Path constants ──────────────────────────────────────────────────────────── +INSTALL_VISION = Path( + '/home/group11/final_project_ws/install/vision' + '/lib/python3.12/site-packages/vision' +) +INSTALL_PKG = INSTALL_VISION.parent # site-packages/ +SRC_VISION = Path( + '/home/group11/final_project_ws/src/vision/vision' +) +SRC_PKG = SRC_VISION.parent # src/vision/ + +DASHBOARD_URL = 'http://localhost:8080' + + +def _resolve_package_path(py_file: Path) -> Path: + """Replicate Path(__file__).parent.parent used in the nodes.""" + return py_file.parent.parent + + +class TestPathConsistency(unittest.TestCase): + """Writer and reader must resolve to the same JSON file.""" + + def test_install_clip_and_dashboard_same_path(self): + clip_path = _resolve_package_path(INSTALL_VISION / 'clip_classifier.py') \ + / 'classify_filtered_history.json' + dash_path = _resolve_package_path(INSTALL_VISION / 'benchmark_dashboard.py') \ + / 'classify_filtered_history.json' + self.assertEqual(clip_path, dash_path, + f"clip writes to {clip_path} but dashboard reads from {dash_path}") + + def test_src_and_install_expected_dir(self): + """Install package_path == site-packages, NOT src/vision.""" + install_pkg_path = _resolve_package_path(INSTALL_VISION / 'clip_classifier.py') + self.assertIn('site-packages', str(install_pkg_path), + f"install path looks wrong: {install_pkg_path}") + + def test_filtered_file_exists(self): + fpath = INSTALL_PKG / 'classify_filtered_history.json' + self.assertTrue(fpath.exists(), + f"classify_filtered_history.json not found at {fpath}") + + +class TestFilteredFileFormat(unittest.TestCase): + """Each record must have the fields the dashboard expects.""" + + @classmethod + def setUpClass(cls): + fpath = INSTALL_PKG / 'classify_filtered_history.json' + if fpath.exists(): + with open(fpath) as f: + cls.records = json.load(f) + else: + cls.records = [] + + def test_file_not_empty(self): + self.assertGreater(len(self.records), 0, + "classify_filtered_history.json is empty – call " + "/vision/classify_bbox_filtered first") + + def test_required_fields_present(self): + required = {'test_id', 'timestamp', 'label', 'confidence', 'top1_accuracy', 'bbox'} + for i, rec in enumerate(self.records[:5]): + missing = required - rec.keys() + self.assertFalse(missing, + f"Record #{i} missing fields: {missing}\nRecord: {rec}") + + def test_confidence_is_float_in_range(self): + for i, rec in enumerate(self.records[:20]): + c = rec.get('confidence') + self.assertIsInstance(c, float, + f"Record #{i}: confidence should be float, got {type(c)}") + self.assertGreaterEqual(c, 0.0) + self.assertLessEqual(c, 1.0) + + def test_top1_accuracy_is_null_or_bool(self): + for i, rec in enumerate(self.records): + acc = rec.get('top1_accuracy') + self.assertIn(type(acc), (type(None), bool), + f"Record #{i}: top1_accuracy should be None or bool, got {type(acc)}: {acc}") + + def test_bbox_is_list_or_dict(self): + for i, rec in enumerate(self.records[:10]): + bbox = rec.get('bbox') + self.assertIsNotNone(bbox, f"Record #{i}: bbox is None") + self.assertIn(type(bbox), (list, dict), + f"Record #{i}: bbox type {type(bbox)}") + + def test_no_auto_true_verdicts_in_new_records(self): + """Records saved after the human-in-the-loop fix should have top1_accuracy=None.""" + # Find the most recent record – should be null, not auto-True + if not self.records: + self.skipTest("No records to check") + last = self.records[-1] + self.assertIsNone(last.get('top1_accuracy'), + f"Latest record has auto-verdict instead of null: {last}") + + +class TestHTTPEndpoint(unittest.TestCase): + """Verify the dashboard HTTP server exposes /api/classify-filtered-history.""" + + def _get(self, path): + try: + with urllib.request.urlopen(DASHBOARD_URL + path, timeout=5) as r: + return r.status, json.loads(r.read()) + except urllib.error.HTTPError as e: + return e.code, None + except Exception as e: + self.skipTest(f"Dashboard not reachable ({e})") + + def test_endpoint_exists_returns_200(self): + status, data = self._get('/api/classify-filtered-history') + self.assertEqual(status, 200, + f"/api/classify-filtered-history returned {status} – endpoint missing or not deployed") + + def test_endpoint_returns_list(self): + status, data = self._get('/api/classify-filtered-history') + if status != 200: + self.skipTest(f"Endpoint returned {status}") + self.assertIsInstance(data, list, + f"Expected list, got {type(data)}") + + def test_endpoint_data_matches_file(self): + fpath = INSTALL_PKG / 'classify_filtered_history.json' + if not fpath.exists(): + self.skipTest("classify_filtered_history.json not found") + with open(fpath) as f: + file_records = json.load(f) + + status, http_records = self._get('/api/classify-filtered-history') + if status != 200: + self.skipTest(f"Endpoint returned {status}") + + self.assertEqual(len(file_records), len(http_records), + f"File has {len(file_records)} records but endpoint returned {len(http_records)}") + + def test_api_data_merges_filtered(self): + """fetchData() merges clip_classifications + filtered; verify /api/data exists.""" + status, data = self._get('/api/data') + if status != 200: + self.skipTest("/api/data not reachable") + self.assertIn('clip_classifications', data, + "clip_classifications key missing from /api/data") + + def test_clip_verdict_endpoint_accessible(self): + """POST /api/clip-verdict must return 4xx for bad payload, not 404.""" + try: + req = urllib.request.Request( + DASHBOARD_URL + '/api/clip-verdict', + data=b'{}', + headers={'Content-Type': 'application/json'}, + method='POST' + ) + with urllib.request.urlopen(req, timeout=5) as r: + status = r.status + except urllib.error.HTTPError as e: + status = e.code + except Exception as e: + self.skipTest(f"Dashboard not reachable ({e})") + + self.assertNotEqual(status, 404, + "/api/clip-verdict returned 404 – endpoint not deployed in running process") + self.assertIn(status, (200, 400), + f"Unexpected status {status} for /api/clip-verdict") + + +if __name__ == '__main__': + print("=" * 60) + print("CLIP Filtered Dashboard – Path & Endpoint Tests") + print("=" * 60) + unittest.main(verbosity=2) diff --git a/vision/benchmark_dashboard.py b/vision/benchmark_dashboard.py index 3bc0459..4958021 100644 --- a/vision/benchmark_dashboard.py +++ b/vision/benchmark_dashboard.py @@ -16,6 +16,10 @@ Usage: ros2 run vision benchmark_dashboard + + ros2 service call /benchmark/clear_data std_srvs/srv/Trigger + + Then open browser: http://localhost:8080 """ @@ -25,6 +29,7 @@ from std_srvs.srv import Trigger from std_msgs.msg import String import json +import numpy as np import time from datetime import datetime from http.server import HTTPServer, SimpleHTTPRequestHandler @@ -45,6 +50,18 @@ print("Custom interfaces not available. Build custom_interfaces package first.") +class _ROSJSONEncoder(json.JSONEncoder): + """JSON encoder that handles ROS/numpy integer and float types.""" + def default(self, obj): + if isinstance(obj, (np.integer,)): + return int(obj) + if isinstance(obj, (np.floating,)): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return super().default(obj) + + class BenchmarkDashboard(Node): """ Benchmark Dashboard - Monitors vision services and provides web interface @@ -62,6 +79,7 @@ def __init__(self): 'sam_detections': [], 'clip_classifications': [], 'grasp_detections': [], + 'obb_detections': [], 'scene_understanding': [], 'metadata': { 'start_time': datetime.now().isoformat(), @@ -134,7 +152,10 @@ def __init__(self): # Timer to publish data periodically self.publish_timer = self.create_timer(1.0, self.publish_data) - + + # Timer to sync CLIP/OBB/GraspNet/Pixel-to-Real from vision_runs_history.json + self.sync_timer = self.create_timer(2.0, self.sync_from_run_history) + # Start HTTP server in separate thread self.start_http_server() @@ -152,20 +173,20 @@ def sam_detections_callback(self, msg): 'frame_id': msg.header.frame_id, 'obj_id': detection.object_id, 'bbox': { - 'x1': detection.bbox[0], - 'y1': detection.bbox[1], - 'x2': detection.bbox[2], - 'y2': detection.bbox[3] + 'x1': int(detection.bbox[0]), + 'y1': int(detection.bbox[1]), + 'x2': int(detection.bbox[2]), + 'y2': int(detection.bbox[3]) }, 'center': { - 'u': detection.center[0], - 'v': detection.center[1] + 'u': int(detection.center[0]), + 'v': int(detection.center[1]) }, 'confidence': float(detection.confidence), - 'area': detection.area, + 'area': int(detection.area), 'distance_cm': float(detection.distance_cm), 'iou_score': float(detection.iou_score), - 'is_stable': detection.is_stable_detection, + 'is_stable': bool(detection.is_stable_detection), 'ap_iou_threshold': 0.5 if detection.is_stable_detection else 0.0 } @@ -193,10 +214,10 @@ def scene_understanding_callback(self, msg): scene_data = { 'timestamp': timestamp, 'scene_id': msg.scene_id, - 'total_objects': msg.total_objects, + 'total_objects': int(msg.total_objects), 'relations': relations, 'object_labels': list(msg.object_labels), - 'object_counts': list(msg.object_counts), + 'object_counts': [int(c) for c in msg.object_counts], 'graspable_objects': msg.graspable_objects, 'average_distance_cm': float(msg.average_distance_cm), 'scene_description': msg.scene_description, @@ -276,10 +297,10 @@ def add_grasp_detection_record(self, grasp_pose): 'timestamp': timestamp, 'test_id': len(self.data['grasp_detections']) + 1, 'object_id': grasp_pose.object_id, - 'bbox': list(grasp_pose.bbox), + 'bbox': [int(v) for v in grasp_pose.bbox], 'pixel_position': { - 'u': grasp_pose.bbox[0] + (grasp_pose.bbox[2] - grasp_pose.bbox[0]) // 2, - 'v': grasp_pose.bbox[1] + (grasp_pose.bbox[3] - grasp_pose.bbox[1]) // 2 + 'u': int(grasp_pose.bbox[0]) + (int(grasp_pose.bbox[2]) - int(grasp_pose.bbox[0])) // 2, + 'v': int(grasp_pose.bbox[1]) + (int(grasp_pose.bbox[3]) - int(grasp_pose.bbox[1])) // 2 }, 'world_position': { 'x': float(grasp_pose.position.x), @@ -299,77 +320,453 @@ def add_grasp_detection_record(self, grasp_pose): self.data['metadata']['total_calls'] += 1 + def sync_from_run_history(self): + """Sync CLIP, GraspNet, OBB, and Pixel-to-Real data from history files + so the /api/data endpoint stays populated even when service nodes are not running.""" + try: + from pathlib import Path + package_path = Path(__file__).parent.parent + + timestamp = datetime.now().isoformat() + + # Rebuild lists from all stored runs (most recent first for display) + new_clip = [] + new_grasp = [] + new_pixel = [] + new_obb = [] + + history_file = package_path / 'vision_runs_history.json' + runs = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + data = json.load(f) + if isinstance(data, list): + runs = data + except Exception: + runs = [] + + for run in runs: + run_ts = run.get('meta', {}).get('timestamp', timestamp) + run_no = run.get('meta', {}).get('run_no', 0) + objects = run.get('objects', []) + + for obj in objects: + # ── CLIP ────────────────────────────────────────────────── + clip_conf = obj.get('clip_confidence', '') + label = obj.get('label', '') + if clip_conf != '' and label: + new_clip.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'bbox': { + 'x1': obj.get('bbox_x1', 0), + 'y1': obj.get('bbox_y1', 0), + 'x2': obj.get('bbox_x2', 0), + 'y2': obj.get('bbox_y2', 0), + }, + 'label': label, + 'confidence': float(clip_conf), + 'top1_accuracy': float(clip_conf) >= 0.5, + }) + + # ── GraspNet ────────────────────────────────────────────── + grasp = obj.get('grasp', {}) + if obj.get('has_grasp') and grasp: + pix = grasp.get('pixel', {}) + wld = grasp.get('world', {}) + new_grasp.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'object_id': obj.get('object_id', ''), + 'pixel_position': {'u': pix.get('u', 0), 'v': pix.get('v', 0)}, + 'world_position': { + 'x': wld.get('x', 0.0), + 'y': wld.get('y', 0.0), + 'z': wld.get('z', 0.0), + }, + 'quality_score': float(grasp.get('quality_score', 0.0)), + 'grasp_width': float(grasp.get('grasp_width', 0.0)), + 'approach_direction': grasp.get('approach_direction', ''), + 'bbox': [obj.get('bbox_x1', 0), obj.get('bbox_y1', 0), + obj.get('bbox_x2', 0), obj.get('bbox_y2', 0)], + }) + + # ── Pixel-to-Real ───────────────────────────────────────── + world = obj.get('world', {}) + if world and world.get('x') is not None: + new_pixel.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'input': {'u': world.get('u', 0), 'v': world.get('v', 0)}, + 'output': {'x': float(world.get('x', 0.0)), + 'y': float(world.get('y', 0.0)), + 'z': float(world.get('z', 0.0))}, + }) + + # ── OBB ─────────────────────────────────────────────────── + obb_angle = obj.get('obb_angle_deg', '') + if obb_angle != '': + new_obb.append({ + 'timestamp': run_ts, + 'test_id': f"{run_no}:{obj.get('object_id', '')}", + 'object_id': obj.get('object_id', ''), + 'label': obj.get('label', ''), + 'angle_deg': float(obb_angle), + 'theta_rad': float(obj.get('obb_theta_rad', 0.0)), + 'width_px': float(obj.get('obb_width_px', 0.0)), + 'height_px': float(obj.get('obb_height_px', 0.0)), + 'center_u': float(obj.get('obb_center_u', 0.0)), + 'center_v': float(obj.get('obb_center_v', 0.0)), + 'sam_confidence': float(obj.get('sam_confidence', 0.0)), + 'bbox': [obj.get('bbox_x1', 0), obj.get('bbox_y1', 0), + obj.get('bbox_x2', 0), obj.get('bbox_y2', 0)], + }) + + # Also merge records from /vision/classify_bbox_filtered direct calls + filtered_file = Path(__file__).parent.parent / 'classify_filtered_history.json' + if filtered_file.exists(): + try: + with open(filtered_file, 'r') as f: + filtered_records = json.load(f) + if filtered_records: + new_clip = (new_clip + filtered_records)[-1000:] + except Exception: + pass + + # Only update if we got new data (avoids overwriting live topic data with empty) + if new_clip: + self.data['clip_classifications'] = new_clip[-1000:] + if new_grasp: + self.data['grasp_detections'] = new_grasp[-1000:] + if new_pixel: + self.data['pixel_to_real'] = new_pixel[-1000:] + if new_obb: + self.data['obb_detections'] = new_obb[-1000:] + + except Exception as e: + self.get_logger().warn(f'sync_from_run_history failed: {e}') + def publish_data(self): """Publish benchmark data to topic""" msg = String() - msg.data = json.dumps(self.data) + msg.data = json.dumps(self.data, cls=_ROSJSONEncoder) self.data_publisher.publish(msg) - + def clear_data_callback(self, request, response): - """Clear all benchmark data""" + """Clear all benchmark data (in-memory + persistent JSON files)""" self.data = { 'pixel_to_real': [], 'sam_detections': [], 'clip_classifications': [], 'grasp_detections': [], + 'obb_detections': [], 'scene_understanding': [], 'metadata': { 'start_time': datetime.now().isoformat(), 'total_calls': 0 } } - + + # Also wipe the persistent history files so sync_from_run_history + # doesn't immediately repopulate from stale data + package_path = Path(__file__).parent.parent + files_to_clear = [ + 'vision_runs_history.json', + 'classify_filtered_history.json', + 'classify_all_history.json', + 'obb_bb_history.json', + ] + for fname in files_to_clear: + fpath = package_path / fname + if fpath.exists(): + try: + with open(fpath, 'w') as f: + json.dump([], f) + except Exception as e: + self.get_logger().warn(f'Could not clear {fname}: {e}') + response.success = True response.message = "Benchmark data cleared" - - self.get_logger().info('Benchmark data cleared') + self.get_logger().info('Benchmark data cleared (memory + files)') return response def start_http_server(self): """Start HTTP server for dashboard""" - # Get the path to the HTML file + # Resolve dashboard HTML using ament share directory (correct for installed packages) + try: + from ament_index_python.packages import get_package_share_directory + share_dir = get_package_share_directory('vision') + html_dir = Path(share_dir) / 'dashboard' + except Exception: + html_dir = Path(__file__).parent.parent / 'dashboard' + + # History JSON files written next to the installed module package_path = Path(__file__).parent.parent - html_dir = package_path / 'dashboard' - - # Create dashboard directory if it doesn't exist - html_dir.mkdir(exist_ok=True) - - # Create HTML file if it doesn't exist + + self.get_logger().info(f'Dashboard HTML dir: {html_dir}') + + # Fallback: create default HTML if share dir has no index.html html_file = html_dir / 'index.html' if not html_file.exists(): self.get_logger().warn(f'Dashboard HTML not found at {html_file}') self.get_logger().warn('Creating basic HTML file...') + html_dir.mkdir(exist_ok=True) self.create_default_html(html_file) - - # Custom handler that serves files from html_dir and provides data endpoint + + node_logger = self.get_logger() + + # Custom handler that serves files and provides API endpoints class DashboardHandler(SimpleHTTPRequestHandler): def __init__(self, *args, dashboard_node=None, **kwargs): self.dashboard_node = dashboard_node super().__init__(*args, directory=str(html_dir), **kwargs) - + + # ── helpers ──────────────────────────────────────────────────── + def _json_response(self, data, status=200): + body = json.dumps(data, cls=_ROSJSONEncoder).encode() + self.send_response(status) + self.send_header('Content-type', 'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + self.wfile.write(body) + + def _read_json_file(self, path, default): + try: + if path.exists(): + with open(path, 'r') as f: + return json.load(f) + except Exception: + pass + return default + + def _write_json_file(self, path, data): + with open(path, 'w') as f: + json.dump(data, f, indent=2) + + # ── OPTIONS (CORS pre-flight) ─────────────────────────────────── + def do_OPTIONS(self): + self.send_response(204) + self.send_header('Access-Control-Allow-Origin', '*') + self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS') + self.send_header('Access-Control-Allow-Headers', 'Content-Type') + self.end_headers() + + # ── GET ──────────────────────────────────────────────────────── def do_GET(self): if self.path == '/api/data': - # Serve benchmark data as JSON - self.send_response(200) - self.send_header('Content-type', 'application/json') - self.send_header('Access-Control-Allow-Origin', '*') - self.end_headers() - data_json = json.dumps(self.dashboard_node.data) - self.wfile.write(data_json.encode()) + self._json_response(self.dashboard_node.data) + + elif self.path == '/api/run-history': + history_file = package_path / 'vision_runs_history.json' + data = self._read_json_file(history_file, []) + self._json_response(data) + + elif self.path == '/api/find-object-history': + fo_file = package_path / 'find_object_history.json' + data = self._read_json_file(fo_file, []) + self._json_response(data) + + elif self.path == '/api/classify-all-history': + ca_file = package_path / 'classify_all_history.json' + data = self._read_json_file(ca_file, []) + self._json_response(data) + + elif self.path == '/api/classify-filtered-history': + cf_file = package_path / 'classify_filtered_history.json' + data = self._read_json_file(cf_file, []) + self._json_response(data) + + elif self.path == '/api/obb-bb-history': + obb_file = package_path / 'obb_bb_history.json' + data = self._read_json_file(obb_file, []) + self._json_response(data) + else: - # Serve static files super().do_GET() - + + # ── POST ─────────────────────────────────────────────────────── + def do_POST(self): + length = int(self.headers.get('Content-Length', 0)) + body = self.rfile.read(length) if length else b'{}' + try: + payload = json.loads(body) + except Exception: + payload = {} + + if self.path == '/api/find-object': + self._handle_find_object(payload) + + elif self.path == '/api/find-object-verdict': + self._handle_verdict(payload) + + elif self.path == '/api/find-object-clear': + fo_file = package_path / 'find_object_history.json' + self._write_json_file(fo_file, []) + self._json_response({'ok': True}) + + elif self.path == '/api/clip-verdict': + self._handle_clip_verdict(payload) + + else: + self._json_response({'error': 'unknown endpoint'}, 404) + + # ── /api/clip-verdict ───────────────────────────────────────── + def _handle_clip_verdict(self, payload): + """Set human-in-the-loop top1_accuracy verdict for a CLIP record.""" + test_id = payload.get('test_id') + verdict = payload.get('verdict') # True / False + if test_id is None or verdict is None: + self._json_response({'error': 'test_id and verdict required'}, 400) + return + + cf_file = package_path / 'classify_filtered_history.json' + history = self._read_json_file(cf_file, []) + updated = False + for entry in history: + if entry.get('test_id') == test_id: + entry['top1_accuracy'] = bool(verdict) + updated = True + break + if updated: + self._write_json_file(cf_file, history) + self._json_response({'ok': True}) + else: + self._json_response({'error': f'test_id {test_id} not found'}, 404) + + # ── /api/find-object ────────────────────────────────────────── + def _handle_find_object(self, payload): + """Call /find_object ROS2 service and persist result.""" + import subprocess, shlex + label = payload.get('label', '').strip() + if not label: + self._json_response({'error': 'label is required'}, 400) + return + + fo_file = package_path / 'find_object_history.json' + history = self._read_json_file(fo_file, []) + call_id = len(history) + 1 + timestamp = datetime.now().isoformat() + + # Call the ROS2 service via subprocess + cmd = ( + f"ros2 service call /find_object " + f"custom_interfaces/srv/FindObjectReal " + f"\"{{label: '{label}'}}\"" + ) + try: + result = subprocess.run( + cmd, shell=True, capture_output=True, text=True, timeout=30 + ) + output = result.stdout + result.stderr + node_logger.info(f'find_object [{label}] stdout: {output[:300]}') + + # Parse the ROS2 CLI response format + entry = _parse_find_object_response(output, label, call_id, timestamp) + except subprocess.TimeoutExpired: + entry = { + 'call_id': call_id, 'timestamp': timestamp, + 'label_searched': label, 'success': False, + 'message': 'Service call timed out (30 s)', + 'object_id': '', 'bbox': [], 'confidence': 0.0, + 'x': 0.0, 'y': 0.0, 'z': 0.0, 'theta': 0.0, + 'verdict': None, + } + except Exception as e: + entry = { + 'call_id': call_id, 'timestamp': timestamp, + 'label_searched': label, 'success': False, + 'message': f'Error: {e}', + 'object_id': '', 'bbox': [], 'confidence': 0.0, + 'x': 0.0, 'y': 0.0, 'z': 0.0, 'theta': 0.0, + 'verdict': None, + } + + history.append(entry) + history = history[-50:] + self._write_json_file(fo_file, history) + self._json_response(entry) + + # ── /api/find-object-verdict ────────────────────────────────── + def _handle_verdict(self, payload): + call_id = payload.get('call_id') + verdict = payload.get('verdict') # true / false + if call_id is None or verdict is None: + self._json_response({'error': 'call_id and verdict required'}, 400) + return + + fo_file = package_path / 'find_object_history.json' + history = self._read_json_file(fo_file, []) + updated = False + for entry in history: + if entry.get('call_id') == call_id: + entry['verdict'] = bool(verdict) + updated = True + break + if updated: + self._write_json_file(fo_file, history) + self._json_response({'ok': True}) + else: + self._json_response({'error': f'call_id {call_id} not found'}, 404) + + def log_message(self, fmt, *args): + pass # suppress HTTP access log noise + + def _parse_find_object_response(output, label, call_id, timestamp): + """Parse ros2 service call CLI output into a dict.""" + import re + entry = { + 'call_id': call_id, 'timestamp': timestamp, + 'label_searched': label, 'success': False, + 'message': output.strip()[:500], + 'object_id': '', 'bbox': [], 'confidence': 0.0, + 'x': 0.0, 'y': 0.0, 'z': 0.0, 'theta': 0.0, + 'verdict': None, + } + try: + # success field + m = re.search(r'success=(\w+)', output) + if m: + entry['success'] = m.group(1).lower() == 'true' + # message field + m = re.search(r"message='([^']*)'", output) + if m: + entry['message'] = m.group(1) + # object_id + m = re.search(r"object_id='([^']*)'", output) + if m: + entry['object_id'] = m.group(1) + # bbox + m = re.search(r'bbox=\[([^\]]*)\]', output) + if m: + try: + entry['bbox'] = [int(x.strip()) for x in m.group(1).split(',') if x.strip()] + except Exception: + pass + # confidence + m = re.search(r'confidence=([\d.]+)', output) + if m: + entry['confidence'] = float(m.group(1)) + # x, y, z, theta + for field in ('x', 'y', 'z', 'theta'): + m = re.search(rf'{field}=([-\d.]+)', output) + if m: + entry[field] = float(m.group(1)) + except Exception as e: + node_logger.warn(f'Response parse error: {e}') + return entry + # Create handler with dashboard_node reference def handler_with_node(*args, **kwargs): return DashboardHandler(*args, dashboard_node=self, **kwargs) - + # Start server in separate thread server = HTTPServer(('0.0.0.0', 8080), handler_with_node) server_thread = threading.Thread(target=server.serve_forever, daemon=True) server_thread.start() - + self.get_logger().info(f'HTTP server started on http://localhost:8080') + self.get_logger().info(f'Find Object page: http://localhost:8080/find_object.html') self.get_logger().info(f'Serving files from: {html_dir}') def create_default_html(self, html_file): diff --git a/vision/clip_classifier.py b/vision/clip_classifier.py index 8bc0eb3..9004e42 100755 --- a/vision/clip_classifier.py +++ b/vision/clip_classifier.py @@ -112,6 +112,13 @@ def __init__(self, candidate_labels: List[str] = None): self.declare_parameter('real_hardware', False) self.real_hardware = bool(self.get_parameter('real_hardware').value) + # Minimum raw cosine similarity to even consider a region a candidate. + # Applied before inter-region softmax normalisation; regions below this + # threshold are discarded entirely. Default 0.1 rejects strongly + # mismatched regions while keeping borderline ones in the softmax pool. + self.declare_parameter('clip_min_confidence', 0.1) + self.clip_min_confidence = float(self.get_parameter('clip_min_confidence').value) + self.rgb_topic = '/camera/color/image_raw' if self.real_hardware else '/camera/image_raw' self.depth_topic = '/camera/depth/image_rect_raw' if self.real_hardware else '/camera/depth/image_raw' self.camera_info_topic = 'camera/color/camera_info' if self.real_hardware else '/camera/camera_info' @@ -122,6 +129,14 @@ def __init__(self, candidate_labels: List[str] = None): # "cobot", "green_cube", "drill", + "remote_control", + "orange_cube", + "orange_cylinder", + "arduino_board", + "mouse", + "light_blue_cube", + "blue_star", + "purple_triangle", "pink_cube", "measuring_tape", "screwdriver", @@ -134,11 +149,11 @@ def __init__(self, candidate_labels: List[str] = None): # "door_handle", # "red_ball", # "gasket_part", - "beer_can", + # "beer_can", "bowl", - "cinder_block", - "coke_can", - "roomba", + # "cinder_block", + # "coke_can", + # "roomba", # "plastic_cup", # "hammer", # "robotic_arm", @@ -282,8 +297,9 @@ def __init__(self, candidate_labels: List[str] = None): cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) cv2.resizeWindow(self.window_name, 800, 600) - # Timer for visualization (30 Hz) - self.viz_timer = self.create_timer(0.033, self.visualization_callback) + # GUI updates are pumped from the main thread in main() to avoid + # HighGUI freezes under multi-threaded executors. + self.viz_timer = None self.get_logger().info("CLIP Classifier Started") self.get_logger().info(f"Subscribing to: {self.rgb_topic}") @@ -379,13 +395,16 @@ def classify_all_callback(self, request, response): response.success = True response.message = json.dumps(classification_data, indent=2) - + top_pred = classification_data['output']['top_prediction'] self.get_logger().info( f"Classification complete: {top_pred['label']} " f"(confidence: {top_pred['confidence']:.2f})" ) - + + # Persist result for dashboard + self._save_classify_all_record(classification_data, latency_s=(time.perf_counter() - start)) + except Exception as e: response.success = False response.message = json.dumps({ @@ -555,17 +574,31 @@ def classify_bbox_filtered_callback(self, request, response): self.get_logger().error("CLIP model not available") return response - # Check if we have classified regions from SAM subscription + # If no SAM regions cached, trigger a fresh detection + classification now if not self.latest_region_classifications: - response.success = False - response.message = json.dumps({ - "error": "No classified regions available. Call '/vision/run_pipeline' first to trigger SAM detection.", - "hint": "ros2 service call /vision/run_pipeline std_srvs/srv/Trigger", - "timestamp": datetime.utcnow().isoformat() + "Z" - }, indent=2) - self.get_logger().warn("No classified regions. Run SAM pipeline first.") - return response - + self.get_logger().info("No cached regions — calling /vision/detect_objects automatically...") + bboxes, err = self._call_detect_objects() + if err: + response.success = False + response.message = json.dumps({ + "error": err, + "hint": "Ensure simple_sam_detector is running", + "timestamp": datetime.utcnow().isoformat() + "Z" + }, indent=2) + self.get_logger().warn(f"Detection failed: {err}") + return response + if not bboxes: + response.success = False + response.message = json.dumps({ + "error": "No objects detected in the scene", + "timestamp": datetime.utcnow().isoformat() + "Z" + }, indent=2) + self.get_logger().warn("No bboxes returned from detection") + return response + self.get_logger().info(f"Got {len(bboxes)} bboxes from detection — classifying with CLIP...") + classification_data = self._classify_regions(self.captured_frame, bboxes) + self.latest_region_classifications = classification_data['output']['classified_regions'] + self.get_logger().info(f"Filtering {len(self.latest_region_classifications)} classified regions by confidence > 0.5") # Filter regions by confidence >= 0.5 @@ -594,18 +627,21 @@ def classify_bbox_filtered_callback(self, request, response): response.success = True response.message = json.dumps(result, indent=2) - + self.get_logger().info( f"Filtered classification complete: {len(filtered_regions)}/{len(self.latest_region_classifications)} " f"regions passed confidence threshold" ) - + # Log each filtered region for region in filtered_regions: self.get_logger().info( f"Region #{region['region_id']}: {region['label']} " f"(confidence: {region['confidence']:.2f})" ) + + # Persist to file so benchmark_dashboard can show results in CLIP table + self._save_filtered_records(filtered_regions) except Exception as e: response.success = False @@ -715,11 +751,17 @@ def _call_detect_objects(self) -> Tuple[List[List[int]], Optional[str]]: return bboxes, None def _compute_similarity(self, image_bgr: np.ndarray, label: str) -> float: - """Compute CLIP cosine similarity between crop and label text.""" + """Compute raw CLIP cosine similarity between a single crop and label text. + + Returns a value roughly in [-1, 1]. Prefer + _compute_inter_region_confidences() when you have multiple regions, + as that method normalises across regions via softmax to produce proper + probabilities in [0, 1]. + """ region_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB) pil_image = PILImage.fromarray(region_rgb) image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize([label]).to(self.device) + text_tokens = clip.tokenize([f"a photo of a {label}"]).to(self.device) with torch.no_grad(): image_features = self.model.encode_image(image_input) @@ -730,6 +772,55 @@ def _compute_similarity(self, image_bgr: np.ndarray, label: str) -> float: return float(similarity) + def _compute_inter_region_confidences( + self, + crops_bgr: List[np.ndarray], + label: str, + ) -> List[float]: + """Return per-region confidence values in [0, 1] for a query label. + + Rather than returning raw cosine similarity (which lives in roughly + [0.15, 0.35] for typical queries and is hard to threshold), this method + applies a temperature-scaled softmax *across all candidate regions*. + The result is a proper probability distribution: the region most likely + containing ``label`` gets a high score, the rest share the remainder. + + Args: + crops_bgr: List of BGR image crops (one per detected region). + label: Text query, e.g. ``"bowl"`` or ``"green_cube"``. + + Returns: + List of float probabilities, same length as ``crops_bgr``, + summing to 1.0. An empty list is returned when ``crops_bgr`` + is empty. + """ + if not crops_bgr: + return [] + + # Use the same prompt template as _classify_regions / _classify_image + text_tokens = clip.tokenize([f"a photo of a {label}"]).to(self.device) + with torch.no_grad(): + text_features = self.model.encode_text(text_tokens) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + image_features_list = [] + for crop_bgr in crops_bgr: + region_rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB) + pil_image = PILImage.fromarray(region_rgb) + image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) + img_feat = self.model.encode_image(image_input) + img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True) + image_features_list.append(img_feat) + + # Stack → (N, D), then cosine similarities → (N,) + all_image_features = torch.cat(image_features_list, dim=0) # (N, D) + raw_sims = (all_image_features @ text_features.T).squeeze(1) # (N,) + + # Temperature-scaled softmax over regions: a factor of 100 matches the + # scale used in standard CLIP logits and keeps the distribution sharp. + probs = (raw_sims * 100.0).softmax(dim=0) + return [float(p.item()) for p in probs] + def find_multi_object_callback(self, request, response): """ Service callback for /vision/find_multi_object. @@ -814,8 +905,10 @@ def find_multi_object_callback(self, request, response): frame = self.captured_frame frame_h, frame_w = frame.shape[:2] - matches: List[Dict] = [] + # --- Pass 1: collect valid crops pre-filtered by raw cosine --- + candidates: List[Dict] = [] + crops: List[np.ndarray] = [] for region_id, bbox in enumerate(bboxes): x1, y1, x2, y2 = bbox x1 = max(0, min(x1, frame_w)) @@ -827,12 +920,25 @@ def find_multi_object_callback(self, request, response): continue region_bgr = frame[y1:y2, x1:x2] - similarity = self._compute_similarity(region_bgr, target_label) - matches.append({ + raw_sim = self._compute_similarity(region_bgr, target_label) + if raw_sim < self.clip_min_confidence: + continue + candidates.append({ 'region_id': region_id, 'bbox': [int(x1), int(y1), int(x2), int(y2)], - 'confidence': round(float(similarity), 4), }) + crops.append(region_bgr) + + matches: List[Dict] = [] + if candidates: + # --- Pass 2: inter-region softmax → proper [0,1] probabilities --- + probs = self._compute_inter_region_confidences(crops, target_label) + for i, cand in enumerate(candidates): + matches.append({ + 'region_id': cand['region_id'], + 'bbox': cand['bbox'], + 'confidence': round(probs[i], 4), + }) matches.sort(key=lambda item: item['confidence'], reverse=True) top_matches = matches[:top_k] @@ -995,80 +1101,83 @@ def find_object_callback(self, request, response): self.get_logger().info(f"Classification complete for {len(self.latest_region_classifications)} regions") - self.get_logger().info(f"Computing CLIP embeddings for '{target_label}' and {len(self.latest_region_classifications)} regions") - - # Compute image-text similarity for each bounding box using CLIP's high-level API - best_match = None - best_similarity = -1.0 # Cosine similarity ranges from -1 to 1 - + self.get_logger().info(f"Computing CLIP confidences for '{target_label}' across {len(self.latest_region_classifications)} regions") + + # --- Pass 1: collect valid crops and their metadata --------- + h, w = self.captured_frame.shape[:2] + valid_regions = [] + crops = [] for region in self.latest_region_classifications: bbox = region['bbox'] x1, y1, x2, y2 = bbox - - # Clamp bbox to image bounds - h, w = self.captured_frame.shape[:2] x1 = max(0, min(x1, w)) x2 = max(0, min(x2, w)) y1 = max(0, min(y1, h)) y2 = max(0, min(y2, h)) - - # Skip invalid boxes if x2 <= x1 or y2 <= y1: self.get_logger().warn(f"Skipping invalid bbox: {bbox}") continue - - # Crop and convert region - region_bgr = self.captured_frame[y1:y2, x1:x2] - region_rgb = cv2.cvtColor(region_bgr, cv2.COLOR_BGR2RGB) - pil_image = PILImage.fromarray(region_rgb) - - # Use OpenAI CLIP for similarity computation - image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize([target_label]).to(self.device) - - with torch.no_grad(): - image_features = self.model.encode_image(image_input) - text_features = self.model.encode_text(text_tokens) - - # Normalize features - image_features = image_features / image_features.norm(dim=-1, keepdim=True) - text_features = text_features / text_features.norm(dim=-1, keepdim=True) - - # Compute cosine similarity (normalized dot product) - similarity = (image_features @ text_features.T)[0, 0].item() - - self.get_logger().debug(f"Region {region['region_id']}: similarity = {similarity:.4f}") - - # Track best match - if similarity > best_similarity: - best_similarity = similarity - best_match = { - 'bbox': bbox, - 'confidence': similarity, - 'region_id': region['region_id'] - } - - # Check if any match was found - if best_match is None: + # Pre-filter by raw cosine similarity so obviously wrong regions + # don't pollute the softmax pool. + raw_sim = self._compute_similarity( + self.captured_frame[y1:y2, x1:x2], target_label + ) + if raw_sim < self.clip_min_confidence: + self.get_logger().debug( + f"Region {region['region_id']} discarded: raw_sim={raw_sim:.4f} " + f"< clip_min_confidence={self.clip_min_confidence}" + ) + continue + valid_regions.append({ + 'bbox': [x1, y1, x2, y2], + 'region_id': region['region_id'], + }) + crops.append(self.captured_frame[y1:y2, x1:x2]) + + # Check if any valid region survived the pre-filter + if not valid_regions: response.success = False response.message = f"No valid regions found to compare with '{target_label}'" response.bbox = [] response.confidence = 0.0 self.latest_found_object = None - self.get_logger().info(f"No valid regions to compare") + self.get_logger().info("No valid regions to compare") return response - - # Optional: Set a minimum similarity threshold - min_similarity_threshold = 0.2 # Adjust based on your needs - if best_match['confidence'] < min_similarity_threshold: + + # --- Pass 2: inter-region softmax → proper probabilities ---- + # Each crop gets a probability in [0, 1] that sums to 1 across + # all regions. A high score means "this region most likely + # contains the queried label", which is a meaningful confidence. + probs = self._compute_inter_region_confidences(crops, target_label) + + for i, region_info in enumerate(valid_regions): + region_info['confidence'] = probs[i] + self.get_logger().debug( + f"Region {region_info['region_id']}: confidence={probs[i]:.4f}" + ) + + # Best region = highest softmax probability + best_match = max(valid_regions, key=lambda r: r['confidence']) + + # Minimum confidence guard (now in softmax probability space). + # With clip_min_confidence already filtering raw similarities, + # this catches the degenerate single-region case where probability + # is trivially 1.0 but the raw match was borderline. + min_prob_threshold = self.clip_min_confidence + if best_match['confidence'] < min_prob_threshold: response.success = False - response.message = f"Label '{target_label}' found but similarity too low ({best_match['confidence']:.3f} < {min_similarity_threshold})" + response.message = ( + f"Label '{target_label}' found but confidence too low " + f"({best_match['confidence']:.3f} < {min_prob_threshold})" + ) response.bbox = [] response.confidence = float(best_match['confidence']) self.latest_found_object = None - self.get_logger().info(f"Label '{target_label}' similarity too low: {best_match['confidence']:.3f}") + self.get_logger().info( + f"Label '{target_label}' confidence too low: {best_match['confidence']:.3f}" + ) return response - + # Store for visualization self.latest_found_object = { 'label': target_label, @@ -1076,17 +1185,17 @@ def find_object_callback(self, request, response): 'confidence': best_match['confidence'], 'region_id': best_match['region_id'] } - + # Return success with bbox response.success = True - response.message = f"Found '{target_label}' with similarity {best_match['confidence']:.3f}" + response.message = f"Found '{target_label}' with confidence {best_match['confidence']:.3f}" response.bbox = best_match['bbox'] response.confidence = float(best_match['confidence']) # Note: FindObject.srv doesn't have object_id field (only FindObjectReal.srv does) - + self.get_logger().info( f"Found '{target_label}': bbox={best_match['bbox']}, " - f"similarity={best_match['confidence']:.3f}, region_id={best_match['region_id']}" + f"confidence={best_match['confidence']:.3f}, region_id={best_match['region_id']}" ) except Exception as e: @@ -1116,21 +1225,18 @@ def _classify_image(self, rgb_image: np.ndarray) -> Dict: # Convert BGR to RGB rgb = cv2.cvtColor(rgb_image, cv2.COLOR_BGR2RGB) pil_image = PILImage.fromarray(rgb) - - # Prepare inputs using OpenAI CLIP + + # Prompt template improves accuracy over bare label names (CLIP paper) + prompted_labels = [f"a photo of a {lbl}" for lbl in self.candidate_labels] + image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize(self.candidate_labels).to(self.device) - - # Get predictions + text_tokens = clip.tokenize(prompted_labels).to(self.device) + with torch.no_grad(): image_features = self.model.encode_image(image_input) text_features = self.model.encode_text(text_tokens) - - # Normalize features image_features = image_features / image_features.norm(dim=-1, keepdim=True) text_features = text_features / text_features.norm(dim=-1, keepdim=True) - - # Calculate similarity (logits) logits_per_image = (100.0 * image_features @ text_features.T) probs = logits_per_image.softmax(dim=-1)[0] @@ -1174,7 +1280,79 @@ def _classify_image(self, rgb_image: np.ndarray) -> Dict: } return schema - + + def _save_classify_all_record(self, classification_data: Dict, latency_s: float = 0.0): + """Append a /vision/classify_all result to classify_all_history.json for the dashboard.""" + try: + from pathlib import Path + history_file = Path(__file__).parent.parent / 'classify_all_history.json' + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + history = json.load(f) + except Exception: + history = [] + + top_pred = classification_data.get('output', {}).get('top_prediction', {}) + all_preds = classification_data.get('output', {}).get('all_predictions', []) + meta = classification_data.get('output', {}).get('metadata', {}) + + record = { + 'call_id': len(history) + 1, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'top_label': top_pred.get('label', ''), + 'top_confidence': float(top_pred.get('confidence', 0.0)), + 'all_predictions': all_preds[:10], # store top-10 + 'processing_time_ms': meta.get('processing_time_ms', 0), + 'latency_s': round(latency_s, 4), + 'device': meta.get('device', ''), + } + + history.append(record) + # Keep only the last 500 records + if len(history) > 500: + history = history[-500:] + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to save classify_all record: {e}') + + def _save_filtered_records(self, filtered_regions: list): + """Append /vision/classify_bbox_filtered results to classify_filtered_history.json. + Each region becomes one record in the format the dashboard CLIP table expects.""" + try: + from pathlib import Path + history_file = Path(__file__).parent.parent / 'classify_filtered_history.json' + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + history = json.load(f) + except Exception: + history = [] + + timestamp = datetime.utcnow().isoformat() + 'Z' + call_id_base = len(history) + 1 + for i, region in enumerate(filtered_regions): + history.append({ + 'test_id': call_id_base + i, + 'timestamp': timestamp, + 'label': region['label'], + 'confidence': float(region['confidence']), + 'top1_accuracy': None, # set by human-in-the-loop verdict + 'bbox': region.get('bbox', {}), + }) + + if len(history) > 1000: + history = history[-1000:] + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to save filtered records: {e}') + def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> Dict: """ Classify multiple image regions using CLIP model @@ -1189,61 +1367,60 @@ def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> D start_time = time.time() classified_regions = [] - + + # Encode text once for all regions. The candidate labels never change + # within a call, so there is no reason to re-tokenize per region. + # Prompt template "a photo of a {label}" matches CLIP's training + # distribution far better than bare label names and measurably improves + # zero-shot accuracy (CLIP paper: +~13 pp on ImageNet). + prompted_labels = [f"a photo of a {lbl}" for lbl in self.candidate_labels] + with torch.no_grad(): + text_tokens = clip.tokenize(prompted_labels).to(self.device) + text_features = self.model.encode_text(text_tokens) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + h, w = rgb_image.shape[:2] + for region_id, bbox in enumerate(bboxes): x1, y1, x2, y2 = bbox - + # Clamp bbox to image bounds - h, w = rgb_image.shape[:2] x1 = max(0, min(x1, w)) x2 = max(0, min(x2, w)) y1 = max(0, min(y1, h)) y2 = max(0, min(y2, h)) - + # Skip invalid boxes if x2 <= x1 or y2 <= y1: self.get_logger().warn(f"Skipping invalid bbox: {bbox}") continue - + # Crop region region_bgr = rgb_image[y1:y2, x1:x2] - - # Convert BGR to RGB region_rgb = cv2.cvtColor(region_bgr, cv2.COLOR_BGR2RGB) pil_image = PILImage.fromarray(region_rgb) - - # Prepare inputs using OpenAI CLIP + image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device) - text_tokens = clip.tokenize(self.candidate_labels).to(self.device) - - # Get predictions + with torch.no_grad(): image_features = self.model.encode_image(image_input) - text_features = self.model.encode_text(text_tokens) - - # Normalize features image_features = image_features / image_features.norm(dim=-1, keepdim=True) - text_features = text_features / text_features.norm(dim=-1, keepdim=True) - - # Calculate similarity (logits) + + # Intra-label softmax: probability over the closed label set. + # Range [0, 1], sums to 1 across labels for this region. logits_per_image = (100.0 * image_features @ text_features.T) probs = logits_per_image.softmax(dim=-1)[0] - - # Convert to numpy + probs_np = probs.cpu().numpy() - - # Sort predictions by confidence sorted_indices = np.argsort(probs_np)[::-1] - - # Build predictions list + all_predictions = [] for idx in sorted_indices: all_predictions.append({ "label": self.candidate_labels[idx], "confidence": round(float(probs_np[idx]), 2) }) - - # Build region result + region_result = { "region_id": region_id, "bbox": [int(x1), int(y1), int(x2), int(y2)], @@ -1251,9 +1428,9 @@ def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> D "label": all_predictions[0]["label"], "confidence": all_predictions[0]["confidence"] }, - "all_predictions": all_predictions[:10] # Top 10 + "all_predictions": all_predictions[:10] } - + classified_regions.append(region_result) # Calculate processing time @@ -1285,244 +1462,173 @@ def _classify_regions(self, rgb_image: np.ndarray, bboxes: List[List[int]]) -> D def visualization_callback(self): """Display camera feed with classification in OpenCV window""" - # Use latest_rgb for real-time display, fallback to captured_frame frame_to_display = self.latest_rgb if self.latest_rgb is not None else self.captured_frame - + if frame_to_display is None: - # Show waiting message blank = np.zeros((480, 640, 3), dtype=np.uint8) - cv2.putText( - blank, - f"Waiting to capture frame from {self.rgb_topic}...", - (50, 240), - cv2.FONT_HERSHEY_SIMPLEX, - 0.8, - (255, 255, 255), - 2 - ) + cv2.putText(blank, f"Waiting for {self.rgb_topic}...", (60, 240), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (160, 160, 160), 1) cv2.imshow(self.window_name, blank) cv2.waitKey(1) return - - # Create visualization image from latest live frame + vis_image = frame_to_display.copy() h, w = vis_image.shape[:2] - - # Check if we have region classifications (from SAM auto-classification) + + # ── Helpers ────────────────────────────────────────────────────────── + COLORS = [ + (0, 200, 255), # amber + (80, 255, 80), # lime + (255, 80, 80), # blue + (255, 0, 200), # magenta + (0, 230, 230), # yellow + (200, 80, 255), # violet + (0, 255, 180), # spring green + (255, 180, 0), # sky blue + ] + FOUND_COLOR = (60, 230, 60) # bright green for "found" object + + def semi_rect(img, x1, y1, x2, y2, fill=(15, 15, 15), alpha=0.75): + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(img.shape[1]-1, x2), min(img.shape[0]-1, y2) + if x2 <= x1 or y2 <= y1: + return + roi = img[y1:y2, x1:x2] + img[y1:y2, x1:x2] = cv2.addWeighted(roi, 1-alpha, np.full_like(roi, fill), alpha, 0) + + def corner_bracket(img, x1, y1, x2, y2, color, lw=2): + clen = max(10, int(min(x2-x1, y2-y1) * 0.15)) + for (px, py, dx, dy) in [(x1,y1,1,1),(x2,y1,-1,1),(x1,y2,1,-1),(x2,y2,-1,-1)]: + cv2.line(img, (px, py), (px + dx*clen, py), color, lw) + cv2.line(img, (px, py), (px, py + dy*clen), color, lw) + + def draw_label(img, text, x, y, color, fs=0.36, ft=1, pad=4, accent=3): + """Draw a semi-transparent dark label with a color accent bar.""" + (lw_px, lh_px), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, fs, ft) + bx1, by1 = x, y - lh_px - pad + bx2, by2 = x + accent + pad + lw_px + pad, y + pad + semi_rect(img, bx1, by1, bx2, by2) + cv2.rectangle(img, (max(0,bx1), max(0,by1)), (max(0,bx1)+accent, max(0,by2)), color, -1) + cv2.putText(img, text, (bx1 + accent + pad, y), + cv2.FONT_HERSHEY_SIMPLEX, fs, (235, 235, 235), ft) + + # ── Region classifications (SAM + CLIP pipeline) ────────────────── if self.latest_region_classifications: - # Draw each classified region with bounding box and label - for region in self.latest_region_classifications: - bbox = region['bbox'] - top_pred = region['top_prediction'] + for ri, region in enumerate(self.latest_region_classifications): + bbox = region['bbox'] + top_pred = region['top_prediction'] region_id = region['region_id'] - - # Draw bounding box - cv2.rectangle( - vis_image, - (bbox[0], bbox[1]), - (bbox[2], bbox[3]), - (0, 255, 255), # Yellow for classified regions - 3 - ) - - # Prepare label text - label = f"#{region_id}: {top_pred['label']}" - conf = f"{top_pred['confidence']:.1%}" - - # Calculate label position (above bbox) - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2) - - # Draw label background - cv2.rectangle( - vis_image, - (bbox[0], bbox[1] - label_size[1] - 25), - (bbox[0] + max(label_size[0], 100), bbox[1]), - (0, 255, 255), - -1 - ) - - # Draw label text - cv2.putText( - vis_image, - label, - (bbox[0] + 5, bbox[1] - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), - 2 - ) - - # Draw confidence - cv2.putText( - vis_image, - conf, - (bbox[0] + 5, bbox[1] - 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 0, 0), - 2 - ) - - # Add info text - info_text = f"Classified Regions: {len(self.latest_region_classifications)}" - cv2.putText( - vis_image, - info_text, - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.8, - (0, 255, 255), - 2 - ) - - # Draw full image classification overlay (if available and no regions) + color = COLORS[ri % len(COLORS)] + x1, y1, x2, y2 = bbox + + # Corner-bracket bbox + thin outline + cv2.rectangle(vis_image, (x1, y1), (x2, y2), color, 1) + corner_bracket(vis_image, x1, y1, x2, y2, color, lw=2) + + # Label: "#id class conf%" + conf_pct = top_pred['confidence'] + label = f"#{region_id} {top_pred['label']} {conf_pct:.0%}" + + # Place above bbox, clamp to image top + ly = y1 - 5 + if ly < 16: + ly = y2 + 16 + draw_label(vis_image, label, x1, ly, color) + + # Confidence bar under bbox top edge + bar_w = x2 - x1 + filled = max(2, int(bar_w * conf_pct)) + cv2.rectangle(vis_image, (x1, y1), (x1 + bar_w, y1 + 3), (40, 40, 40), -1) + cv2.rectangle(vis_image, (x1, y1), (x1 + filled, y1 + 3), color, -1) + + # ── Full-image classification (no regions) ──────────────────────── elif self.latest_classification: - top_pred = self.latest_classification['output']['top_prediction'] - all_preds = self.latest_classification['output']['all_predictions'][:5] # Top 5 - - # Draw semi-transparent overlay at bottom - overlay = vis_image.copy() - cv2.rectangle(overlay, (0, h-150), (w, h), (0, 0, 0), -1) - vis_image = cv2.addWeighted(vis_image, 0.7, overlay, 0.3, 0) - - # Draw top prediction (large) - label_text = f"Top: {top_pred['label']}" - conf_text = f"{top_pred['confidence']:.1%}" - - cv2.putText( - vis_image, - label_text, - (20, h-100), - cv2.FONT_HERSHEY_SIMPLEX, - 1.2, - (0, 255, 0), - 3 - ) - - cv2.putText( - vis_image, - conf_text, - (20, h-60), - cv2.FONT_HERSHEY_SIMPLEX, - 1.0, - (0, 255, 0), - 2 - ) - - # Draw top 5 predictions (smaller, on right) - y_offset = h - 120 + top_pred = self.latest_classification['output']['top_prediction'] + all_preds = self.latest_classification['output']['all_predictions'][:5] + + # Bottom panel + panel_h = 110 + semi_rect(vis_image, 0, h - panel_h, w, h, fill=(12, 12, 12), alpha=0.80) + + # Top prediction + top_label = f"{top_pred['label']}" + top_conf = f"{top_pred['confidence']:.1%}" + cv2.putText(vis_image, top_label, (14, h - panel_h + 26), + cv2.FONT_HERSHEY_SIMPLEX, 0.65, (80, 255, 80), 1) + cv2.putText(vis_image, top_conf, (14, h - panel_h + 46), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (160, 255, 160), 1) + + # Confidence bar for top prediction + bar_max = min(w // 2 - 20, 260) + filled = int(bar_max * top_pred['confidence']) + cv2.rectangle(vis_image, (14, h - panel_h + 52), (14 + bar_max, h - panel_h + 56), (50, 50, 50), -1) + cv2.rectangle(vis_image, (14, h - panel_h + 52), (14 + filled, h - panel_h + 56), (80, 255, 80), -1) + + # Top-5 list on the right + col_x = w - 230 + cv2.putText(vis_image, "Top 5", (col_x, h - panel_h + 18), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (140, 140, 140), 1) for i, pred in enumerate(all_preds): - text = f"{i+1}. {pred['label']}: {pred['confidence']:.1%}" - cv2.putText( - vis_image, - text, - (w - 350, y_offset + i*30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (255, 255, 255), - 2 - ) + row_text = f"{i+1}. {pred['label']}" + row_conf = f"{pred['confidence']:.0%}" + row_y = h - panel_h + 34 + i * 16 + row_color = (200, 200, 200) if i > 0 else (80, 255, 80) + cv2.putText(vis_image, row_text, (col_x, row_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.34, row_color, 1) + cv2.putText(vis_image, row_conf, (w - 42, row_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.34, row_color, 1) + else: - # Show "Call service to classify" message - cv2.putText( - vis_image, - "Call /vision/classify_all or /vision/classify_bb", - (20, h-30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (255, 255, 255), - 2 - ) - - # Draw found object highlight (if available) + # Idle hint at bottom + hint = "Run /vision/classify_all or /vision/classify_bb" + semi_rect(vis_image, 0, h - 24, w, h, fill=(12, 12, 12), alpha=0.70) + cv2.putText(vis_image, hint, (8, h - 8), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (130, 130, 130), 1) + + # ── Found-object highlight ──────────────────────────────────────── if self.latest_found_object: found = self.latest_found_object - bbox = found['bbox'] - - # Draw thick green bounding box for found object - cv2.rectangle( - vis_image, - (bbox[0], bbox[1]), - (bbox[2], bbox[3]), - (0, 255, 0), # Green for found object - 5 - ) - - # Prepare label text - label = f"FOUND: {found['label']}" - conf = f"Conf: {found['confidence']:.2f}" - - # Calculate label position (above bbox) - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.9, 2) - - # Draw label background (green) - cv2.rectangle( - vis_image, - (bbox[0], bbox[1] - label_size[1] - 35), - (bbox[0] + max(label_size[0], 150), bbox[1]), - (0, 255, 0), - -1 - ) - - # Draw label text - cv2.putText( - vis_image, - label, - (bbox[0] + 5, bbox[1] - 15), - cv2.FONT_HERSHEY_SIMPLEX, - 0.9, - (0, 0, 0), - 2 - ) - - # Draw confidence - cv2.putText( - vis_image, - conf, - (bbox[0] + 5, bbox[1] - 40), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), - 2 - ) - - # Add corner markers - corner_size = 15 - # Top-left - cv2.line(vis_image, (bbox[0], bbox[1]), (bbox[0] + corner_size, bbox[1]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[0], bbox[1]), (bbox[0], bbox[1] + corner_size), (0, 255, 0), 5) - # Top-right - cv2.line(vis_image, (bbox[2], bbox[1]), (bbox[2] - corner_size, bbox[1]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[2], bbox[1]), (bbox[2], bbox[1] + corner_size), (0, 255, 0), 5) - # Bottom-left - cv2.line(vis_image, (bbox[0], bbox[3]), (bbox[0] + corner_size, bbox[3]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[0], bbox[3]), (bbox[0], bbox[3] - corner_size), (0, 255, 0), 5) - # Bottom-right - cv2.line(vis_image, (bbox[2], bbox[3]), (bbox[2] - corner_size, bbox[3]), (0, 255, 0), 5) - cv2.line(vis_image, (bbox[2], bbox[3]), (bbox[2], bbox[3] - corner_size), (0, 255, 0), 5) - - # Add title bar - cv2.putText( - vis_image, - f"CLIP Classifier | Frame: {self.frame_counter}", - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), - 4 - ) - - cv2.putText( - vis_image, - f"CLIP Classifier | Frame: {self.frame_counter}", - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (255, 255, 255), - 2 - ) - - # Show image + bbox = found['bbox'] + x1, y1, x2, y2 = bbox + conf = found['confidence'] + + # Pulsing-style: thin outline + thick corner brackets + cv2.rectangle(vis_image, (x1, y1), (x2, y2), FOUND_COLOR, 1) + clen = max(16, int(min(x2-x1, y2-y1) * 0.18)) + for (px, py, dx, dy) in [(x1,y1,1,1),(x2,y1,-1,1),(x1,y2,1,-1),(x2,y2,-1,-1)]: + cv2.line(vis_image, (px, py), (px + dx*clen, py), FOUND_COLOR, 3) + cv2.line(vis_image, (px, py), (px, py + dy*clen), FOUND_COLOR, 3) + + # Center dot + cx, cy = (x1+x2)//2, (y1+y2)//2 + cv2.circle(vis_image, (cx, cy), 5, (15, 15, 15), -1) + cv2.circle(vis_image, (cx, cy), 5, FOUND_COLOR, 2) + cv2.circle(vis_image, (cx, cy), 2, (240, 240, 240), -1) + + # Label + confidence bar + label = f"FOUND {found['label']} {conf:.0%}" + ly = y1 - 5 + if ly < 16: + ly = y2 + 16 + draw_label(vis_image, label, x1, ly, FOUND_COLOR, fs=0.40, pad=5, accent=4) + + bar_w = x2 - x1 + filled = max(2, int(bar_w * conf)) + cv2.rectangle(vis_image, (x1, y2 - 4), (x1 + bar_w, y2), (40, 40, 40), -1) + cv2.rectangle(vis_image, (x1, y2 - 4), (x1 + filled, y2), FOUND_COLOR, -1) + + # ── Top info bar ────────────────────────────────────────────────── + bar_h = 22 + semi_rect(vis_image, 0, 0, w, bar_h, fill=(12, 12, 12), alpha=0.78) + if self.latest_region_classifications: + status = f"CLIP | Regions: {len(self.latest_region_classifications)} | Frame: {self.frame_counter}" + elif self.latest_classification: + status = f"CLIP | Full-image mode | Frame: {self.frame_counter}" + else: + status = f"CLIP Classifier | Frame: {self.frame_counter} | Idle" + cv2.putText(vis_image, status, (8, 15), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (210, 210, 210), 1) + cv2.imshow(self.window_name, vis_image) cv2.waitKey(1) @@ -1550,11 +1656,13 @@ def main(args=None): node = CLIPClassifier(candidate_labels=candidate_labels) # Use MultiThreadedExecutor for ReentrantCallbackGroup - executor = MultiThreadedExecutor() + executor = MultiThreadedExecutor(num_threads=4) executor.add_node(node) try: - executor.spin() + while rclpy.ok(): + executor.spin_once(timeout_sec=0.03) + node.visualization_callback() finally: executor.shutdown() node.destroy_node() diff --git a/vision/find_object_service_node.py b/vision/find_object_service_node.py index ba31db3..aca35af 100644 --- a/vision/find_object_service_node.py +++ b/vision/find_object_service_node.py @@ -5,13 +5,14 @@ from rclpy.callback_groups import ReentrantCallbackGroup from rclpy.executors import MultiThreadedExecutor from custom_interfaces.srv import DetectObjects, FindObject, PixelToReal, FindObjectReal, FindObjectAngleBB, FindMultiObjectReal, FindMultiObject - - - """ ros2 service call /find_object custom_interfaces/srv/FindObjectReal "{label: 'bowl'}" confidence calculate by SAM detection based + +json file kept at +/home/group11/final_project_ws/install/vision/lib/python3.12/site-packages/find_object_history.json + """ # TCP_OFFSET = 0.157 # Actual TCP_OFFSET value from teach pendant @@ -26,6 +27,10 @@ def __init__(self): self.declare_parameter('tcp_offset', False) self.tcp_offset = bool(self.get_parameter('tcp_offset').value) + # Minimum confidence threshold for find_object responses + self.declare_parameter('find_object_min_confidence', 0.255) + self.find_object_min_confidence = float(self.get_parameter('find_object_min_confidence').value) + # Use reentrant callback group to allow nested service calls self.callback_group = ReentrantCallbackGroup() @@ -77,7 +82,7 @@ def __init__(self): ) self.get_logger().info('Find Object Service Node initialized') - + # Wait for services to be available self.wait_for_services() @@ -102,6 +107,43 @@ def wait_for_services(self): self.get_logger().info('Service clients ready') + def _log_find_object_call(self, label, response): + """Append a /find_object call result to find_object_history.json for the dashboard.""" + try: + with self._history_lock: + history = [] + if self._history_file.exists(): + try: + with open(self._history_file, 'r') as f: + history = json.load(f) + if not isinstance(history, list): + history = [] + except Exception: + history = [] + + call_id = len(history) + 1 + entry = { + 'call_id': call_id, + 'timestamp': datetime.now().isoformat(), + 'label_searched': label, + 'success': bool(response.success), + 'message': response.message, + 'object_id': response.object_id, + 'bbox': [int(v) for v in response.bbox] if response.bbox else [], + 'confidence': float(response.confidence), + 'x': float(response.x), + 'y': float(response.y), + 'z': float(response.z), + 'theta': float(response.theta), + 'verdict': None, + } + history.append(entry) + history = history[-50:] # keep last 50 + with open(self._history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to log find_object call: {e}') + def find_object_callback(self, request, response): """ Main service callback for /find_object @@ -173,6 +215,26 @@ def find_object_callback(self, request, response): response.z = 0.0 response.theta = 0.0 return response + + if find_response.confidence < self.find_object_min_confidence: + response.success = False + response.message = ( + f'Object confidence too low ({find_response.confidence:.3f} < ' + f'{self.find_object_min_confidence:.3f}); cannot reliably find object in camera' + ) + response.object_id = '' + response.bbox = [] + response.confidence = float(find_response.confidence) + response.x = 0.0 + response.y = 0.0 + response.z = 0.0 + response.theta = 0.0 + self.get_logger().warn( + f'find_object confidence below threshold: {find_response.confidence:.3f} ' + f'(threshold={self.find_object_min_confidence:.3f})' + ) + self._log_find_object_call(label, response) + return response # Extract object_id from find_response object_id = find_response.object_id if hasattr(find_response, 'object_id') else '' @@ -266,7 +328,8 @@ def find_object_callback(self, request, response): response.y = 0.0 response.z = 0.0 response.theta = 0.0 - + + self._log_find_object_call(label, response) return response diff --git a/vision/obb_angle_service_node.py b/vision/obb_angle_service_node.py index 9bdf7d6..0518fa8 100644 --- a/vision/obb_angle_service_node.py +++ b/vision/obb_angle_service_node.py @@ -37,13 +37,18 @@ from rclpy.node import Node from rclpy.callback_groups import ReentrantCallbackGroup from rclpy.executors import MultiThreadedExecutor +from rclpy.qos import QoSProfile, ReliabilityPolicy, DurabilityPolicy from custom_interfaces.srv import FindObjectAngleBB, FindObjectAngle, DetectObjects from custom_interfaces.msg import SAMDetections from sensor_msgs.msg import Image, CameraInfo from cv_bridge import CvBridge +import json import numpy as np import cv2 import time +import threading +from datetime import datetime +from pathlib import Path class OBBAngleServiceNode(Node): @@ -55,8 +60,10 @@ class OBBAngleServiceNode(Node): def __init__(self): super().__init__('obb_angle_service_node') - # Use reentrant callback group for nested service calls - self.callback_group = ReentrantCallbackGroup() + # Keep subscriptions and services in separate callback groups so + # detection updates can be processed while service callbacks are waiting. + self.subscription_callback_group = ReentrantCallbackGroup() + self.service_callback_group = ReentrantCallbackGroup() # Storage for latest detections and images self.latest_detections = None @@ -66,8 +73,13 @@ def __init__(self): self.bridge = CvBridge() # Thread lock for thread-safe access - import threading self.detections_lock = threading.Lock() + self.detections_condition = threading.Condition(self.detections_lock) + self.latest_detections_stamp_ns = 0 + + # Queue visualization work so service callbacks can return immediately. + self.viz_lock = threading.Lock() + self.pending_viz = None # OpenCV visualization window (unified for both single and multi-object) self.window_name = 'OBB Angle Detection' @@ -80,13 +92,21 @@ def __init__(self): self.rgb_topic = '/camera/color/image_raw' else: self.rgb_topic = '/camera/image_raw' + + # Best-effort QoS is generally more compatible with high-rate detector topics. + self.sam_qos = QoSProfile( + depth=10, + reliability=ReliabilityPolicy.BEST_EFFORT, + durability=DurabilityPolicy.VOLATILE, + ) # Subscribe to RGB camera for visualization self.rgb_subscription = self.create_subscription( Image, self.rgb_topic, self.rgb_callback, - 10 + 10, + callback_group=self.subscription_callback_group, ) # Subscribe to SAM detections for multi-object OBB @@ -94,7 +114,8 @@ def __init__(self): SAMDetections, '/vision/sam_detections', self.sam_detections_callback, - 10 + self.sam_qos, + callback_group=self.subscription_callback_group, ) # Subscribe to depth camera @@ -102,7 +123,8 @@ def __init__(self): Image, "/camera/depth/image_rect_raw", self.depth_callback, - 10 + 10, + callback_group=self.subscription_callback_group, ) # Subscribe to camera info @@ -110,14 +132,15 @@ def __init__(self): CameraInfo, "/camera/color/camera_info", self.info_callback, - 10 + 10, + callback_group=self.subscription_callback_group, ) # Service client for real-time detection self.detect_objects_client = self.create_client( DetectObjects, '/vision/detect_objects', - callback_group=self.callback_group + callback_group=self.service_callback_group ) # Create OBB angle service servers @@ -125,22 +148,23 @@ def __init__(self): FindObjectAngleBB, '/obb/find_object_angle_bb', self.find_object_angle_bb_callback, - callback_group=self.callback_group + callback_group=self.service_callback_group ) self.find_object_angle_srv = self.create_service( FindObjectAngle, '/obb/find_object_angle', self.find_object_angle_callback, - callback_group=self.callback_group + callback_group=self.service_callback_group ) # Create OpenCV window (unified) cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) cv2.resizeWindow(self.window_name, 1200, 800) - # Create timer for continuous window update (keeps window responsive) - self.viz_timer = self.create_timer(0.1, self.keep_window_alive) + # GUI updates are pumped from the main thread in main() to avoid + # HighGUI freezes under multi-threaded executors. + self.viz_timer = None self.get_logger().info('=' * 80) self.get_logger().info('OBB Angle Service Node Started') @@ -152,6 +176,7 @@ def __init__(self): self.get_logger().info(' - /obb/find_object_angle (All objects OBB)') self.get_logger().info('Subscriptions:') self.get_logger().info(' - /vision/sam_detections (SAMDetections)') + self.get_logger().info(' - /vision/sam_detections QoS: BEST_EFFORT, VOLATILE, depth=10') self.get_logger().info(f' - {self.rgb_topic} (Image)') self.get_logger().info(' - /camera/depth/image_rect_raw (Image)') self.get_logger().info(' - /camera/color/camera_info (CameraInfo)') @@ -177,9 +202,28 @@ def rgb_callback(self, msg: Image): def sam_detections_callback(self, msg: SAMDetections): """Store latest SAM detections""" - with self.detections_lock: + with self.detections_condition: self.latest_detections = msg + self.latest_detections_stamp_ns = time.monotonic_ns() + self.detections_condition.notify_all() + if not hasattr(self, '_sam_received'): + self._sam_received = True + self.get_logger().info(f'First SAM detections received: {msg.total_detections} objects') self.get_logger().debug(f'Received SAM detections: {msg.total_detections} objects') + + def wait_for_sam_detections(self, timeout_sec=1.5, min_stamp_ns=0): + """Wait until at least one SAM detection message is available.""" + deadline = time.monotonic() + timeout_sec + with self.detections_condition: + while True: + if self.latest_detections is not None and self.latest_detections_stamp_ns >= min_stamp_ns: + return True + + remaining = deadline - time.monotonic() + if remaining <= 0.0: + return False + + self.detections_condition.wait(timeout=min(0.05, remaining)) def depth_callback(self, msg: Image): """Store latest depth image""" @@ -194,9 +238,28 @@ def info_callback(self, msg: CameraInfo): def keep_window_alive(self): """Timer callback to keep OpenCV window responsive""" - # This just calls waitKey to process window events - # Without this, the window may freeze or not display properly - cv2.waitKey(1) + # Render queued visualization outside service callbacks to avoid + # blocking service responses on GUI operations. + pending = None + with self.viz_lock: + if self.pending_viz is not None: + pending = self.pending_viz + self.pending_viz = None + + if pending is not None: + results, mode = pending + try: + self.visualize_obb(results, mode=mode) + except Exception as e: + self.get_logger().warn(f'Visualization update failed: {e}') + else: + # Process window events even when no new frame is queued. + cv2.waitKey(1) + + def queue_visualization(self, results, mode="auto"): + """Queue the latest visualization payload for timer-based rendering.""" + with self.viz_lock: + self.pending_viz = (results, mode) def calculate_obb_from_bbox(self, x1, y1, x2, y2, mask=None): """ @@ -341,149 +404,165 @@ def visualize_obb(self, results, mode="auto"): """ if self.latest_rgb_image is None: self.get_logger().warn('No RGB image available for visualization') - # Show a blank placeholder window - blank = np.zeros((800, 1200, 3), dtype=np.uint8) - cv2.putText(blank, "Waiting for RGB camera image...", - (300, 400), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 255, 255), 3) - cv2.putText(blank, f"RGB Topic: {self.rgb_topic}", - (350, 450), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (128, 128, 128), 2) + blank = np.zeros((480, 640, 3), dtype=np.uint8) + cv2.putText(blank, "Waiting for RGB camera...", (80, 230), + cv2.FONT_HERSHEY_SIMPLEX, 0.65, (160, 160, 160), 1) + cv2.putText(blank, self.rgb_topic, (80, 258), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (100, 100, 100), 1) cv2.imshow(self.window_name, blank) cv2.waitKey(1) return - + vis_image = self.latest_rgb_image.copy() - + img_h, img_w = vis_image.shape[:2] + # Auto-detect mode if mode == "auto": mode = "single" if len(results) == 1 else "multi" - - # Color palette for objects - colors = [ - (255, 255, 0), # Cyan - (255, 0, 255), # Magenta - (0, 255, 255), # Yellow - (255, 128, 0), # Orange - (128, 255, 0), # Lime - (0, 255, 128), # Spring Green - (255, 0, 128), # Pink - (128, 0, 255), # Purple + + # Distinct color palette (BGR) + COLORS = [ + (0, 200, 255), # amber + (80, 255, 80), # lime + (255, 80, 80), # blue + (255, 0, 200), # magenta + (0, 230, 230), # yellow + (200, 80, 255), # violet + (0, 255, 180), # spring green + (255, 180, 0), # sky blue ] - - # Process each OBB + + def draw_corner_bracket(img, pts, color, lw=2): + """Draw corner-bracket accents on an OBB polygon.""" + n = len(pts) + for i in range(n): + A = pts[(i - 1) % n].astype(float) + B = pts[i].astype(float) + C = pts[(i + 1) % n].astype(float) + ab = A - B; ab_len = np.linalg.norm(ab) + cb = C - B; cb_len = np.linalg.norm(cb) + if ab_len == 0 or cb_len == 0: + continue + clen = max(10, int(min(ab_len, cb_len) * 0.22)) + p1 = (B + (ab / ab_len) * clen).astype(int) + p2 = (B + (cb / cb_len) * clen).astype(int) + cv2.line(img, tuple(B.astype(int)), tuple(p1), color, lw) + cv2.line(img, tuple(B.astype(int)), tuple(p2), color, lw) + + def semi_transparent_rect(img, x1, y1, x2, y2, fill, alpha=0.72): + """Blend a dark rectangle over a sub-region.""" + x1 = max(0, x1); y1 = max(0, y1) + x2 = min(img.shape[1] - 1, x2); y2 = min(img.shape[0] - 1, y2) + if x2 <= x1 or y2 <= y1: + return + roi = img[y1:y2, x1:x2] + bg = np.full_like(roi, fill) + img[y1:y2, x1:x2] = cv2.addWeighted(roi, 1 - alpha, bg, alpha, 0) + + # ── Process each OBB ───────────────────────────────────────────────── for idx, result_tuple in enumerate(results): - # Unpack (handle optional bbox) if len(result_tuple) == 7: object_id, u, v, theta, width, height, bbox = result_tuple else: object_id, u, v, theta, width, height = result_tuple bbox = None - - color = colors[idx % len(colors)] - - # Draw AABB first if provided (for single object mode) - if bbox is not None and mode == "single": - x1, y1, x2, y2 = bbox - cv2.rectangle(vis_image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2) - cv2.putText(vis_image, "Input AABB", (int(x1), int(y1) - 5), - cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2) - - # Get OBB corner points - box_points = self.get_obb_corner_points(u, v, theta, width, height) - - # Draw OBB - cv2.drawContours(vis_image, [box_points], 0, color, 3) - - # Draw center point - if mode == "single": - cv2.circle(vis_image, (int(u), int(v)), 8, (0, 0, 255), -1) # Red center - cv2.circle(vis_image, (int(u), int(v)), 10, (255, 255, 255), 2) # White outline - else: - cv2.circle(vis_image, (int(u), int(v)), 6, (255, 255, 255), -1) - cv2.circle(vis_image, (int(u), int(v)), 8, color, 2) - - # Draw angle arrow perpendicular to WIDTH (shorter dimension), with -90° offset so 0° points UP - arrow_length = height # Use height (shorter dimension) for arrow length - # Arrow perpendicular to width = add 90° to theta, then -90° for visualization - visual_theta = theta + np.pi / 2 - np.pi / 2 # Perpendicular, then visualization offset - end_x = int(u + arrow_length * np.cos(visual_theta)) - end_y = int(v + arrow_length * np.sin(visual_theta)) - arrow_thickness = 3 if mode == "single" else 2 - cv2.arrowedLine(vis_image, (int(u), int(v)), (end_x, end_y), - (255, 0, 255) if mode == "single" else color, - arrow_thickness, tipLength=0.3) - - # Draw label (use remapped angle: 90deg - original geometry angle) + + color = COLORS[idx % len(COLORS)] + cx, cy = int(u), int(v) angle_geom_deg = np.rad2deg(theta) angle_deg = 90.0 - angle_geom_deg - + + # ── Input AABB (single mode only) ───────────────────────────── + if bbox is not None and mode == "single": + ax1, ay1, ax2, ay2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) + # Dashed-style thin rectangle (draw as corner brackets only) + aabb_pts = np.array([[ax1,ay1],[ax2,ay1],[ax2,ay2],[ax1,ay2]]) + draw_corner_bracket(vis_image, aabb_pts, (80, 200, 80), lw=1) + cv2.rectangle(vis_image, (ax1, ay1), (ax2, ay2), (80, 200, 80), 1) + # Small label + ts, _ = cv2.getTextSize("AABB", cv2.FONT_HERSHEY_SIMPLEX, 0.33, 1) + semi_transparent_rect(vis_image, ax1, ay1 - ts[1] - 6, ax1 + ts[0] + 4, ay1, (18, 18, 18)) + cv2.putText(vis_image, "AABB", (ax1 + 2, ay1 - 3), + cv2.FONT_HERSHEY_SIMPLEX, 0.33, (80, 200, 80), 1) + + # ── OBB outline ─────────────────────────────────────────────── + box_pts = self.get_obb_corner_points(u, v, theta, width, height) + # Thin full outline + cv2.drawContours(vis_image, [box_pts], 0, color, 1) + # Corner bracket accents + draw_corner_bracket(vis_image, box_pts, color, lw=2) + + # ── Center dot ──────────────────────────────────────────────── + cv2.circle(vis_image, (cx, cy), 5, (15, 15, 15), -1) # dark fill + cv2.circle(vis_image, (cx, cy), 5, color, 2) # color ring + cv2.circle(vis_image, (cx, cy), 2, (240, 240, 240), -1) # white center + + # ── Angle arrow ─────────────────────────────────────────────── + visual_theta = theta + np.pi / 2 - np.pi / 2 + arrow_len = int(max(height * 0.55, 20)) + end_x = int(cx + arrow_len * np.cos(visual_theta)) + end_y = int(cy + arrow_len * np.sin(visual_theta)) + cv2.arrowedLine(vis_image, (cx, cy), (end_x, end_y), + color, 2, tipLength=0.35) + + # ── Info panel (single) / compact label (multi) ─────────────── if mode == "single": - # Concise info box for single object info_lines = [ f"{object_id}", - f"Center: ({int(u)}, {int(v)})", - f"Angle: {angle_deg:.1f}deg", - f"Size: {width:.0f}x{height:.0f}" + f"Center ({cx}, {cy})", + f"Angle {angle_deg:.1f} deg", + f"Size {width:.0f} x {height:.0f} px", ] - - # Draw compact info box at top-right - font_scale = 0.6 - font_thickness = 2 - line_spacing = 25 - - max_width = 0 - for line in info_lines: - (text_w, text_h), _ = cv2.getTextSize(line, cv2.FONT_HERSHEY_SIMPLEX, - font_scale, font_thickness) - max_width = max(max_width, text_w) - - box_height = len(info_lines) * line_spacing + 15 - box_x = vis_image.shape[1] - max_width - 25 - box_y = 15 - - # Background with transparency effect - overlay = vis_image.copy() - cv2.rectangle(overlay, (box_x - 8, box_y - 8), - (box_x + max_width + 8, box_y + box_height), (0, 0, 0), -1) - cv2.addWeighted(overlay, 0.7, vis_image, 0.3, 0, vis_image) - + fs, ft = 0.42, 1 + pad = 8 + line_h = 20 + max_w = max(cv2.getTextSize(l, cv2.FONT_HERSHEY_SIMPLEX, fs, ft)[0][0] + for l in info_lines) + panel_w = max_w + pad * 2 + 4 # +4 for accent bar + panel_h = len(info_lines) * line_h + pad + px = img_w - panel_w - 10 + py = 28 # sits below top bar + + # Background + semi_transparent_rect(vis_image, px - 2, py, px + panel_w, py + panel_h, (15, 15, 15)) + # Left accent bar + cv2.rectangle(vis_image, (px - 2, py), (px + 2, py + panel_h), color, -1) # Border - cv2.rectangle(vis_image, (box_x - 8, box_y - 8), - (box_x + max_width + 8, box_y + box_height), color, 2) - - # Text lines + cv2.rectangle(vis_image, (px - 2, py), (px + panel_w, py + panel_h), color, 1) + for i, line in enumerate(info_lines): - y_pos = box_y + (i * line_spacing) + 18 - cv2.putText(vis_image, line, (box_x, y_pos), - cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, font_thickness) + ty = py + pad + i * line_h + line_h // 2 + # Dim the label for the first line (object id) — draw in color + text_color = color if i == 0 else (210, 210, 210) + cv2.putText(vis_image, line, (px + 6, ty), + cv2.FONT_HERSHEY_SIMPLEX, fs, text_color, ft) else: - # Compact label for multi-object - label = f"#{idx} {angle_deg:.1f}deg" - label_x = int(u + 15) - label_y = int(v) - - # Text with outline - cv2.putText(vis_image, label, (label_x, label_y), - cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 5) - cv2.putText(vis_image, label, (label_x, label_y), - cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2) - - # Add concise title - if mode == "single": - title = "OBB Detection" - else: - title = f"OBB Detection ({len(results)} objects)" - - cv2.putText(vis_image, title, (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 3) - cv2.putText(vis_image, title, (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 255), 2) - - # Add concise legend at bottom - legend_y = vis_image.shape[0] - 20 - legend_text = "0deg = Vertical | Range: -90deg to +90deg" - - cv2.putText(vis_image, legend_text, (10, legend_y), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2) - - # Display + # Compact floating label near center + label = f"#{idx} {angle_deg:.1f}°" + fs, ft = 0.38, 1 + (lw_px, lh_px), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, fs, ft) + pad = 4 + lx = cx + 12 + ly = cy - 6 + # Clamp + lx = min(lx, img_w - lw_px - pad * 2 - 5) + ly = max(ly, lh_px + pad + 2) + + semi_transparent_rect(vis_image, lx - 2, ly - lh_px - pad, + lx + lw_px + pad * 2 + 3, ly + pad, (15, 15, 15)) + cv2.rectangle(vis_image, (lx - 2, ly - lh_px - pad), + (lx + 3, ly + pad), color, -1) # accent bar + cv2.putText(vis_image, label, (lx + 5, ly), + cv2.FONT_HERSHEY_SIMPLEX, fs, (230, 230, 230), ft) + + # ── Top info bar ───────────────────────────────────────────────────── + bar_h = 24 + obj_count = len(results) + title = f"OBB Detection | Objects: {obj_count} | 0 deg = Vertical" + semi_transparent_rect(vis_image, 0, 0, img_w, bar_h, (12, 12, 12), alpha=0.78) + cv2.putText(vis_image, title, (8, 16), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (210, 210, 210), 1) + cv2.imshow(self.window_name, vis_image) cv2.waitKey(1) @@ -552,11 +631,15 @@ def find_object_angle_bb_callback(self, request, response): return response # Trigger real-time detection to get fresh detections + request_stamp_ns = time.monotonic_ns() self.get_logger().info('Triggering real-time detection...') detection_success = self.trigger_real_time_detection() - - # Small delay to ensure detections are updated - time.sleep(0.1) + min_stamp_ns = request_stamp_ns if detection_success else 0 + + # Wait for SAM callback instead of fixed sleep. + got_sam = self.wait_for_sam_detections(timeout_sec=1.5, min_stamp_ns=min_stamp_ns) + if not got_sam: + self.get_logger().warn('Timed out waiting for SAM detections update') # Get latest detections with self.detections_lock: @@ -635,38 +718,87 @@ def find_object_angle_bb_callback(self, request, response): theta_result = np.deg2rad(angle_result_deg) # Populate response (use remapped theta, keep geometry unchanged) + angle_deg = angle_result_deg response.success = True - response.message = f'OBB calculated for {best_detection.object_id} within bbox [{request.x1}, {request.y1}, {request.x2}, {request.y2}]' + response.message = json.dumps({ + "success": True, + "object_id": best_detection.object_id, + "input_bbox": [request.x1, request.y1, request.x2, request.y2], + "center": {"u": round(u, 2), "v": round(v, 2)}, + "theta_rad": round(float(theta_result), 6), + "angle_deg": round(float(angle_deg), 2), + "width_px": round(float(width), 2), + "height_px": round(float(height), 2), + "iou_with_request": round(float(best_iou), 4), + "timestamp": datetime.utcnow().isoformat() + "Z", + }) response.u = u response.v = v response.theta = theta_result response.width = width response.height = height - + # Log results concisely - angle_deg = angle_result_deg self.get_logger().info('=' * 60) self.get_logger().info(f'OBB Result ({best_detection.object_id}): center=({u:.1f},{v:.1f}), angle={angle_deg:.1f}deg, size={width:.0f}x{height:.0f}') self.get_logger().info('=' * 60) - - # Visualize with unified function - pass the INPUT bbox from request for visualization + + # Persist to history file for dashboard + self._save_obb_bb_record(best_detection.object_id, request, u, v, theta_result, angle_deg, width, height, best_iou) + + # Queue visualization; do not block service response path. viz_data = [(best_detection.object_id, u, v, theta_geom, width, height, [request.x1, request.y1, request.x2, request.y2])] - self.visualize_obb(viz_data, mode="single") + self.queue_visualization(viz_data, mode="single") except Exception as e: self.get_logger().error(f'Error in find_object_angle_bb: {e}') import traceback self.get_logger().error(traceback.format_exc()) response.success = False - response.message = f'Internal error: {str(e)}' + response.message = json.dumps({"success": False, "error": str(e)}) response.u = 0.0 response.v = 0.0 response.theta = 0.0 response.width = 0.0 response.height = 0.0 - + return response - + + def _save_obb_bb_record(self, object_id, request, u, v, theta_rad, angle_deg, width, height, iou): + """Persist /obb/find_object_angle_bb result to obb_bb_history.json for the dashboard.""" + try: + history_file = Path(__file__).parent.parent / 'obb_bb_history.json' + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + history = json.load(f) + except Exception: + history = [] + + record = { + 'call_id': len(history) + 1, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'object_id': object_id, + 'input_bbox': [request.x1, request.y1, request.x2, request.y2], + 'center_u': round(float(u), 2), + 'center_v': round(float(v), 2), + 'theta_rad': round(float(theta_rad), 6), + 'angle_deg': round(float(angle_deg), 2), + 'width_px': round(float(width), 2), + 'height_px': round(float(height), 2), + 'iou': round(float(iou), 4), + } + + history.append(record) + if len(history) > 500: + history = history[-500:] + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + except Exception as e: + self.get_logger().warn(f'Failed to save obb_bb record: {e}') + def find_object_angle_callback(self, request, response): """ Service callback for /obb/find_object_angle @@ -678,11 +810,15 @@ def find_object_angle_callback(self, request, response): try: # Trigger real-time detection + request_stamp_ns = time.monotonic_ns() self.get_logger().info('Triggering real-time detection for all objects...') detection_success = self.trigger_real_time_detection() - - # Small delay to ensure detections are updated - time.sleep(0.1) + min_stamp_ns = request_stamp_ns if detection_success else 0 + + # Wait for SAM callback instead of fixed sleep. + got_sam = self.wait_for_sam_detections(timeout_sec=1.5, min_stamp_ns=min_stamp_ns) + if not got_sam: + self.get_logger().warn('Timed out waiting for SAM detections update') # Get latest detections with self.detections_lock: @@ -791,7 +927,7 @@ def find_object_angle_callback(self, request, response): # Visualize all objects with unified function if len(viz_results) > 0: - self.visualize_obb(viz_results, mode="multi") + self.queue_visualization(viz_results, mode="multi") except Exception as e: self.get_logger().error(f'Error in find_object_angle: {e}') @@ -822,15 +958,18 @@ def main(args=None): node = OBBAngleServiceNode() # Use MultiThreadedExecutor for non-blocking service calls - executor = MultiThreadedExecutor() + executor = MultiThreadedExecutor(num_threads=4) executor.add_node(node) try: node.get_logger().info('OBB Angle Service Node spinning...') - executor.spin() + while rclpy.ok(): + executor.spin_once(timeout_sec=0.03) + node.keep_window_alive() except KeyboardInterrupt: node.get_logger().info('Shutting down OBB Angle Service Node...') finally: + executor.shutdown() cv2.destroyAllWindows() node.destroy_node() rclpy.shutdown() diff --git a/vision/pixel_to_real.py b/vision/pixel_to_real.py index 1003e1c..21eec56 100644 --- a/vision/pixel_to_real.py +++ b/vision/pixel_to_real.py @@ -30,30 +30,30 @@ cd ~/final_project_ws colcon build --packages-select custom_interfaces source install/setup.bash - + 2. Build the vision package: colcon build --packages-select vision --symlink-install source install/setup.bash - + 3. Start the pixel_to_real service node: ros2 run vision pixel_to_real_service Usage: # Example: green box at pixel (320, 240) → world (0.5, 0.0, 0.8) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 320, v: 240}" - + # Example: gear part at pixel (305, 95) → world (0.83, 0.03, 0.8) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 305, v: 95}" - + # Example: drill at pixel (466, 160.5) → world (0.571546, -0.240961, 0.831898) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 466, v: 160}" - + # Example: monkey wrench at pixel (150, 200) → world (0.623673, 0.372909, 0.806652) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 150, v: 200}" - + # Example: origin at pixel (320, 500) → world (0.0, 0.0, 0.8) ros2 service call /pixel_to_real custom_interfaces/srv/PixelToReal "{u: 320, v: 500}" - + # View debug visualization: rqt_image_view /pixel_to_real/debug_image @@ -126,7 +126,7 @@ def __init__(self, self.color_encoding = 'passthrough' self.depth_32_encoding = 'passthrough' self.depth_16_encoding = 'passthrough' - + # Note: We DO NOT initialize RealSense pipeline here! # The camera is published by a separate node (e.g., realsense-ros) # We only subscribe to the published topics @@ -162,43 +162,43 @@ def __init__(self, # Origin: (u=320, v=500) -> (x=0, y=0, z=0.8) # Green box: (u=320, v=240) -> (x=0.5, y=0, z=0.8) # Gear: (u=305, v=95) -> (x=0.83, y=0.03, z=0.8) - + # Coordinate system mapping: # u increases right -> y DECREASES (u represents -y direction) # v increases down -> x DECREASES (v represents -x direction) - + self.u_origin = 320 # u=320 corresponds to y=0 self.v_origin = 500 # v=500 corresponds to x=0 - + # Calculate scaling factors from calibration points: # Green box: du=0, dv=-260 pixels -> dx=0.5, dy=0 meters # Gear: du=-15, dv=-405 pixels -> dx=0.83, dy=0.03 meters - + # From green box (vertical movement in image): # dv = 240 - 500 = -260 pixels (up in image) # dx = 0.5 - 0 = 0.5 meters (positive x, which is up) # scale_x = 0.5 / 260 = 0.00192 m/pixel - + # From gear (horizontal AND vertical movement): # du = 305 - 320 = -15 pixels (left in image) # dy = 0.03 - 0 = 0.03 meters (positive y, which is left) # scale_y = 0.03 / 15 = 0.002 m/pixel - + dv_green = 240 - 500 # -260 pixels (up in image) dx_green = 0.5 - 0 # 0.5 meters (positive x in world) - + du_gear = 305 - 320 # -15 pixels (left in image) dy_gear = 0.03 - 0 # 0.03 meters (positive y in world) - + self.scale_x = abs(dx_green / dv_green) # 0.5/260 = 0.00192 m/pixel self.scale_y = abs(dy_gear / du_gear) # 0.03/15 = 0.002 m/pixel - + # Depth calibration: Store reference depth for z-coordinate conversion # At calibration points, z should be 0.8m (table height) # We'll measure the actual depth sensor reading and use it as reference self.z_table = 0.8 # World z-coordinate of table surface self.depth_reference = None # Will be set from first depth reading at calibration point - + self.get_logger().info(f'Pixel-to-world calibration: scale_x={self.scale_x:.6f} m/px, scale_y={self.scale_y:.6f} m/px') self.get_logger().info(f'Origin: pixel({self.u_origin}, {self.v_origin}) -> world(0, 0, 0.8)') self.get_logger().info(f'Coordinate mapping: u right=-y, v down=-x') @@ -267,34 +267,34 @@ def info_cb(self, msg: CameraInfo): def pixel_to_world_calibrated(self, u: int, v: int, depth_m: float): """Convert pixel (u,v) to world coordinates (x,y,z) using calibration. - + Coordinate system: - u increases right -> y DECREASES (u represents -y direction) - v increases down -> x DECREASES (v represents -x direction) - Origin at pixel (320, 500) = world (0, 0, 0.8) - Depth is inversely related to z: small depth = high z (near camera, far from ground) - + Args: u: pixel column (positive right) v: pixel row (positive down) depth_m: depth in meters from camera (from depth sensor) - + Returns: (x, y, z) in world coordinates (meters) """ # Calculate pixel offset from origin du = u - self.u_origin # positive = right in image dv = v - self.v_origin # positive = down in image - + # Apply transformation based on coordinate mapping: # v down (-dv up) -> x increases: x = -dv * scale_x # u right (-du left) -> y increases: y = -du * scale_y x = -dv * self.scale_x # Up in image -> positive x y = -du * self.scale_y # Left in image -> positive y - + # Apply y-offset for robot reference frame # y = y - 0.5442 # Shift y by -0.5442 meters - + # Convert depth to z-coordinate # Depth is inversely related to z: smaller depth = further from ground = higher z # At table (z=0.8), we need to calibrate based on actual depth reading @@ -311,23 +311,23 @@ def pixel_to_world_calibrated(self, u: int, v: int, depth_m: float): self.depth_reference = depth_m self.get_logger().info(f'Set depth reference: {self.depth_reference:.3f}m at z={self.z_table}m') z = self.z_table - + return (x, y, z) def read_depth_at(self, u: float, v: float, max_search: int = 5): """Read depth with bilinear interpolation; if invalid, search a median window. Returns depth in meters. Assumes table is at 0.8m depth. - + Handles invalid depth (NaN/zero/inf) common on reflective surfaces or sensor noise by searching a 5-pixel neighborhood and taking the median of valid depth values. - + The depth sensor returns distance from camera. We assume the table surface is at 0.8m from the camera, which should be the largest/most common depth value. """ if self.latest_depth is None: self.get_logger().warn('No depth image available, using default table depth 0.8m') return 0.8 # Default table depth - + depth = self.latest_depth h, w = depth.shape[:2] if not (0 <= int(v) < h and 0 <= int(u) < w): @@ -374,11 +374,11 @@ def bilinear(u_, v_): d_val = bilinear(uu, vv) if d_val is not None: valid_depths.append(d_val) - + if len(valid_depths) == 0: self.get_logger().warn(f'No valid depth found within {max_search}px of ({u:.1f},{v:.1f}), using table depth 0.8m') return 0.8 # Default table depth - + # Use median to be robust against outliers median_depth = float(np.median(valid_depths)) return median_depth @@ -400,43 +400,43 @@ def handle_pixel_to_real(self, req, resp): # Get depth at this pixel (default to table depth 0.8m) depth_m = self.read_depth_at(float(u), float(v)) - + # Convert pixel to world coordinates using calibration x_w, y_w, z_w = self.pixel_to_world_calibrated(u, v, depth_m) - + # DEBUG: Visualize the pixel location and world coordinates on the image if self.latest_rgb is not None: debug_img = self.latest_rgb.copy() - + # Draw a large crosshair at the requested pixel cv2.drawMarker(debug_img, (u, v), (0, 0, 255), cv2.MARKER_CROSS, 40, 3) - + # Draw a circle around it cv2.circle(debug_img, (u, v), 20, (0, 255, 0), 2) - + # Add text label with pixel coordinates label_pixel = f"Pixel: ({u}, {v})" cv2.putText(debug_img, label_pixel, (u + 25, v - 45), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2) - + # Add text label with world coordinates label_world = f"World: ({x_w:.3f}, {y_w:.3f}, {z_w:.3f})m" cv2.putText(debug_img, label_world, (u + 25, v - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2) - + # Add depth label label_depth = f"Depth: {depth_m:.3f}m" cv2.putText(debug_img, label_depth, (u + 25, v + 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2) - + # Draw origin marker at (320, 500) origin_u, origin_v = self.u_origin, self.v_origin if 0 <= origin_u < debug_img.shape[1] and 0 <= origin_v < debug_img.shape[0]: - cv2.drawMarker(debug_img, (origin_u, origin_v), (255, 255, 0), + cv2.drawMarker(debug_img, (origin_u, origin_v), (255, 255, 0), cv2.MARKER_TILTED_CROSS, 30, 2) cv2.putText(debug_img, "Origin (0,0)", (origin_u + 10, origin_v - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2) - + # Draw coordinate axes for reference cv2.putText(debug_img, "u -> (right)", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) @@ -446,7 +446,7 @@ def handle_pixel_to_real(self, req, resp): cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) cv2.putText(debug_img, "v (down)", (10, 85), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) - + # Publish debug image try: debug_msg = self.bridge.cv2_to_imgmsg(debug_img, encoding='bgr8') @@ -472,7 +472,7 @@ def handle_pixel_to_real(self, req, resp): resp.x = float(x_w) resp.y = float(y_w) resp.z = float(z_w) - + # Check if this is a known calibration point and log the error for calib_point in self.validation_points: if abs(u - calib_point["pixel"][0]) <= 1 and abs(v - calib_point["pixel"][1]) <= 1: @@ -481,7 +481,7 @@ def handle_pixel_to_real(self, req, resp): error_y = y_w - expected[1] error_z = z_w - expected[2] error_dist = np.sqrt(error_x**2 + error_y**2 + error_z**2) - + self.get_logger().info( f'Validation point "{calib_point["name"]}" at pixel ({u},{v}): ' f'Calculated ({x_w:.3f}, {y_w:.3f}, {z_w:.3f}), ' @@ -489,7 +489,7 @@ def handle_pixel_to_real(self, req, resp): f'Error: dx={error_x:.3f}m, dy={error_y:.3f}m, dz={error_z:.3f}m, dist={error_dist:.3f}m' ) break - + self.get_logger().info(f'Pixel ({u},{v}) -> World ({x_w:.3f}m, {y_w:.3f}m, {z_w:.3f}m)') return resp @@ -508,3 +508,4 @@ def main(args=None): if __name__ == '__main__': main() + diff --git a/vision/pixel_to_real_world.py b/vision/pixel_to_real_world.py index a0be114..747b10f 100644 --- a/vision/pixel_to_real_world.py +++ b/vision/pixel_to_real_world.py @@ -8,6 +8,7 @@ from cv_bridge import CvBridge from image_geometry import PinholeCameraModel import numpy as np +import math class PixelToRealNode(Node): @@ -51,10 +52,12 @@ def __init__(self): # error = (x - x^, y - y^, z - z^) # t_base_cam_new = t_base_cam_old + error # This compensates for calibration errors in the camera position. + # Practically: put the silver tip of the tape measure where the center of the gripper is, read the distance to the center of the cube + # see what direction (- or +) the read is, add that # Benchmark: # LOW: (-0.0386, 0.5303, 0.5238) # HIGH: (-0.0361, 0.5303, 0.6458) - self.t_base_cam = np.array([-0.0361, 0.5303, 0.6458]) + self.t_base_cam = np.array([-0.146, 0.635, 0.8]) self.R_base_cam = np.array([ [1.0, 0.0, 0.0], @@ -62,7 +65,22 @@ def __init__(self): [0.0, 0.0, -1.0] ]) - self.get_logger().info("PixelToReal with TF ready.") + # ---- METHOD 2: Empirical linear model ---- + # Least-squares fit on 22 calibration points (5 measures × 2 colours + # × TR/BL), 2026-03-12. Model: x = a0 + a1*u + a2*v + # y = b0 + b1*u + b2*v + # 2D RMSE = 20.8 mm (x: 12.1 mm, y: 16.9 mm) + self._emp_ax = np.array([-0.56859693, +0.00130317, +0.00002114]) # [1,u,v]→x + self._emp_ay = np.array([+0.98011251, -0.00002728, -0.00133088]) # [1,u,v]→y + self._emp_z = 0 # table height (m) + + # ---- Hybrid blend parameters ---- + # Empirical is most accurate near image centre (calibration region). + # Gaussian decay controls how quickly we trust intrinsics at the edges. + self._image_center = (320, 240) # (u_c, v_c) + self._blend_sigma_frac = 0.6 # fraction of max diagonal for σ + + self.get_logger().info("PixelToReal with empirical + intrinsic hybrid ready.") # -------------------------------------------------- @@ -95,6 +113,49 @@ def get_robust_depth(self, u, v, window_size=5): # -------------------------------------------------- + def _intrinsic_estimate(self, u: int, v: int, Z: float) -> np.ndarray: + """METHOD 1: pinhole back-projection + rigid-body transform to base.""" + ray = self.cam_model.projectPixelTo3dRay((u, v)) + p_cam = np.array(ray) * Z + p_base = self.R_base_cam @ p_cam + self.t_base_cam + return p_base + + def _empirical_estimate(self, u: int, v: int) -> np.ndarray: + """METHOD 2: simple linear model fitted from calibration data. + + Least-squares fit on 22 calibration points, 2026-03-12: + x = -0.56859693 + 0.00130317*u + 0.00002114*v (RMS 12.1 mm) + y = 0.98011251 - 0.00002728*u - 0.00133088*v (RMS 16.9 mm) + 2D RMSE = 20.8 mm + """ + feat = np.array([1.0, float(u), float(v)]) + x = float(np.dot(self._emp_ax, feat)) + y = float(np.dot(self._emp_ay, feat)) + return np.array([x, y, self._emp_z]) + + def _hybrid_estimate( + self, + u: int, v: int, + p_intrinsic: np.ndarray, + p_empirical: np.ndarray, + ) -> tuple: + """HYBRID: Gaussian-weighted blend. + + Empirical weight is highest at the image centre (calibration region) + and decays towards the edges; intrinsic picks up the slack. + """ + uc, vc = self._image_center + dist = math.sqrt((u - uc) ** 2 + (v - vc) ** 2) + sigma = math.sqrt(uc ** 2 + vc ** 2) * self._blend_sigma_frac + + w_emp = math.exp(-dist / sigma) + w_int = 1.0 - w_emp + + p_hybrid = w_emp * p_empirical + w_int * p_intrinsic + return p_hybrid, w_emp, w_int + + # -------------------------------------------------- + def handle_pixel_to_real(self, request, response): if self.depth_image is None or not self.camera_ready: @@ -111,21 +172,26 @@ def handle_pixel_to_real(self, request, response): Z = float(depth_raw) * self.depth_scale - # Camera frame 3D - ray = self.cam_model.projectPixelTo3dRay((u, v)) + # METHOD 1: intrinsic back-projection + p_int = self._intrinsic_estimate(u, v, Z) - X_cam = ray[0] * Z - Y_cam = ray[1] * Z - Z_cam = Z + # METHOD 2: empirical linear model + p_emp = self._empirical_estimate(u, v) - p_cam = np.array([X_cam, Y_cam, Z_cam]) + # HYBRID: Gaussian-weighted blend + p_hybrid, w_emp, w_int = self._hybrid_estimate(u, v, p_int, p_emp) - # Transform to base frame - p_base = self.R_base_cam @ p_cam + self.t_base_cam + response.x = float(p_hybrid[0]) + response.y = float(p_hybrid[1]) + response.z = float(p_hybrid[2]) - response.x = float(p_base[0]) - response.y = float(p_base[1]) - response.z = float(p_base[2]) + self.get_logger().info( + f'Pixel ({u},{v}) depth={Z:.3f}m | ' + f'Intrinsic=[{p_int[0]:.4f},{p_int[1]:.4f},{p_int[2]:.4f}] ' + f'Empirical=[{p_emp[0]:.4f},{p_emp[1]:.4f}] ' + f'Hybrid=[{p_hybrid[0]:.4f},{p_hybrid[1]:.4f},{p_hybrid[2]:.4f}] ' + f'(w_emp={w_emp:.3f} w_int={w_int:.3f})' + ) return response @@ -139,4 +205,4 @@ def main(args=None): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/vision/simple_sam_detector.py b/vision/simple_sam_detector.py index 418fad7..ca05e1d 100755 --- a/vision/simple_sam_detector.py +++ b/vision/simple_sam_detector.py @@ -29,7 +29,7 @@ from rclpy.node import Node from rclpy.qos import QoSProfile, QoSReliabilityPolicy, QoSHistoryPolicy from custom_interfaces.msg import SAMDetections, SAMDetection -from custom_interfaces.srv import DetectObjects +from custom_interfaces.srv import DetectObjects, FindObjectAngle, PixelToReal from sensor_msgs.msg import Image from std_srvs.srv import Trigger from std_msgs.msg import Header @@ -165,7 +165,25 @@ def __init__(self, single_shot_mode=False): Trigger, '/vision/classify_bbox_filtered' ) - + + # Service client for GraspNet detection + self.grasp_client = self.create_client( + Trigger, + '/vision/detect_grasp' + ) + + # Service client for OBB angle detection + self.obb_client = self.create_client( + FindObjectAngle, + '/obb/find_object_angle' + ) + + # Service client for pixel-to-real conversion + self.pixel_to_real_client = self.create_client( + PixelToReal, + '/pixel_to_real' + ) + # OpenCV window setup self.window_name = f"SAM Object Detection - {self.rgb_topic}" cv2.namedWindow(self.window_name, cv2.WINDOW_NORMAL) @@ -240,11 +258,13 @@ def depth_callback(self, msg: Image): self.get_logger().error(f"Failed to convert depth image: {e}") def run_pipeline_callback(self, request, response): - """Service callback for /vision/run_pipeline - triggers detection and publishes to topic""" + """Service callback for /vision/run_pipeline - runs SAM+CLIP+GraspNet+OBB+Pixel-to-Real""" + import time + pipeline_start = time.perf_counter() + try: - # Use captured frame instead of latest_rgb for consistency frame_to_use = self.captured_frame if self.frame_captured else self.latest_rgb - + if frame_to_use is None: response.success = False response.message = json.dumps({ @@ -254,53 +274,207 @@ def run_pipeline_callback(self, request, response): }, indent=2) self.get_logger().warn("No image received yet") return response - + self.get_logger().info("=" * 80) - self.get_logger().info("Running SAM detection on captured frame...") - self.get_logger().info(f"Frame shape: {frame_to_use.shape}") + self.get_logger().info("Running full pipeline: SAM + CLIP + GraspNet + OBB + Pixel-to-Real") self.get_logger().info("=" * 80) - - # Run detection on captured frame (with IoU tracking from previous frame) + + # ── Step 1: SAM detection ────────────────────────────────────────── + sam_start = time.perf_counter() self.latest_detections = self._detect_objects(frame_to_use) - - # Store current detections as previous for next frame IoU calculation self.previous_detections = self.latest_detections.copy() - - # Build JSON response in the requested schema - detection_data = self._build_detection_schema() - - # Publish detections as ROS2 message self._publish_detections_ros() - - response.success = True - response.message = json.dumps(detection_data, indent=2) - - self.get_logger().info("=" * 80) - self.get_logger().info(f"Detection complete: {len(self.latest_detections)} objects found") - self.get_logger().info("=" * 80) - - # Print JSON output with bounding boxes - self.get_logger().info("JSON OUTPUT (with bounding boxes):") - self.get_logger().info("=" * 80) - self.get_logger().info(response.message) - self.get_logger().info("=" * 80) - - # Print detection details in readable format - self.get_logger().info("Bounding Boxes Summary:") - for i, det in enumerate(self.latest_detections): + sam_latency = time.perf_counter() - sam_start + self.get_logger().info(f"[1/5] SAM: {len(self.latest_detections)} objects in {sam_latency:.3f}s") + + # ── Step 2: CLIP classification ──────────────────────────────────── + time.sleep(0.3) # allow CLIP to process published detections + clip_start = time.perf_counter() + clip_classifications = {} + clip_success = False + if self.clip_filter_client.wait_for_service(timeout_sec=2.0): + try: + fut = self.clip_filter_client.call_async(Trigger.Request()) + rclpy.spin_until_future_complete(self, fut, timeout_sec=5.0) + clip_resp = fut.result() + if clip_resp and clip_resp.success: + clip_json = json.loads(clip_resp.message) + for region in clip_json.get('regions', []): + rid = region.get('region_id') + clip_classifications[rid] = { + 'label': region.get('label'), + 'confidence': float(region.get('confidence', 0.0)), + } + clip_success = True + except Exception as e: + self.get_logger().warn(f"CLIP call failed: {e}") + clip_latency = time.perf_counter() - clip_start + self.get_logger().info(f"[2/5] CLIP: {len(clip_classifications)} labelled in {clip_latency:.3f}s") + + # ── Step 3: GraspNet detection ───────────────────────────────────── + grasp_start = time.perf_counter() + grasp_map = {} + grasp_success = False + if self.grasp_client.wait_for_service(timeout_sec=2.0): + try: + fut = self.grasp_client.call_async(Trigger.Request()) + rclpy.spin_until_future_complete(self, fut, timeout_sec=10.0) + grasp_resp = fut.result() + if grasp_resp and grasp_resp.success: + grasp_json = json.loads(grasp_resp.message) + for g in grasp_json.get('grasps', []): + obj_id_str = g.get('object_id', '') + try: + oid = int(obj_id_str.split('_')[1]) if '_' in obj_id_str else int(obj_id_str) + except (ValueError, IndexError): + continue + if oid in grasp_map: + continue + pix = g.get('pixel_location', [0, 0]) + pos = g.get('position', {}) + grasp_map[oid] = { + 'pixel': {'u': int(pix[0]) if len(pix) > 0 else 0, + 'v': int(pix[1]) if len(pix) > 1 else 0}, + 'world': {'x': float(pos.get('x', 0.0)), + 'y': float(pos.get('y', 0.0)), + 'z': float(pos.get('z', 0.0))}, + 'quality_score': float(g.get('quality_score', 0.0)), + 'grasp_width': float(g.get('grasp_width', 0.0)), + 'approach_direction': g.get('approach_direction', ''), + } + grasp_success = True + except Exception as e: + self.get_logger().warn(f"GraspNet call failed: {e}") + grasp_latency = time.perf_counter() - grasp_start + self.get_logger().info(f"[3/5] GraspNet: {len(grasp_map)} grasps in {grasp_latency:.3f}s") + + # ── Step 4: OBB angle detection ──────────────────────────────────── + obb_start = time.perf_counter() + obb_map = {} + obb_success = False + if self.obb_client.wait_for_service(timeout_sec=2.0): + try: + fut = self.obb_client.call_async(FindObjectAngle.Request()) + rclpy.spin_until_future_complete(self, fut, timeout_sec=5.0) + obb_resp = fut.result() + if obb_resp and obb_resp.success: + n_obb = obb_resp.total_objects + bboxes_flat = list(obb_resp.bboxes) + for i in range(n_obb): + bx1 = bboxes_flat[i * 4] if len(bboxes_flat) > i * 4 else 0 + by1 = bboxes_flat[i * 4 + 1] if len(bboxes_flat) > i * 4 + 1 else 0 + bx2 = bboxes_flat[i * 4 + 2] if len(bboxes_flat) > i * 4 + 2 else 0 + by2 = bboxes_flat[i * 4 + 3] if len(bboxes_flat) > i * 4 + 3 else 0 + theta = float(obb_resp.thetas[i]) if i < len(obb_resp.thetas) else 0.0 + obb_entry = { + 'theta_rad': theta, + 'angle_deg': float(theta * 180.0 / 3.141592653589793), + 'width_px': float(obb_resp.widths[i]) if i < len(obb_resp.widths) else 0.0, + 'height_px': float(obb_resp.heights[i]) if i < len(obb_resp.heights) else 0.0, + 'center_u': float(obb_resp.centers_u[i]) if i < len(obb_resp.centers_u) else 0.0, + 'center_v': float(obb_resp.centers_v[i]) if i < len(obb_resp.centers_v) else 0.0, + } + # Match OBB result to SAM detection by IoU + obb_bbox = [bx1, by1, bx2, by2] + best_iou, best_idx = 0.0, i + for sam_idx, det in enumerate(self.latest_detections): + iou = self._calculate_iou(obb_bbox, det['bbox']) + if iou > best_iou: + best_iou, best_idx = iou, sam_idx + obb_map[best_idx] = obb_entry + obb_success = True + except Exception as e: + self.get_logger().warn(f"OBB call failed: {e}") + obb_latency = time.perf_counter() - obb_start + self.get_logger().info(f"[4/5] OBB: {len(obb_map)} objects in {obb_latency:.3f}s") + + # ── Step 5: Pixel-to-Real for each detected object center ────────── + p2r_start = time.perf_counter() + p2r_map = {} + p2r_success = False + if self.pixel_to_real_client.wait_for_service(timeout_sec=2.0): + p2r_success = True + for idx, det in enumerate(self.latest_detections): + cx, cy = det['center'] + try: + req = PixelToReal.Request() + req.u = int(cx) + req.v = int(cy) + fut = self.pixel_to_real_client.call_async(req) + rclpy.spin_until_future_complete(self, fut, timeout_sec=2.0) + p2r_resp = fut.result() + if p2r_resp: + p2r_map[idx] = { + 'u': int(cx), 'v': int(cy), + 'x': float(p2r_resp.x), + 'y': float(p2r_resp.y), + 'z': float(p2r_resp.z), + } + except Exception as e: + self.get_logger().warn(f"Pixel-to-real idx={idx} failed: {e}") + p2r_latency = time.perf_counter() - p2r_start + self.get_logger().info(f"[5/5] Pixel-to-Real: {len(p2r_map)} objects in {p2r_latency:.3f}s") + + # ── Build unified JSON response ──────────────────────────────────── + total_latency = time.perf_counter() - pipeline_start + objects_out = [] + for idx, det in enumerate(self.latest_detections): + clip_info = clip_classifications.get(idx, {}) + grasp_info = grasp_map.get(idx, {}) + obb_info = obb_map.get(idx, {}) + p2r_info = p2r_map.get(idx, {}) bbox = det['bbox'] - distance = det.get('distance_cm', 'N/A') - self.get_logger().info( - f" [{i}] {det['class_name']}: bbox={bbox}, " - f"confidence={det['confidence']:.2f}, distance={distance}" - ) + objects_out.append({ + 'object_id': det['id'], + 'label': clip_info.get('label', '') if clip_info else '', + 'bbox': {'x1': bbox[0], 'y1': bbox[1], 'x2': bbox[2], 'y2': bbox[3]}, + 'center': {'u': det['center'][0], 'v': det['center'][1]}, + 'sam_confidence': round(float(det['confidence']), 4), + 'clip_confidence': round(float(clip_info.get('confidence', 0.0)), 4) if clip_info else '', + 'iou_score': round(float(det.get('iou_score', 0.0)), 4), + 'is_stable': bool(det.get('is_stable', False)), + 'distance_cm': float(det['distance_cm']) if det.get('distance_cm') is not None else '', + 'world': p2r_info if p2r_info else {}, + 'has_grasp': bool(grasp_info), + 'grasp': grasp_info if grasp_info else {}, + 'obb': { + 'angle_deg': round(obb_info['angle_deg'], 2), + 'theta_rad': round(obb_info['theta_rad'], 4), + 'width_px': round(obb_info['width_px'], 1), + 'height_px': round(obb_info['height_px'], 1), + 'center_u': round(obb_info['center_u'], 1), + 'center_v': round(obb_info['center_v'], 1), + } if obb_info else {}, + }) + + pipeline_result = { + 'pipeline': 'run_pipeline', + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'total_objects': len(objects_out), + 'latency_s': round(total_latency, 3), + 'services': { + 'sam': {'success': True, 'latency_s': round(sam_latency, 3), 'total_detections': len(self.latest_detections)}, + 'clip': {'success': clip_success, 'latency_s': round(clip_latency, 3), 'filtered_regions': len(clip_classifications)}, + 'graspnet': {'success': grasp_success, 'latency_s': round(grasp_latency, 3), 'total_grasps': len(grasp_map)}, + 'obb': {'success': obb_success, 'latency_s': round(obb_latency, 3), 'total_objects': len(obb_map)}, + 'pixel_to_real': {'success': p2r_success, 'latency_s': round(p2r_latency, 3), 'total_converted': len(p2r_map)}, + }, + 'objects': objects_out, + } + + # Save to vision_runs_history.json (dashboard reads this) + self._save_run_pipeline_result( + pipeline_result, clip_classifications, grasp_map, obb_map, p2r_map, + sam_latency, clip_latency, grasp_latency, obb_latency + ) + + response.success = True + response.message = json.dumps(pipeline_result, indent=2) + self.get_logger().info("=" * 80) - - # Verify bounding boxes are in output - bbox_count = len([d for d in detection_data.get('detections', [{}])[0].get('detections', []) if 'bbox' in d]) - self.get_logger().info(f"Verified: {bbox_count} bounding boxes included in JSON output") + self.get_logger().info(f"Full pipeline done: {len(objects_out)} objects, {total_latency:.2f}s total") self.get_logger().info("=" * 80) - + except Exception as e: response.success = False response.message = json.dumps({ @@ -308,12 +482,119 @@ def run_pipeline_callback(self, request, response): "error": str(e), "timestamp": datetime.utcnow().isoformat() + "Z" }, indent=2) - self.get_logger().error(f"Detection error: {e}") + self.get_logger().error(f"Pipeline error: {e}") import traceback self.get_logger().error(traceback.format_exc()) - + return response + def _save_run_pipeline_result(self, pipeline_result, clip_classifications, grasp_map, obb_map, p2r_map, + sam_latency, clip_latency, grasp_latency, obb_latency): + """Save /vision/run_pipeline results to vision_runs_history.json for the dashboard.""" + try: + from pathlib import Path + package_path = Path(__file__).parent.parent + history_file = package_path / 'vision_runs_history.json' + + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + data = json.load(f) + if isinstance(data, list): + history = data + except Exception: + pass + + last_run_no = history[-1]['meta']['run_no'] if history else 0 + run_no = last_run_no + 1 + + num_dets = len(self.latest_detections) + iou_scores = [float(d.get('iou_score', 0.0)) for d in self.latest_detections] + is_stable_array = [bool(d.get('is_stable', False)) for d in self.latest_detections] + avg_sam_conf = (sum(float(d.get('confidence', 0.0)) for d in self.latest_detections) / num_dets + if num_dets > 0 else 0.0) + avg_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0 + stability_rate = (sum(1 for s in is_stable_array if s) / len(is_stable_array) + if is_stable_array else 0.0) + + objects = [] + for idx, det in enumerate(self.latest_detections): + clip_info = clip_classifications.get(idx, {}) + grasp_info = grasp_map.get(idx, {}) + obb_info = obb_map.get(idx, {}) + p2r_info = p2r_map.get(idx, {}) + bbox = det['bbox'] + objects.append({ + 'object_id': det['id'], + 'label': clip_info.get('label', '') if clip_info else '', + 'bbox_x1': bbox[0], + 'bbox_y1': bbox[1], + 'bbox_x2': bbox[2], + 'bbox_y2': bbox[3], + 'sam_confidence': round(float(det.get('confidence', 0.0)), 4), + 'clip_confidence': round(float(clip_info.get('confidence', 0.0)), 4) if clip_info else '', + 'distance_cm': float(det['distance_cm']) if det.get('distance_cm') is not None else '', + 'iou_score': iou_scores[idx], + 'is_stable': is_stable_array[idx], + 'has_grasp': bool(grasp_info), + 'grasp': grasp_info if grasp_info else {}, + 'world': p2r_info if p2r_info else {}, + 'obb_angle_deg': round(obb_info['angle_deg'], 2) if obb_info else '', + 'obb_theta_rad': round(obb_info['theta_rad'], 4) if obb_info else '', + 'obb_width_px': round(obb_info['width_px'], 1) if obb_info else '', + 'obb_height_px': round(obb_info['height_px'], 1) if obb_info else '', + 'obb_center_u': round(obb_info['center_u'], 1) if obb_info else '', + 'obb_center_v': round(obb_info['center_v'], 1) if obb_info else '', + }) + + run = { + 'meta': { + 'run_no': run_no, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'latency_s': pipeline_result['latency_s'], + 'source': 'run_pipeline', + }, + 'sam': { + 'success': True, + 'latency_s': round(sam_latency, 3), + 'total_detections': num_dets, + 'avg_confidence': round(avg_sam_conf, 4), + 'average_iou': round(avg_iou, 4), + 'stability_rate': round(stability_rate, 4), + }, + 'clip': { + 'success': bool(clip_classifications), + 'latency_s': round(clip_latency, 3), + 'filtered_regions': len(clip_classifications), + }, + 'scene': {'success': False, 'latency_s': 0.0}, + 'obb': { + 'success': bool(obb_map), + 'latency_s': round(obb_latency, 3), + 'total_objects': len(obb_map), + }, + 'graspnet': { + 'success': bool(grasp_map), + 'latency_s': round(grasp_latency, 3), + 'total_grasps': len(grasp_map), + }, + 'objects': objects, + 'relations': [], + 'grasps': list(grasp_map.values()), + } + + history.append(run) + history = history[-50:] # keep last 50 runs + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + + self.get_logger().info(f"Saved run #{run_no} to {history_file} ({num_dets} objects)") + + except Exception as e: + self.get_logger().warn(f"Failed to save run pipeline history: {e}") + def detect_objects_callback(self, request, response): """Service callback for /vision/detect_objects - returns detection results directly""" try: @@ -408,14 +689,17 @@ def detect_objects_callback(self, request, response): clip_info = clip_classifications.get(idx) if clip_info: - # Use CLIP label and confidence + # Confidence = CLIP softmax probability over candidate labels [0, 1] object_ids.append(f"{clip_info['label']}_{idx}") confidences.append(float(clip_info['confidence'])) self.get_logger().info(f" Region {idx}: {clip_info['label']} (CLIP confidence: {clip_info['confidence']:.2f})") else: - # Use SAM generic label + # No CLIP result — fall back to SAM shape score (circularity-based, + # range [0.50, 0.95]). This is NOT a semantic confidence; it only + # reflects contour regularity. object_ids.append(det['id']) confidences.append(float(det['confidence'])) + self.get_logger().debug(f" Region {idx}: no CLIP result, using SAM shape score={det['confidence']:.2f}") bbox = det['bbox'] bbox_x1.append(bbox[0]) @@ -485,7 +769,15 @@ def detect_objects_callback(self, request, response): f"conf={confidences[i]:.2f}, dist={distances_cm[i]:.1f}cm" ) self.get_logger().info("=" * 80) - + + # Save results to vision_runs_history.json for real-time dashboard + end_inner = time.perf_counter() + self._save_detect_objects_run( + object_ids, bbox_x1, bbox_y1, bbox_x2, bbox_y2, + confidences, distances_cm, iou_scores, is_stable_array, + clip_classifications, end_inner - start + ) + except Exception as e: response.success = False response.total_detections = 0 @@ -509,9 +801,103 @@ def detect_objects_callback(self, request, response): end = time.perf_counter() latency = end - start self.get_logger().info(f"Total detection latency: {latency:.6f} seconds") - + return response + def _save_detect_objects_run(self, object_ids, bbox_x1, bbox_y1, bbox_x2, bbox_y2, + confidences, distances_cm, iou_scores, is_stable_array, + clip_classifications, latency_s): + """Save /vision/detect_objects results to vision_runs_history.json for the dashboard.""" + try: + from pathlib import Path + + # Same path used by benchmark_dashboard and collect_and_export + package_path = Path(__file__).parent.parent + history_file = package_path / 'vision_runs_history.json' + + # Load existing history + history = [] + if history_file.exists(): + try: + with open(history_file, 'r') as f: + data = json.load(f) + if isinstance(data, list): + history = data + except Exception: + pass + + last_run_no = history[-1]['meta']['run_no'] if history else 0 + run_no = last_run_no + 1 + + num_dets = len(self.latest_detections) + total_sam_conf = 0.0 + objects = [] + for idx in range(num_dets): + det = self.latest_detections[idx] + clip_info = clip_classifications.get(idx, {}) + sam_conf = float(det.get('confidence', 0.0)) + total_sam_conf += sam_conf + objects.append({ + 'object_id': object_ids[idx] if idx < len(object_ids) else f'object_{idx}', + 'label': clip_info.get('label', '') if clip_info else '', + 'bbox_x1': bbox_x1[idx] if idx < len(bbox_x1) else 0, + 'bbox_y1': bbox_y1[idx] if idx < len(bbox_y1) else 0, + 'bbox_x2': bbox_x2[idx] if idx < len(bbox_x2) else 0, + 'bbox_y2': bbox_y2[idx] if idx < len(bbox_y2) else 0, + 'sam_confidence': round(sam_conf, 4), + 'clip_confidence': round(float(clip_info.get('confidence', 0.0)), 4) if clip_info else '', + 'distance_cm': distances_cm[idx] if idx < len(distances_cm) else '', + 'iou_score': iou_scores[idx] if idx < len(iou_scores) else '', + 'is_stable': is_stable_array[idx] if idx < len(is_stable_array) else '', + 'has_grasp': False, + 'grasp': {}, + 'obb_angle_deg': '', 'obb_theta_rad': '', + 'obb_width_px': '', 'obb_height_px': '', + 'obb_center_u': '', 'obb_center_v': '', + }) + + avg_sam_conf = total_sam_conf / num_dets if num_dets > 0 else 0.0 + avg_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0 + stability_rate = sum(1 for s in is_stable_array if s) / len(is_stable_array) if is_stable_array else 0.0 + + run = { + 'meta': { + 'run_no': run_no, + 'timestamp': datetime.utcnow().isoformat() + 'Z', + 'latency_s': round(latency_s, 3), + 'source': 'detect_objects', + }, + 'sam': { + 'success': True, + 'latency_s': round(latency_s, 3), + 'total_detections': num_dets, + 'avg_confidence': round(avg_sam_conf, 4), + 'average_iou': round(avg_iou, 4), + 'stability_rate': round(stability_rate, 4), + }, + 'clip': { + 'success': bool(clip_classifications), + 'latency_s': 0.0, + 'filtered_regions': len(clip_classifications), + }, + 'scene': {'success': False, 'latency_s': 0.0}, + 'obb': {'success': False, 'latency_s': 0.0}, + 'objects': objects, + 'relations': [], + 'grasps': [], + } + + history.append(run) + history = history[-50:] # keep last 50 runs + + with open(history_file, 'w') as f: + json.dump(history, f, indent=2) + + self.get_logger().info(f"Saved run #{run_no} to {history_file} ({num_dets} objects)") + + except Exception as e: + self.get_logger().warn(f"Failed to save run history: {e}") + def _startup_announce(self): """One-shot announce to make sure global topics appear after node startup.""" try: @@ -721,12 +1107,17 @@ def _detect_objects(self, rgb_image: np.ndarray) -> List[Dict]: mask = np.zeros((h, w), dtype=np.uint8) cv2.drawContours(mask, [contour], -1, 255, -1) - # Calculate confidence based on contour properties + # Shape-based segmentation score (NOT a model prediction probability). + # Circularity (4π·area / perimeter²) measures how "blob-like" the + # contour is: 1.0 = perfect circle, <1.0 = irregular shape. + # Score range: [0.50, 0.95]. This is used as a proxy for + # segmentation quality when no model IoU score is available. + # Downstream consumers should treat this as `sam_shape_score`, + # NOT as a semantic classification confidence. perimeter = cv2.arcLength(contour, True) circularity = 4 * np.pi * area / (perimeter * perimeter) if perimeter > 0 else 0 - confidence = min(0.95, 0.50 + circularity * 0.45) # More lenient baseline - - # Relaxed confidence threshold (was 0.4, now 0.3) + confidence = min(0.95, 0.50 + circularity * 0.45) + if confidence <= 0.3: continue @@ -988,157 +1379,132 @@ def _publish_detections_ros(self): def visualization_callback(self): """Display camera feed with detections in OpenCV window""" - # Use captured frame if available, otherwise latest_rgb frame_to_display = self.captured_frame if self.frame_captured else self.latest_rgb - + if frame_to_display is None: - # Show waiting message blank = np.zeros((480, 640, 3), dtype=np.uint8) - cv2.putText( - blank, - f"Waiting for {self.rgb_topic}...", - (100, 240), - cv2.FONT_HERSHEY_SIMPLEX, - 1.0, - (255, 255, 255), - 2 - ) + cv2.putText(blank, f"Waiting for {self.rgb_topic}...", (80, 240), + cv2.FONT_HERSHEY_SIMPLEX, 0.65, (160, 160, 160), 1) cv2.imshow(self.window_name, blank) cv2.waitKey(1) return - - # Create visualization image + vis_image = frame_to_display.copy() - - # Determine if we should show corner points (when detections < 3) - # For debug and finding u,v in image + + # Distinct color palette per object (BGR) + COLORS = [ + (0, 200, 255), # amber + (80, 255, 80), # lime + (255, 80, 80), # blue + (255, 0, 200), # magenta + (0, 230, 230), # yellow + (200, 80, 255), # violet + (0, 255, 180), # spring green + (255, 180, 0), # sky blue + ] + show_corner_points = len(self.latest_detections) < 5 - - # Draw detections + + # ── Pass 1: draw all masks first (underneath boxes) ────────────────── + mask_overlay = vis_image.copy() + for idx, det in enumerate(self.latest_detections): + color = COLORS[idx % len(COLORS)] + mask = det['mask'] + mask_overlay[mask > 0] = ( + int(color[0] * 0.55), + int(color[1] * 0.55), + int(color[2] * 0.55), + ) + vis_image = cv2.addWeighted(vis_image, 0.6, mask_overlay, 0.4, 0) + + # ── Pass 2: boxes, labels, debug corners ───────────────────────────── for idx, det in enumerate(self.latest_detections): bbox = det['bbox'] confidence = det['confidence'] distance = det.get('distance_cm') - obj_no = idx # Object number - - # Draw bounding box - cv2.rectangle( - vis_image, - (bbox[0], bbox[1]), - (bbox[2], bbox[3]), - (0, 255, 0), # Green - 2 - ) - - # Display 4 corner coordinates if detections < 3 - # Debug: Show corner coordinates when detections < 5 (for u,v verification) + color = COLORS[idx % len(COLORS)] + x1, y1, x2, y2 = bbox + + # Corner-bracket bounding box (professional look) + clen = max(14, int(min(x2 - x1, y2 - y1) * 0.15)) + lw = 2 + for (px, py, dx, dy) in [ + (x1, y1, 1, 1), (x2, y1, -1, 1), + (x1, y2, 1, -1), (x2, y2, -1, -1), + ]: + cv2.line(vis_image, (px, py), (px + dx * clen, py), color, lw) + cv2.line(vis_image, (px, py), (px, py + dy * clen), color, lw) + # Thin dim full rectangle + cv2.rectangle(vis_image, (x1, y1), (x2, y2), color, 1) + + # ── Debug corner coordinates ────────────────────────────────── if show_corner_points: - font_scale = 0.4 - font_thickness = 1 - text_color = (0, 255, 255) # Yellow text for visibility - bg_color = (0, 0, 0) # Black background - padding = 2 - - # Draw all 4 corners in multi-line format for better readability corners_text = [ - f"TL:({bbox[0]},{bbox[1]})", - f"TR:({bbox[2]},{bbox[1]})", - f"BL:({bbox[0]},{bbox[3]})", - f"BR:({bbox[2]},{bbox[3]})" + f"TL:({x1},{y1})", f"TR:({x2},{y1})", + f"BL:({x1},{y2})", f"BR:({x2},{y2})", ] - - # Position text block above bbox - start_y = max(bbox[1] - 80, 20) # Ensure it stays on screen - line_height = 18 - + start_y = max(y1 - 68, 26) for i, line_text in enumerate(corners_text): - text_size, _ = cv2.getTextSize(line_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness) - y_pos = start_y + (i * line_height) - - # Draw background - cv2.rectangle( - vis_image, - (bbox[0] - padding, y_pos - text_size[1] - padding), - (bbox[0] + text_size[0] + padding, y_pos + padding), - bg_color, - -1 - ) - - # Draw text - cv2.putText( - vis_image, - line_text, - (bbox[0], y_pos), - cv2.FONT_HERSHEY_SIMPLEX, - font_scale, - text_color, - font_thickness - ) - - # Draw filled mask with transparency - mask = det['mask'] - colored_mask = np.zeros_like(vis_image) - colored_mask[:, :] = (0, 255, 0) # Green overlay - vis_image = np.where( - mask[..., None] > 0, - cv2.addWeighted(vis_image, 0.7, colored_mask, 0.3, 0), - vis_image - ) - - # Draw label with object number and distance + ts, _ = cv2.getTextSize(line_text, cv2.FONT_HERSHEY_SIMPLEX, 0.33, 1) + ty = start_y + i * 16 + cv2.rectangle(vis_image, (x1 - 2, ty - ts[1] - 2), + (x1 + ts[0] + 2, ty + 2), (15, 15, 15), -1) + cv2.putText(vis_image, line_text, (x1, ty), + cv2.FONT_HERSHEY_SIMPLEX, 0.33, color, 1) + + # ── Label ───────────────────────────────────────────────────── if distance is not None: - label = f"#{obj_no} {det.get('class_name', det['id'])}: {confidence:.2f} ({distance:.1f}cm)" + label = f"#{idx} {det.get('class_name', det['id'])} {confidence:.2f} {distance:.1f}cm" else: - label = f"#{obj_no} {det.get('class_name', det['id'])}: {confidence:.2f}" - - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2) - - # Label background - cv2.rectangle( - vis_image, - (bbox[0], bbox[1] - label_size[1] - 10), - (bbox[0] + label_size[0], bbox[1]), - (0, 255, 0), - -1 - ) - - # Label text - cv2.putText( - vis_image, - label, - (bbox[0], bbox[1] - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 0, 0), # Black text - 2 - ) - - # Add info overlay + label = f"#{idx} {det.get('class_name', det['id'])} {confidence:.2f}" + + fs, ft = 0.38, 1 + (lw_px, lh_px), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, fs, ft) + pad = 4 + accent = 3 # color accent bar width + + # Place label above bbox; clamp to image top + ly = y1 - pad - 2 + if ly - lh_px - pad < 0: + ly = y1 + lh_px + pad + 2 + + bx1 = x1 + bx2 = x1 + accent + pad + lw_px + pad + by1 = ly - lh_px - pad + by2 = ly + pad + + # Clamp to image bounds + bx1 = max(0, bx1); by1 = max(0, by1) + bx2 = min(vis_image.shape[1] - 1, bx2) + by2 = min(vis_image.shape[0] - 1, by2) + + # Semi-transparent dark background + roi = vis_image[by1:by2, bx1:bx2] + if roi.size > 0: + dark = np.full_like(roi, (18, 18, 18)) + vis_image[by1:by2, bx1:bx2] = cv2.addWeighted(roi, 0.25, dark, 0.75, 0) + + # Color accent bar on left + cv2.rectangle(vis_image, (bx1, by1), (bx1 + accent, by2), color, -1) + + # White label text + cv2.putText(vis_image, label, (bx1 + accent + pad, ly), + cv2.FONT_HERSHEY_SIMPLEX, fs, (240, 240, 240), ft) + + # ── Top info bar ───────────────────────────────────────────────────── + h, w = vis_image.shape[:2] + bar_h = 22 mode_text = "CONTINUOUS" if self.continuous_detection else "SINGLE SHOT" - corner_indicator = " [CORNER POINTS ON]" if show_corner_points else "" - info_text = f"Mode: {mode_text} | Objects: {len(self.latest_detections)}{corner_indicator}" - - cv2.putText( - vis_image, - info_text, - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (255, 255, 255), # White - 2 - ) - - cv2.putText( - vis_image, - info_text, - (10, 30), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 0), # Black outline - 4 - ) - - # Show image + obj_count = len(self.latest_detections) + debug_tag = " [DEBUG]" if (show_corner_points and obj_count > 0) else "" + info_text = f"Mode: {mode_text} | Objects detected: {obj_count}{debug_tag}" + + roi = vis_image[0:bar_h, 0:w] + dark_bar = np.full_like(roi, (12, 12, 12)) + vis_image[0:bar_h, 0:w] = cv2.addWeighted(roi, 0.25, dark_bar, 0.75, 0) + cv2.putText(vis_image, info_text, (8, 15), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (210, 210, 210), 1) + cv2.imshow(self.window_name, vis_image) cv2.waitKey(1) diff --git a/vision/unified_pipeline.py b/vision/unified_pipeline.py index e47cefa..f3a9111 100644 --- a/vision/unified_pipeline.py +++ b/vision/unified_pipeline.py @@ -287,7 +287,10 @@ def _call_clip_classification(self) -> Dict[int, Dict]: clip_map[region_id] = { "label": region.get('label'), "confidence": float(region.get('confidence', 0.0)), - "is_top1_accurate": region.get('confidence', 0.0) >= 0.5 # Top-1 if confidence >= 0.5 + # confidence here is a softmax probability over candidate labels. + # >= 0.5 means the model assigns more probability to this label + # than to all other candidates combined — a high-precision bar. + "is_top1_accurate": region.get('confidence', 0.0) >= 0.5 } return clip_map diff --git a/vision_scripts/collect_and_export.py b/vision_scripts/collect_and_export.py new file mode 100644 index 0000000..35ae942 --- /dev/null +++ b/vision_scripts/collect_and_export.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 +""" +Vision Pipeline Data Collector & Excel Exporter + +Calls all 4 vision services in sequence, stores the last 20 runs in a JSON +history file, and exports every run to an Excel workbook (4 sheets). + +Services called (in order): + 1. /vision/run_pipeline (std_srvs/Trigger) -> SAM detects + publishes + 2. /vision/classify_bbox_filtered (std_srvs/Trigger) -> CLIP labels per region + 3. /vision/understand_scene (std_srvs/Trigger) -> objects + relations + grasps + +Output files (written to workspace root next to README.md): + vision_runs_history.json — rolling last-20-runs database + vision_runs_export.xlsx — Excel workbook, 4 sheets: + • Runs — one row per pipeline run + • Objects — one row per detected object + • Relations — one row per spatial relation + • Grasps — one row per grasp pose + +Usage: + # Make sure the vision nodes are already running, then: + python3 vision_scripts/collect_and_export.py + + # Or as a ROS2 entry point (after colcon build): + ros2 run vision collect_and_export +""" + +import math +import rclpy +from rclpy.node import Node +from std_srvs.srv import Trigger +import json +import time +import os +import sys +from datetime import datetime +from pathlib import Path + +# OBB service uses custom_interfaces (optional — graceful skip if not built) +try: + from custom_interfaces.srv import FindObjectAngle + _OBB_AVAILABLE = True +except ImportError: + _OBB_AVAILABLE = False + +# --------------------------------------------------------------------------- +# Output file paths (workspace root = parent of this script's package dir) +# --------------------------------------------------------------------------- +_SCRIPT_DIR = Path(__file__).resolve().parent +_WORKSPACE_ROOT = _SCRIPT_DIR.parent +HISTORY_FILE = _WORKSPACE_ROOT / "vision_runs_history.json" +EXCEL_FILE = _WORKSPACE_ROOT / "vision_runs_export.xlsx" +MAX_HISTORY = 20 + + +# --------------------------------------------------------------------------- +# Helper: load / save history +# --------------------------------------------------------------------------- + +def _load_history(): + if HISTORY_FILE.exists(): + try: + with open(HISTORY_FILE, "r") as f: + data = json.load(f) + if isinstance(data, list): + return data + except Exception: + pass + return [] + + +def _save_history(runs): + runs = runs[-MAX_HISTORY:] # keep last 20 only + with open(HISTORY_FILE, "w") as f: + json.dump(runs, f, indent=2) + + +# --------------------------------------------------------------------------- +# Helper: export to Excel using openpyxl +# --------------------------------------------------------------------------- + +def _export_excel(runs): + """Write runs list → Excel with 4 sheets.""" + try: + from openpyxl import Workbook + from openpyxl.styles import PatternFill, Font, Alignment + from openpyxl.utils import get_column_letter + except ImportError: + print("[WARN] openpyxl not installed. Run: pip install openpyxl") + print("[WARN] Skipping Excel export. JSON history saved successfully.") + return + + wb = Workbook() + + # ---- colour scheme ---- + header_fill = PatternFill("solid", fgColor="4F81BD") + header_font = Font(bold=True, color="FFFFFF") + alt_fill = PatternFill("solid", fgColor="DCE6F1") + + def _write_sheet(ws, headers, rows, colour_col=None): + """Write header row then data rows with alternating row colour.""" + ws.append(headers) + for cell in ws[1]: + cell.fill = header_fill + cell.font = header_font + cell.alignment = Alignment(horizontal="center") + for row_idx, row in enumerate(rows, start=2): + ws.append(row) + if row_idx % 2 == 0: + for cell in ws[row_idx]: + cell.fill = alt_fill + # Auto-width + for col_idx, header in enumerate(headers, start=1): + col_letter = get_column_letter(col_idx) + max_len = len(str(header)) + for row in ws.iter_rows(min_col=col_idx, max_col=col_idx): + for cell in row: + try: + max_len = max(max_len, len(str(cell.value or ""))) + except Exception: + pass + ws.column_dimensions[col_letter].width = min(max_len + 4, 40) + + # ---- Sheet 1: Runs ---- + ws_runs = wb.active + ws_runs.title = "Runs" + runs_headers = [ + "Run No", "Timestamp", "SAM Total Objects", "CLIP Filtered Regions", + "Graspable Objects", "Total Relations", "Avg SAM Confidence", + "Avg IoU", "Stability Rate (%)", "Scene Description", + "SAM Success", "CLIP Success", "Scene Success", "OBB Success", + "Total Latency (s)", "SAM Latency (s)", "CLIP Latency (s)", + "Scene Latency (s)", "OBB Latency (s)" + ] + runs_rows = [] + for run in runs: + meta = run.get("meta", {}) + sam = run.get("sam", {}) + clip = run.get("clip", {}) + scene = run.get("scene", {}) + obb = run.get("obb", {}) + runs_rows.append([ + meta.get("run_no", ""), + meta.get("timestamp", ""), + sam.get("total_detections", ""), + clip.get("filtered_regions", ""), + scene.get("graspable_objects", ""), + scene.get("total_relations", ""), + sam.get("avg_confidence", ""), + sam.get("average_iou", ""), + round(sam.get("stability_rate", 0) * 100, 1), + scene.get("scene_description", ""), + sam.get("success", ""), + clip.get("success", ""), + scene.get("success", ""), + obb.get("success", ""), + meta.get("latency_s", ""), + sam.get("latency_s", ""), + clip.get("latency_s", ""), + scene.get("latency_s", ""), + obb.get("latency_s", ""), + ]) + _write_sheet(ws_runs, runs_headers, runs_rows) + + # ---- Sheet 2: Objects ---- + ws_obj = wb.create_sheet("Objects") + obj_headers = [ + "Run No", "Timestamp", "Object ID", "CLIP Label", + "BBox X1", "BBox Y1", "BBox X2", "BBox Y2", + "SAM Confidence", "CLIP Confidence", + "Distance (cm)", "IoU Score", "Is Stable", + "Has Grasp", "Grasp Quality", "Grasp Width (m)", + # OBB columns + "OBB Angle (deg)", "OBB Theta (rad)", + "OBB Width (px)", "OBB Height (px)", + "OBB Center U", "OBB Center V", + ] + obj_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + for obj in run.get("objects", []): + grasp = obj.get("grasp", {}) + obj_rows.append([ + run_no, ts, + obj.get("object_id", ""), + obj.get("label", ""), + obj.get("bbox_x1", ""), obj.get("bbox_y1", ""), + obj.get("bbox_x2", ""), obj.get("bbox_y2", ""), + obj.get("sam_confidence", ""), + obj.get("clip_confidence", ""), + obj.get("distance_cm", ""), + obj.get("iou_score", ""), + obj.get("is_stable", ""), + obj.get("has_grasp", ""), + grasp.get("quality_score", ""), + grasp.get("width_m", ""), + # OBB + obj.get("obb_angle_deg", ""), + obj.get("obb_theta_rad", ""), + obj.get("obb_width_px", ""), + obj.get("obb_height_px", ""), + obj.get("obb_center_u", ""), + obj.get("obb_center_v", ""), + ]) + _write_sheet(ws_obj, obj_headers, obj_rows) + + # ---- Sheet 3: Relations ---- + ws_rel = wb.create_sheet("Relations") + rel_headers = [ + "Run No", "Timestamp", "Scene ID", + "Subject", "Relation", "Target Object", + "Confidence", "Distance 2D", "Description" + ] + rel_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + scene = run.get("scene", {}) + scene_id = scene.get("scene_id", "") + for rel in run.get("relations", []): + rel_rows.append([ + run_no, ts, scene_id, + rel.get("subject", ""), + rel.get("relation", ""), + rel.get("target_object", ""), + rel.get("confidence", ""), + rel.get("distance_2d", ""), + rel.get("description", ""), + ]) + _write_sheet(ws_rel, rel_headers, rel_rows) + + # ---- Sheet 4: Grasps ---- + ws_grasp = wb.create_sheet("Grasps") + grasp_headers = [ + "Run No", "Timestamp", + "Object ID", "Pos X (m)", "Pos Y (m)", "Pos Z (m)", + "Orient X", "Orient Y", "Orient Z", "Orient W", + "Quality Score", "Grasp Width (m)", "Approach Direction" + ] + grasp_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + for g in run.get("grasps", []): + grasp_rows.append([ + run_no, ts, + g.get("object_id", ""), + g.get("pos_x", ""), g.get("pos_y", ""), g.get("pos_z", ""), + g.get("orient_x", ""), g.get("orient_y", ""), + g.get("orient_z", ""), g.get("orient_w", ""), + g.get("quality_score", ""), + g.get("width_m", ""), + g.get("approach_direction", ""), + ]) + _write_sheet(ws_grasp, grasp_headers, grasp_rows) + + # ---- Sheet 5: OBB Angles ---- + ws_obb = wb.create_sheet("OBB Angles") + obb_headers = [ + "Run No", "Timestamp", + "Object ID", "CLIP Label", + "OBB Angle (deg)", "OBB Theta (rad)", + "OBB Width (px)", "OBB Height (px)", + "OBB Center U", "OBB Center V", + "BBox X1", "BBox Y1", "BBox X2", "BBox Y2", + "SAM Confidence", "Distance (cm)", + "OBB Latency (s)", + ] + obb_rows = [] + for run in runs: + meta = run.get("meta", {}) + run_no = meta.get("run_no", "") + ts = meta.get("timestamp", "") + obb_lat = run.get("obb", {}).get("latency_s", "") + for obj in run.get("objects", []): + if obj.get("obb_angle_deg", "") == "": + continue # skip objects where OBB wasn't available + obb_rows.append([ + run_no, ts, + obj.get("object_id", ""), + obj.get("label", ""), + obj.get("obb_angle_deg", ""), + obj.get("obb_theta_rad", ""), + obj.get("obb_width_px", ""), + obj.get("obb_height_px", ""), + obj.get("obb_center_u", ""), + obj.get("obb_center_v", ""), + obj.get("bbox_x1", ""), obj.get("bbox_y1", ""), + obj.get("bbox_x2", ""), obj.get("bbox_y2", ""), + obj.get("sam_confidence", ""), + obj.get("distance_cm", ""), + obb_lat, + ]) + _write_sheet(ws_obb, obb_headers, obb_rows) + + wb.save(str(EXCEL_FILE)) + print(f"[OK] Excel exported → {EXCEL_FILE}") + + +# --------------------------------------------------------------------------- +# ROS2 collector node +# --------------------------------------------------------------------------- + +class VisionDataCollector(Node): + + def __init__(self): + super().__init__("vision_data_collector") + + self._sam_client = self.create_client(Trigger, "/vision/run_pipeline") + self._clip_client = self.create_client(Trigger, "/vision/classify_bbox_filtered") + self._scene_client = self.create_client(Trigger, "/vision/understand_scene") + + # OBB client — only if custom_interfaces is built + if _OBB_AVAILABLE: + self._obb_client = self.create_client(FindObjectAngle, "/obb/find_object_angle") + else: + self._obb_client = None + self.get_logger().warn("custom_interfaces not found — OBB step will be skipped") + + # ------------------------------------------------------------------ + # Low-level call helper + # ------------------------------------------------------------------ + + def _call(self, client, service_name, timeout=10.0): + """Wait for service, call it, return (success, message_str).""" + self.get_logger().info(f"Waiting for {service_name} ...") + if not client.wait_for_service(timeout_sec=5.0): + self.get_logger().warn(f"{service_name} not available — skipping") + return False, None + + future = client.call_async(Trigger.Request()) + start = time.time() + while not future.done(): + rclpy.spin_once(self, timeout_sec=0.05) + if time.time() - start > timeout: + self.get_logger().error(f"{service_name} call timed out") + return False, None + + result = future.result() + if result is None: + return False, None + return result.success, result.message + + def _call_obb(self, timeout=10.0): + """Call /obb/find_object_angle (FindObjectAngle service). Returns response or None.""" + if self._obb_client is None: + return None + service_name = "/obb/find_object_angle" + self.get_logger().info(f"Waiting for {service_name} ...") + if not self._obb_client.wait_for_service(timeout_sec=5.0): + self.get_logger().warn(f"{service_name} not available — skipping OBB step") + return None + future = self._obb_client.call_async(FindObjectAngle.Request()) + start = time.time() + while not future.done(): + rclpy.spin_once(self, timeout_sec=0.05) + if time.time() - start > timeout: + self.get_logger().error(f"{service_name} call timed out") + return None + return future.result() + + # ------------------------------------------------------------------ + # Collect one run + # ------------------------------------------------------------------ + + def collect_run(self, run_no): + """Call all services and return a unified run dict.""" + ts = datetime.utcnow().isoformat() + "Z" + t0 = time.perf_counter() + + run = { + "meta": {"run_no": run_no, "timestamp": ts}, + "sam": {"success": False, "latency_s": 0.0}, + "clip": {"success": False, "latency_s": 0.0}, + "scene": {"success": False, "latency_s": 0.0}, + "obb": {"success": False, "total_objects": 0, "latency_s": 0.0}, + "objects": [], + "relations": [], + "grasps": [], + } + + # ---- 1. SAM ---- + self.get_logger().info("=" * 60) + self.get_logger().info(f"RUN #{run_no} — Step 1: SAM /vision/run_pipeline") + t_sam = time.perf_counter() + sam_ok, sam_msg = self._call(self._sam_client, "/vision/run_pipeline") + run["sam"]["latency_s"] = round(time.perf_counter() - t_sam, 3) + if sam_ok and sam_msg: + try: + sam_data = json.loads(sam_msg) + summary = sam_data.get("summary", {}) + metrics = sam_data.get("metrics", {}) + coco = metrics.get("coco_ap_style", {}) + circ = metrics.get("circularity_confidence", {}) + + run["sam"] = { + "success": True, + "latency_s": run["sam"]["latency_s"], + "total_detections": summary.get("total_detections", 0), + "avg_confidence": circ.get("average_confidence", 0.0), + "average_iou": coco.get("average_iou", 0.0), + "stability_rate": coco.get("stability_rate", 0.0), + } + + # Parse per-object data from SAM response + for frame in sam_data.get("detections", []): + for det in frame.get("detections", []): + bbox = det.get("bbox", [0, 0, 0, 0]) + run["objects"].append({ + "object_id": det.get("class_name", "object"), + "label": "", # filled by CLIP below + "bbox_x1": bbox[0] if len(bbox) > 0 else "", + "bbox_y1": bbox[1] if len(bbox) > 1 else "", + "bbox_x2": bbox[2] if len(bbox) > 2 else "", + "bbox_y2": bbox[3] if len(bbox) > 3 else "", + "sam_confidence": det.get("confidence", ""), + "clip_confidence": "", + "distance_cm": det.get("distance_cm", ""), + "iou_score": det.get("iou_with_previous", ""), + "is_stable": det.get("is_stable_detection", ""), + "has_grasp": False, + "grasp": {}, + }) + self.get_logger().info(f" SAM: {run['sam']['total_detections']} objects detected") + except Exception as e: + self.get_logger().error(f" SAM parse error: {e}") + + # Wait 500 ms for CLIP to auto-process the published SAM topic + time.sleep(0.5) + + # ---- 2. CLIP ---- + self.get_logger().info(f"RUN #{run_no} — Step 2: CLIP /vision/classify_bbox_filtered") + t_clip = time.perf_counter() + clip_ok, clip_msg = self._call(self._clip_client, "/vision/classify_bbox_filtered") + run["clip"]["latency_s"] = round(time.perf_counter() - t_clip, 3) + if clip_ok and clip_msg: + try: + clip_data = json.loads(clip_msg) + run["clip"] = { + "success": True, + "latency_s": run["clip"]["latency_s"], + "total_sam_regions": clip_data.get("total_sam_regions", 0), + "filtered_regions": clip_data.get("filtered_regions", 0), + } + # Merge CLIP labels into objects by region_id index + clip_regions = {r["region_id"]: r for r in clip_data.get("regions", [])} + for idx, obj in enumerate(run["objects"]): + region = clip_regions.get(idx) + if region: + obj["label"] = region.get("label", "") + obj["clip_confidence"] = region.get("confidence", "") + self.get_logger().info(f" CLIP: {run['clip']['filtered_regions']} regions classified") + except Exception as e: + self.get_logger().error(f" CLIP parse error: {e}") + + # ---- 3. Scene Understanding ---- + self.get_logger().info(f"RUN #{run_no} — Step 3: Scene /vision/understand_scene") + t_scene = time.perf_counter() + scene_ok, scene_msg = self._call(self._scene_client, "/vision/understand_scene", timeout=15.0) + run["scene"]["latency_s"] = round(time.perf_counter() - t_scene, 3) + if scene_ok and scene_msg: + try: + scene_data = json.loads(scene_msg) + run["scene"] = { + "success": True, + "latency_s": run["scene"]["latency_s"], + "scene_id": scene_data.get("scene_id", ""), + "total_objects": scene_data.get("total_objects", 0), + "total_relations": scene_data.get("total_relations", 0), + "graspable_objects": scene_data.get("graspable_objects", 0), + "scene_description": scene_data.get("scene_description", ""), + } + + # Per-object details (enriches existing objects list) + scene_objects = scene_data.get("objects", {}) + for obj_id, obj_info in scene_objects.items(): + # Try to match to an existing object entry by bounding box proximity + # or append a new entry if scene understanding found it + matched = False + s_bbox = obj_info.get("bbox", []) + for obj in run["objects"]: + if (len(s_bbox) >= 4 and + obj.get("bbox_x1") == s_bbox[0] and + obj.get("bbox_y1") == s_bbox[1]): + # Update with scene info + obj["label"] = obj.get("label") or obj_info.get("label", "") + obj["has_grasp"] = obj_info.get("has_grasp", False) + if obj_info.get("has_grasp") and obj_info.get("grasp_quality") is not None: + obj["grasp"] = { + "quality_score": obj_info.get("grasp_quality"), + "width_m": "", # not available at this level + } + matched = True + break + + if not matched: + s_bbox_safe = s_bbox if len(s_bbox) >= 4 else ["", "", "", ""] + run["objects"].append({ + "object_id": obj_id, + "label": obj_info.get("label", ""), + "bbox_x1": s_bbox_safe[0], + "bbox_y1": s_bbox_safe[1], + "bbox_x2": s_bbox_safe[2], + "bbox_y2": s_bbox_safe[3], + "sam_confidence": "", + "clip_confidence": obj_info.get("confidence", ""), + "distance_cm": obj_info.get("distance_cm", ""), + "iou_score": "", + "is_stable": "", + "has_grasp": obj_info.get("has_grasp", False), + "grasp": { + "quality_score": obj_info.get("grasp_quality", ""), + "width_m": "", + }, + }) + + # Relations for this object + for rel in obj_info.get("relations", []): + run["relations"].append({ + "subject": obj_id, + "relation": rel.get("relation", ""), + "target_object": rel.get("target_object", ""), + "confidence": rel.get("confidence", ""), + "distance_2d": rel.get("distance_2d", ""), + "description": rel.get("description", ""), + }) + + self.get_logger().info( + f" Scene: {run['scene']['total_objects']} objects, " + f"{run['scene']['total_relations']} relations" + ) + except Exception as e: + self.get_logger().error(f" Scene parse error: {e}") + + # ---- 4. OBB Angle Benchmark ---- + self.get_logger().info(f"RUN #{run_no} — Step 4: OBB /obb/find_object_angle") + t_obb = time.perf_counter() + obb_resp = self._call_obb() + run["obb"]["latency_s"] = round(time.perf_counter() - t_obb, 3) + + if obb_resp is not None and obb_resp.success: + run["obb"]["success"] = True + run["obb"]["total_objects"] = obb_resp.total_objects + + # Build a lookup: object_id -> OBB data + obb_by_id = {} + for i, oid in enumerate(obb_resp.object_ids): + theta_rad = obb_resp.thetas[i] + # The service already stores the remapped angle (90 - geom_deg). + # angle_deg is directly the display angle (0° = vertical). + angle_deg = math.degrees(theta_rad) + obb_by_id[oid] = { + "obb_center_u": obb_resp.centers_u[i], + "obb_center_v": obb_resp.centers_v[i], + "obb_theta_rad": round(theta_rad, 5), + "obb_angle_deg": round(angle_deg, 2), + "obb_width_px": round(obb_resp.widths[i], 2), + "obb_height_px": round(obb_resp.heights[i], 2), + } + + # Merge into existing objects list (match by object_id) + for obj in run["objects"]: + obb = obb_by_id.get(obj.get("object_id", "")) + if obb: + obj.update(obb) + else: + # Initialise missing OBB fields so Excel has consistent columns + obj.setdefault("obb_center_u", "") + obj.setdefault("obb_center_v", "") + obj.setdefault("obb_theta_rad", "") + obj.setdefault("obb_angle_deg", "") + obj.setdefault("obb_width_px", "") + obj.setdefault("obb_height_px", "") + + self.get_logger().info( + f" OBB: {obb_resp.total_objects} objects, " + f"latency={run['obb']['latency_s']}s" + ) + else: + # Ensure OBB keys exist even if service was unavailable + for obj in run["objects"]: + for k in ("obb_center_u","obb_center_v","obb_theta_rad","obb_angle_deg","obb_width_px","obb_height_px"): + obj.setdefault(k, "") + self.get_logger().warn(" OBB: service unavailable or returned failure") + + # ---- Finalize ---- + run["meta"]["latency_s"] = round(time.perf_counter() - t0, 3) + self.get_logger().info( + f"RUN #{run_no} complete in {run['meta']['latency_s']}s — " + f"{len(run['objects'])} objects, {len(run['relations'])} relations" + ) + self.get_logger().info("=" * 60) + return run + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main(args=None): + rclpy.init(args=args) + + node = VisionDataCollector() + + print() + print("=" * 60) + print(" Vision Pipeline Data Collector & Excel Exporter") + print("=" * 60) + print(f" History file : {HISTORY_FILE}") + print(f" Excel file : {EXCEL_FILE}") + print(f" Max runs kept: {MAX_HISTORY}") + print("=" * 60) + + try: + # Determine the next run number from existing history + history = _load_history() + last_run_no = history[-1]["meta"]["run_no"] if history else 0 + run_no = last_run_no + 1 + + # Collect the run + run = node.collect_run(run_no) + + # Append to history and persist + history.append(run) + _save_history(history) + print(f"[OK] History saved → {HISTORY_FILE} ({len(history[-MAX_HISTORY:])} runs stored)") + + # Export all stored runs to Excel + _export_excel(history[-MAX_HISTORY:]) + + print() + print("Done. Open vision_runs_export.xlsx to view results.") + print() + + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + if rclpy.ok(): + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/vision_scripts/test_dashboard_data_flow.py b/vision_scripts/test_dashboard_data_flow.py new file mode 100644 index 0000000..dfc754b --- /dev/null +++ b/vision_scripts/test_dashboard_data_flow.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +""" +Unit test: verify detect_objects → vision_runs_history.json → dashboard data flow. + +Runs WITHOUT ROS2. Tests: + 1. JSON file is written to the correct path + 2. JSON schema matches exactly what dashboard HTML expects + 3. /api/data live path (benchmark_dashboard SAM topic callback schema) + 4. /api/run-history path (run history table schema) + +Usage: + python3 vision_scripts/test_dashboard_data_flow.py +""" + +import json +import os +import sys +import tempfile +import traceback +from pathlib import Path +from datetime import datetime + +# ── Path setup ──────────────────────────────────────────────────────────────── +SCRIPT_DIR = Path(__file__).resolve().parent +PACKAGE_ROOT = SCRIPT_DIR.parent # src/vision/ +HISTORY_FILE = PACKAGE_ROOT / "vision_runs_history.json" + +PASS = "\033[92m[PASS]\033[0m" +FAIL = "\033[91m[FAIL]\033[0m" +INFO = "\033[94m[INFO]\033[0m" + +failures = [] + +def check(name, condition, detail=""): + if condition: + print(f" {PASS} {name}") + else: + print(f" {FAIL} {name}" + (f" — {detail}" if detail else "")) + failures.append(name) + + +# ── Fake detection data (mirrors what detect_objects_callback produces) ──────── + +FAKE_DETECTIONS = [ + { + "id": "object_0", + "class_name": "object_0", + "confidence": 0.82, + "bbox": [100, 150, 300, 400], + "center": [200, 275], + "area": 45000, + "distance_cm": 55.3, + "iou_score": 0.71, + "is_stable": True, + "matched_prev_id": "", + "mask": None, # would be np array in real code + }, + { + "id": "object_1", + "class_name": "object_1", + "confidence": 0.64, + "bbox": [400, 200, 580, 450], + "center": [490, 325], + "area": 34200, + "distance_cm": 72.1, + "iou_score": 0.0, + "is_stable": False, + "matched_prev_id": "", + "mask": None, + }, +] + +FAKE_CLIP = { + 0: {"label": "cup", "confidence": 0.91, "bbox": [100, 150, 300, 400]}, + 1: {"label": "bottle", "confidence": 0.78, "bbox": [400, 200, 580, 450]}, +} + + +# ── Reproduce _save_detect_objects_run logic verbatim ───────────────────────── + +def simulate_save(latest_detections, clip_classifications, latency_s, history_file): + """Exact copy of _save_detect_objects_run from simple_sam_detector.py.""" + history = [] + if history_file.exists(): + try: + with open(history_file, "r") as f: + data = json.load(f) + if isinstance(data, list): + history = data + except Exception: + pass + + last_run_no = history[-1]["meta"]["run_no"] if history else 0 + run_no = last_run_no + 1 + + # Build parallel arrays (same logic as detect_objects_callback) + object_ids = [] + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = [], [], [], [] + confidences = [] + distances_cm = [] + iou_scores = [] + is_stable_arr = [] + + for idx, det in enumerate(latest_detections): + clip_info = clip_classifications.get(idx) + if clip_info: + object_ids.append(f"{clip_info['label']}_{idx}") + confidences.append(float(clip_info["confidence"])) + else: + object_ids.append(det["id"]) + confidences.append(float(det["confidence"])) + + bbox = det["bbox"] + bbox_x1.append(bbox[0]); bbox_y1.append(bbox[1]) + bbox_x2.append(bbox[2]); bbox_y2.append(bbox[3]) + distances_cm.append(float(det.get("distance_cm", -1.0))) + iou_scores.append(float(det.get("iou_score", 0.0))) + is_stable_arr.append(bool(det.get("is_stable", False))) + + num_dets = len(latest_detections) + total_sam_conf = 0.0 + objects = [] + for idx in range(num_dets): + det = latest_detections[idx] + clip_info = clip_classifications.get(idx, {}) + sam_conf = float(det.get("confidence", 0.0)) + total_sam_conf += sam_conf + objects.append({ + "object_id": object_ids[idx] if idx < len(object_ids) else f"object_{idx}", + "label": clip_info.get("label", "") if clip_info else "", + "bbox_x1": bbox_x1[idx] if idx < len(bbox_x1) else 0, + "bbox_y1": bbox_y1[idx] if idx < len(bbox_y1) else 0, + "bbox_x2": bbox_x2[idx] if idx < len(bbox_x2) else 0, + "bbox_y2": bbox_y2[idx] if idx < len(bbox_y2) else 0, + "sam_confidence": round(sam_conf, 4), + "clip_confidence": round(float(clip_info.get("confidence", 0.0)), 4) if clip_info else "", + "distance_cm": distances_cm[idx] if idx < len(distances_cm) else "", + "iou_score": iou_scores[idx] if idx < len(iou_scores) else "", + "is_stable": is_stable_arr[idx] if idx < len(is_stable_arr) else "", + "has_grasp": False, + "grasp": {}, + "obb_angle_deg": "", "obb_theta_rad": "", + "obb_width_px": "", "obb_height_px": "", + "obb_center_u": "", "obb_center_v": "", + }) + + avg_sam_conf = total_sam_conf / num_dets if num_dets > 0 else 0.0 + avg_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0 + stability_rate = sum(1 for s in is_stable_arr if s) / len(is_stable_arr) if is_stable_arr else 0.0 + + run = { + "meta": { + "run_no": run_no, + "timestamp": datetime.utcnow().isoformat() + "Z", + "latency_s": round(latency_s, 3), + "source": "detect_objects", + }, + "sam": { + "success": True, + "latency_s": round(latency_s, 3), + "total_detections": num_dets, + "avg_confidence": round(avg_sam_conf, 4), + "average_iou": round(avg_iou, 4), + "stability_rate": round(stability_rate, 4), + }, + "clip": { + "success": bool(clip_classifications), + "latency_s": 0.0, + "filtered_regions": len(clip_classifications), + }, + "scene": {"success": False, "latency_s": 0.0}, + "obb": {"success": False, "latency_s": 0.0}, + "objects": objects, + "relations": [], + "grasps": [], + } + + history.append(run) + history = history[-20:] + + with open(history_file, "w") as f: + json.dump(history, f, indent=2) + + return run, history + + +# ── Test 1: File is written ──────────────────────────────────────────────────── + +def test_file_written(): + print("\n── Test 1: File write ───────────────────────────────────────────") + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tf: + tmp = Path(tf.name) + try: + run, history = simulate_save(FAKE_DETECTIONS, FAKE_CLIP, 1.23, tmp) + check("File created", tmp.exists()) + check("File is valid JSON", True) # would have thrown above + check("History is list", isinstance(history, list)) + check("History has 1 entry", len(history) == 1) + check("run_no == 1", run["meta"]["run_no"] == 1) + print(f" {INFO} Written to: {tmp}") + return tmp, run, history + except Exception as e: + check("No exception during save", False, str(e)) + traceback.print_exc() + return None, None, None + finally: + pass # keep file for next test + + +# ── Test 2: JSON schema matches /api/run-history dashboard expectations ──────── + +def test_run_history_schema(run): + print("\n── Test 2: /api/run-history schema ─────────────────────────────") + if run is None: + print(" Skipped (previous test failed)") + return + + meta = run.get("meta", {}) + sam = run.get("sam", {}) + clip = run.get("clip", {}) + objs = run.get("objects", []) + + # meta fields (dashboard uses: run_no, timestamp, latency_s) + check("meta.run_no present", "run_no" in meta) + check("meta.timestamp present", "timestamp" in meta) + check("meta.latency_s present", "latency_s" in meta) + + # sam fields (dashboard uses: total_detections, avg_confidence, average_iou, stability_rate, latency_s, success) + for field in ["total_detections", "avg_confidence", "average_iou", "stability_rate", "latency_s", "success"]: + check(f"sam.{field} present", field in sam) + + # clip fields (dashboard uses: filtered_regions, latency_s, success) + for field in ["filtered_regions", "latency_s", "success"]: + check(f"clip.{field} present", field in clip) + + # objects array + check("objects is list", isinstance(objs, list)) + check("objects not empty", len(objs) == len(FAKE_DETECTIONS)) + + # per-object fields (dashboard uses all of these) + obj_fields = [ + "object_id", "label", "bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", + "sam_confidence", "clip_confidence", "distance_cm", + "iou_score", "is_stable", "has_grasp", "grasp", + "obb_angle_deg", "obb_theta_rad", + ] + if objs: + for field in obj_fields: + check(f"objects[0].{field} present", field in objs[0]) + + # value spot-checks + check("sam.total_detections == 2", sam.get("total_detections") == 2) + check("clip.filtered_regions == 2", clip.get("filtered_regions") == 2) + check("object label 'cup' in object_id", "cup" in objs[0].get("object_id", "") or objs[0].get("label") == "cup") + check("bbox values correct", objs[0]["bbox_x1"] == 100 and objs[0]["bbox_y2"] == 400) + check("sam_confidence is float", isinstance(objs[0]["sam_confidence"], float)) + check("clip_confidence is float", isinstance(objs[0]["clip_confidence"], float)) + check("iou_score is float", isinstance(objs[0]["iou_score"], float)) + check("is_stable is bool", isinstance(objs[0]["is_stable"], bool)) + + +# ── Test 3: /api/data schema (benchmark_dashboard SAM topic callback) ────────── + +def test_api_data_schema(): + print("\n── Test 3: /api/data schema (sam_detections_callback output) ───") + # Replicate what benchmark_dashboard.sam_detections_callback produces + timestamp = datetime.now().isoformat() + sam_data_records = [] + for det in FAKE_DETECTIONS: + bbox = det["bbox"] + center = det["center"] + record = { + "timestamp": timestamp, + "frame_id": "camera_link", + "obj_id": det["id"], + "bbox": {"x1": bbox[0], "y1": bbox[1], "x2": bbox[2], "y2": bbox[3]}, + "center": {"u": center[0], "v": center[1]}, + "confidence": float(det["confidence"]), + "area": det["area"], + "distance_cm": float(det["distance_cm"]), + "iou_score": float(det["iou_score"]), + "is_stable": bool(det["is_stable"]), + "ap_iou_threshold": 0.5 if det["is_stable"] else 0.0, + } + sam_data_records.append(record) + + # Check each field that updateSAMDetections() accesses in the HTML + r = sam_data_records[0] + for field in ["obj_id", "bbox", "center", "confidence", "iou_score", "is_stable", "distance_cm", "timestamp"]: + check(f"sam record has '{field}'", field in r) + + check("bbox has x1/y1/x2/y2", all(k in r["bbox"] for k in ["x1","y1","x2","y2"])) + check("center has u/v", all(k in r["center"] for k in ["u","v"])) + check("confidence is numeric", isinstance(r["confidence"], float)) + check("distance_cm is numeric", isinstance(r["distance_cm"], float)) + + # Simulate full /api/data structure + api_data = { + "pixel_to_real": [], + "sam_detections": sam_data_records, + "clip_classifications":[], + "grasp_detections": [], + "scene_understanding": [], + "metadata": {"start_time": timestamp, "total_calls": len(sam_data_records)}, + } + # Check updateDashboard() fields + for key in ["pixel_to_real", "sam_detections", "clip_classifications", + "grasp_detections", "scene_understanding", "metadata"]: + check(f"/api/data has '{key}'", key in api_data) + check("metadata.total_calls present", "total_calls" in api_data["metadata"]) + + +# ── Test 4: Incremental append (run_no increments) ─────────────────────────── + +def test_incremental_append(tmp_file): + print("\n── Test 4: Incremental append (run numbers) ─────────────────────") + if tmp_file is None: + print(" Skipped") + return + simulate_save(FAKE_DETECTIONS, FAKE_CLIP, 0.9, tmp_file) + simulate_save(FAKE_DETECTIONS, FAKE_CLIP, 1.1, tmp_file) + + with open(tmp_file) as f: + history = json.load(f) + + check("3 runs in history", len(history) == 3) + check("run_no increments: 1,2,3", [r["meta"]["run_no"] for r in history] == [1, 2, 3]) + check("latest run_no is 3", history[-1]["meta"]["run_no"] == 3) + check("source == 'detect_objects'", all(r["meta"].get("source") == "detect_objects" for r in history)) + + +# ── Test 5: Actual HISTORY_FILE path resolution ─────────────────────────────── + +def test_path_resolution(): + print("\n── Test 5: Path resolution ──────────────────────────────────────") + # This is how simple_sam_detector.py resolves the path: + # Path(__file__).parent.parent where __file__ is vision/simple_sam_detector.py + sam_file = PACKAGE_ROOT / "vision" / "simple_sam_detector.py" + resolved = sam_file.parent.parent / "vision_runs_history.json" + + # And how benchmark_dashboard.py resolves it: + dash_file = PACKAGE_ROOT / "vision" / "benchmark_dashboard.py" + dash_resolved = dash_file.parent.parent / "vision_runs_history.json" + + check("simple_sam_detector path resolves correctly", + str(resolved) == str(HISTORY_FILE), + f"got {resolved}") + check("benchmark_dashboard path resolves correctly", + str(dash_resolved) == str(HISTORY_FILE), + f"got {dash_resolved}") + check("Both paths are identical", + resolved == dash_resolved) + check("simple_sam_detector.py exists", + sam_file.exists(), + f"missing: {sam_file}") + check("benchmark_dashboard.py exists", + dash_file.exists(), + f"missing: {dash_file}") + print(f" {INFO} History file path: {HISTORY_FILE}") + + +# ── Test 6: Dashboard HTML endpoints exist ──────────────────────────────────── + +def test_html_endpoints(): + print("\n── Test 6: Dashboard HTML references ────────────────────────────") + html_file = PACKAGE_ROOT / "dashboard" / "index.html" + check("index.html exists", html_file.exists()) + if html_file.exists(): + content = html_file.read_text() + check("fetches /api/data", "/api/data" in content) + check("fetches /api/run-history", "/api/run-history" in content) + check("polls every 2s", "2000" in content) + check("runHistoryBody table", "runHistoryBody" in content) + check("samBody table", "samBody" in content) + check("latestObjectsBody table", "latestObjectsBody" in content) + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main(): + print("=" * 65) + print(" Vision Dashboard Data-Flow Unit Test") + print("=" * 65) + + tmp_file, run, history = test_file_written() + test_run_history_schema(run) + test_api_data_schema() + test_incremental_append(tmp_file) + test_path_resolution() + test_html_endpoints() + + # Clean up temp file + if tmp_file and tmp_file.exists(): + tmp_file.unlink() + + print("\n" + "=" * 65) + if failures: + print(f"\033[91m {len(failures)} FAILED:\033[0m") + for f in failures: + print(f" • {f}") + print("=" * 65) + sys.exit(1) + else: + print(f"\033[92m All tests passed.\033[0m") + print("=" * 65) + + # ── Show current history file state ─────────────────────────────────────── + print(f"\n{INFO} Checking actual history file: {HISTORY_FILE}") + if HISTORY_FILE.exists(): + with open(HISTORY_FILE) as f: + data = json.load(f) + print(f" {PASS} File exists — {len(data)} run(s) stored") + if data: + latest = data[-1] + print(f" {INFO} Latest run: #{latest['meta']['run_no']} " + f"ts={latest['meta']['timestamp']} " + f"objects={latest['sam']['total_detections']}") + else: + print(f" \033[93m[WARN]\033[0m {HISTORY_FILE} does not exist yet.") + print(f" The file is created when you call:") + print(f" ros2 service call /vision/detect_objects custom_interfaces/srv/DetectObjects") + print(f" Make sure simple_sam_detector is running and a camera frame is available.") + + +if __name__ == "__main__": + main()