From 7e0610ee3ed325cc8120462eceefd9b6ea737929 Mon Sep 17 00:00:00 2001 From: Paul Nechifor Date: Sun, 22 Feb 2026 00:11:53 +0200 Subject: [PATCH] fix(metric3d): remove metric3d to fix tests --- dimos/environment/__init__.py | 0 dimos/environment/environment.py | 178 ------------------ dimos/models/depth/__init__.py | 0 dimos/models/depth/metric3d.py | 187 ------------------- dimos/models/depth/test_metric3d.py | 102 ----------- dimos/robot/all_blueprints.py | 1 - dimos/robot/drone/README.md | 9 +- dimos/robot/drone/camera_module.py | 68 +------ dimos/robot/drone/drone.py | 4 +- dimos/robot/unitree/depth_module.py | 243 ------------------------- dimos/robot/unitree_webrtc/__init__.py | 1 - dimos/types/manipulation.py | 2 +- onnx/metric3d_vit_small.onnx | 3 - 13 files changed, 8 insertions(+), 790 deletions(-) delete mode 100644 dimos/environment/__init__.py delete mode 100644 dimos/environment/environment.py delete mode 100644 dimos/models/depth/__init__.py delete mode 100644 dimos/models/depth/metric3d.py delete mode 100644 dimos/models/depth/test_metric3d.py delete mode 100644 dimos/robot/unitree/depth_module.py delete mode 100644 onnx/metric3d_vit_small.onnx diff --git a/dimos/environment/__init__.py b/dimos/environment/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/dimos/environment/environment.py b/dimos/environment/environment.py deleted file mode 100644 index ba1923b765..0000000000 --- a/dimos/environment/environment.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC, abstractmethod - -import numpy as np - - -class Environment(ABC): - def __init__(self) -> None: - self.environment_type = None - self.graph = None - - @abstractmethod - def label_objects(self) -> list[str]: - """ - Label all objects in the environment. - - Returns: - A list of string labels representing the objects in the environment. - """ - pass - - @abstractmethod - def get_visualization(self, format_type): # type: ignore[no-untyped-def] - """Return different visualization formats like images, NERFs, or other 3D file types.""" - pass - - @abstractmethod - def generate_segmentations( # type: ignore[no-untyped-def] - self, model: str | None = None, objects: list[str] | None = None, *args, **kwargs - ) -> list[np.ndarray]: # type: ignore[type-arg] - """ - Generate object segmentations of objects[] using neural methods. - - Args: - model (str, optional): The string of the desired segmentation model (SegmentAnything, etc.) - objects (list[str], optional): The list of strings of the specific objects to segment. - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - list of numpy.ndarray: A list where each element is a numpy array - representing a binary mask for a segmented area of an object in the environment. - - Note: - The specific arguments and their usage will depend on the subclass implementation. - """ - pass - - @abstractmethod - def get_segmentations(self) -> list[np.ndarray]: # type: ignore[type-arg] - """ - Get segmentations using a method like 'segment anything'. - - Returns: - list of numpy.ndarray: A list where each element is a numpy array - representing a binary mask for a segmented area of an object in the environment. - """ - pass - - @abstractmethod - def generate_point_cloud(self, object: str | None = None, *args, **kwargs) -> np.ndarray: # type: ignore[no-untyped-def, type-arg] - """ - Generate a point cloud for the entire environment or a specific object. - - Args: - object (str, optional): The string of the specific object to get the point cloud for. - If None, returns the point cloud for the entire environment. - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - np.ndarray: A numpy array representing the generated point cloud. - Shape: (n, 3) where n is the number of points and each point is [x, y, z]. - - Note: - The specific arguments and their usage will depend on the subclass implementation. - """ - pass - - @abstractmethod - def get_point_cloud(self, object: str | None = None) -> np.ndarray: # type: ignore[type-arg] - """ - Return point clouds of the entire environment or a specific object. - - Args: - object (str, optional): The string of the specific object to get the point cloud for. If None, returns the point cloud for the entire environment. - - Returns: - np.ndarray: A numpy array representing the point cloud. - Shape: (n, 3) where n is the number of points and each point is [x, y, z]. - """ - pass - - @abstractmethod - def generate_depth_map( # type: ignore[no-untyped-def] - self, - stereo: bool | None = None, - monocular: bool | None = None, - model: str | None = None, - *args, - **kwargs, - ) -> np.ndarray: # type: ignore[type-arg] - """ - Generate a depth map using monocular or stereo camera methods. - - Args: - stereo (bool, optional): Whether to stereo camera is avaliable for ground truth depth map generation. - monocular (bool, optional): Whether to use monocular camera for neural depth map generation. - model (str, optional): The string of the desired monocular depth model (Metric3D, ZoeDepth, etc.) - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - np.ndarray: A 2D numpy array representing the generated depth map. - Shape: (height, width) where each value represents the depth - at that pixel location. - - Note: - The specific arguments and their usage will depend on the subclass implementation. - """ - pass - - @abstractmethod - def get_depth_map(self) -> np.ndarray: # type: ignore[type-arg] - """ - Return a depth map of the environment. - - Returns: - np.ndarray: A 2D numpy array representing the depth map. - Shape: (height, width) where each value represents the depth - at that pixel location. Typically, closer objects have smaller - values and farther objects have larger values. - - Note: - The exact range and units of the depth values may vary depending on the - specific implementation and the sensor or method used to generate the depth map. - """ - pass - - def initialize_from_images(self, images): # type: ignore[no-untyped-def] - """Initialize the environment from a set of image frames or video.""" - raise NotImplementedError("This method is not implemented for this environment type.") - - def initialize_from_file(self, file_path): # type: ignore[no-untyped-def] - """Initialize the environment from a spatial file type. - - Supported file types include: - - GLTF/GLB (GL Transmission Format) - - FBX (Filmbox) - - OBJ (Wavefront Object) - - USD/USDA/USDC (Universal Scene Description) - - STL (Stereolithography) - - COLLADA (DAE) - - Alembic (ABC) - - PLY (Polygon File Format) - - 3DS (3D Studio) - - VRML/X3D (Virtual Reality Modeling Language) - - Args: - file_path (str): Path to the spatial file. - - Raises: - NotImplementedError: If the method is not implemented for this environment type. - """ - raise NotImplementedError("This method is not implemented for this environment type.") diff --git a/dimos/models/depth/__init__.py b/dimos/models/depth/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/dimos/models/depth/metric3d.py b/dimos/models/depth/metric3d.py deleted file mode 100644 index a668ea321e..0000000000 --- a/dimos/models/depth/metric3d.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from functools import cached_property -from typing import Any - -import cv2 -import torch - -from dimos.models.base import LocalModel, LocalModelConfig - - -@dataclass -class Metric3DConfig(LocalModelConfig): - """Configuration for Metric3D depth estimation model.""" - - camera_intrinsics: list[float] = field(default_factory=lambda: [500.0, 500.0, 320.0, 240.0]) - """Camera intrinsics [fx, fy, cx, cy].""" - - gt_depth_scale: float = 256.0 - """Scale factor for ground truth depth.""" - - device: str = "cuda" if torch.cuda.is_available() else "cpu" - """Device to run the model on.""" - - -class Metric3D(LocalModel): - default_config = Metric3DConfig - config: Metric3DConfig - - def __init__(self, **kwargs: object) -> None: - super().__init__(**kwargs) - self.intrinsic = self.config.camera_intrinsics - self.intrinsic_scaled: list[float] | None = None - self.gt_depth_scale = self.config.gt_depth_scale - self.pad_info: list[int] | None = None - self.rgb_origin: Any = None - - @cached_property - def _model(self) -> Any: - model = torch.hub.load( # type: ignore[no-untyped-call] - "yvanyin/metric3d", "metric3d_vit_small", pretrain=True - ) - model = model.to(self.device) - model.eval() - return model - - """ - Input: Single image in RGB format - Output: Depth map - """ - - def update_intrinsic(self, intrinsic): # type: ignore[no-untyped-def] - """ - Update the intrinsic parameters dynamically. - Ensure that the input intrinsic is valid. - """ - if len(intrinsic) != 4: - raise ValueError("Intrinsic must be a list or tuple with 4 values: [fx, fy, cx, cy]") - self.intrinsic = intrinsic - print(f"Intrinsics updated to: {self.intrinsic}") - - def infer_depth(self, img, debug: bool = False): # type: ignore[no-untyped-def] - if debug: - print(f"Input image: {img}") - try: - if isinstance(img, str): - print(f"Image type string: {type(img)}") - img_data = cv2.imread(img) - if img_data is None: - raise ValueError(f"Failed to load image from {img}") - self.rgb_origin = img_data[:, :, ::-1] - else: - # print(f"Image type not string: {type(img)}, cv2 conversion assumed to be handled. If not, this will throw an error") - self.rgb_origin = img - except Exception as e: - print(f"Error parsing into infer_depth: {e}") - - img = self.rescale_input(img, self.rgb_origin) # type: ignore[no-untyped-call] - - with torch.no_grad(): - pred_depth, confidence, output_dict = self._model.inference({"input": img}) - - # Convert to PIL format - depth_image = self.unpad_transform_depth(pred_depth) # type: ignore[no-untyped-call] - - return depth_image.cpu().numpy() - - def save_depth(self, pred_depth) -> None: # type: ignore[no-untyped-def] - # Save the depth map to a file - pred_depth_np = pred_depth.cpu().numpy() - output_depth_file = "output_depth_map.png" - cv2.imwrite(output_depth_file, pred_depth_np) - print(f"Depth map saved to {output_depth_file}") - - # Adjusts input size to fit pretrained ViT model - def rescale_input(self, rgb, rgb_origin): # type: ignore[no-untyped-def] - #### ajust input size to fit pretrained model - # keep ratio resize - input_size = (616, 1064) # for vit model - # input_size = (544, 1216) # for convnext model - h, w = rgb_origin.shape[:2] - scale = min(input_size[0] / h, input_size[1] / w) - rgb = cv2.resize( - rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR - ) - # remember to scale intrinsic, hold depth - self.intrinsic_scaled = [ - self.intrinsic[0] * scale, - self.intrinsic[1] * scale, - self.intrinsic[2] * scale, - self.intrinsic[3] * scale, - ] - # padding to input_size - padding = [123.675, 116.28, 103.53] - h, w = rgb.shape[:2] - pad_h = input_size[0] - h - pad_w = input_size[1] - w - pad_h_half = pad_h // 2 - pad_w_half = pad_w // 2 - rgb = cv2.copyMakeBorder( - rgb, - pad_h_half, - pad_h - pad_h_half, - pad_w_half, - pad_w - pad_w_half, - cv2.BORDER_CONSTANT, - value=padding, - ) - self.pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half] - - #### normalize - mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None] - std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None] - rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float() - rgb = torch.div((rgb - mean), std) - rgb = rgb[None, :, :, :].to(self.device) - return rgb - - def unpad_transform_depth(self, pred_depth): # type: ignore[no-untyped-def] - # un pad - pred_depth = pred_depth.squeeze() - pred_depth = pred_depth[ - self.pad_info[0] : pred_depth.shape[0] - self.pad_info[1], # type: ignore[index] - self.pad_info[2] : pred_depth.shape[1] - self.pad_info[3], # type: ignore[index] - ] - - # upsample to original size - pred_depth = torch.nn.functional.interpolate( - pred_depth[None, None, :, :], - self.rgb_origin.shape[:2], - mode="bilinear", - ).squeeze() - ###################### canonical camera space ###################### - - #### de-canonical transform - canonical_to_real_scale = ( - self.intrinsic_scaled[0] / 1000.0 # type: ignore[index] - ) # 1000.0 is the focal length of canonical camera - pred_depth = pred_depth * canonical_to_real_scale # now the depth is metric - pred_depth = torch.clamp(pred_depth, 0, 1000) - return pred_depth - - def eval_predicted_depth(self, depth_file, pred_depth) -> None: # type: ignore[no-untyped-def] - if depth_file is not None: - gt_depth_np = cv2.imread(depth_file, -1) - if gt_depth_np is None: - raise ValueError(f"Failed to load depth file from {depth_file}") - gt_depth_scaled = gt_depth_np / self.gt_depth_scale - gt_depth = torch.from_numpy(gt_depth_scaled).float().to(self.device) - assert gt_depth.shape == pred_depth.shape - - mask = gt_depth > 1e-8 # type: ignore[operator] - abs_rel_err = (torch.abs(pred_depth[mask] - gt_depth[mask]) / gt_depth[mask]).mean() # type: ignore[index] - print("abs_rel_err:", abs_rel_err.item()) diff --git a/dimos/models/depth/test_metric3d.py b/dimos/models/depth/test_metric3d.py deleted file mode 100644 index 33e39f6a29..0000000000 --- a/dimos/models/depth/test_metric3d.py +++ /dev/null @@ -1,102 +0,0 @@ -from contextlib import contextmanager - -import numpy as np -import pytest - -from dimos.models.depth.metric3d import Metric3D -from dimos.msgs.sensor_msgs import Image -from dimos.utils.data import get_data - - -@contextmanager -def skip_xformers_unsupported(): - try: - yield - except NotImplementedError as e: - if "memory_efficient_attention" in str(e): - pytest.skip(f"xformers not supported on this GPU: {e}") - raise - - -@pytest.fixture -def sample_intrinsics() -> list[float]: - """Sample camera intrinsics [fx, fy, cx, cy].""" - return [500.0, 500.0, 320.0, 240.0] - -@pytest.mark.cuda -@pytest.mark.gpu -def test_metric3d_init(sample_intrinsics: list[float]) -> None: - """Test Metric3D initialization.""" - model = Metric3D(camera_intrinsics=sample_intrinsics) - assert model.config.camera_intrinsics == sample_intrinsics - assert model.config.gt_depth_scale == 256.0 - assert model.device == "cuda" - - -@pytest.mark.gpu -def test_metric3d_update_intrinsic(sample_intrinsics: list[float]) -> None: - """Test updating camera intrinsics.""" - model = Metric3D(camera_intrinsics=sample_intrinsics) - - new_intrinsics = [600.0, 600.0, 400.0, 300.0] - model.update_intrinsic(new_intrinsics) - assert model.intrinsic == new_intrinsics - -@pytest.mark.gpu -def test_metric3d_update_intrinsic_invalid(sample_intrinsics: list[float]) -> None: - """Test that invalid intrinsics raise an error.""" - model = Metric3D(camera_intrinsics=sample_intrinsics) - - with pytest.raises(ValueError, match="Intrinsic must be a list"): - model.update_intrinsic([1.0, 2.0]) # Only 2 values - - -@pytest.mark.cuda -@pytest.mark.gpu -def test_metric3d_infer_depth(sample_intrinsics: list[float]) -> None: - """Test depth inference on a sample image.""" - model = Metric3D(camera_intrinsics=sample_intrinsics) - model.start() - - # Load test image - image = Image.from_file(get_data("cafe.jpg")).to_rgb() - rgb_array = image.data - - # Run inference - with skip_xformers_unsupported(): - depth_map = model.infer_depth(rgb_array) - - # Verify output - assert isinstance(depth_map, np.ndarray) - assert depth_map.shape[:2] == rgb_array.shape[:2] # Same spatial dimensions - assert depth_map.dtype in [np.float32, np.float64] - assert depth_map.min() >= 0 # Depth should be non-negative - - print(f"Depth map shape: {depth_map.shape}") - print(f"Depth range: [{depth_map.min():.2f}, {depth_map.max():.2f}]") - - model.stop() - - -@pytest.mark.cuda -@pytest.mark.gpu -def test_metric3d_multiple_inferences(sample_intrinsics: list[float]) -> None: - """Test multiple depth inferences.""" - model = Metric3D(camera_intrinsics=sample_intrinsics) - model.start() - - image = Image.from_file(get_data("cafe.jpg")).to_rgb() - rgb_array = image.data - - # Run multiple inferences - depths = [] - for _ in range(3): - with skip_xformers_unsupported(): - depth = model.infer_depth(rgb_array) - depths.append(depth) - - # Results should be consistent - for i in range(1, len(depths)): - assert np.allclose(depths[0], depths[i], rtol=1e-5) - - model.stop() diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 19d7e7db29..bdfd98cd17 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -95,7 +95,6 @@ "cost_mapper": "dimos.mapping.costmapper", "demo_calculator_skill": "dimos.agents.skills.demo_calculator_skill", "demo_robot": "dimos.agents.skills.demo_robot", - "depth_module": "dimos.robot.unitree.depth_module", "detection3d_module": "dimos.perception.detection.module3D", "detection_db_module": "dimos.perception.detection.moduleDB", "fastlio2_module": "dimos.hardware.sensors.lidar.fastlio2.module", diff --git a/dimos/robot/drone/README.md b/dimos/robot/drone/README.md index 6e8ceb4d63..100e2deadd 100644 --- a/dimos/robot/drone/README.md +++ b/dimos/robot/drone/README.md @@ -126,7 +126,7 @@ DJI Drone ← Wireless → DJI Controller ← USB → Android Device ← WiFi ``` drone.py # Main orchestrator ├── connection_module.py # MAVLink communication & skills -├── camera_module.py # Video processing & depth estimation +├── camera_module.py # Video processing ├── tracking_module.py # Visual servoing & object tracking ├── mavlink_connection.py # Low-level MAVLink protocol └── dji_video_stream.py # GStreamer video capture @@ -242,13 +242,6 @@ drone.start() - **ROS/DimOS**: X=Forward, Y=Left, Z=Up - Automatic conversion handled internally -### Depth Estimation -Camera module can generate depth maps using Metric3D: -```python -# Depth published to /drone/depth and /drone/pointcloud -# Requires GPU with 8GB+ VRAM -``` - ### Foxglove Visualization Connect Foxglove Studio to `ws://localhost:8765` to see: - Live video with tracking overlay diff --git a/dimos/robot/drone/camera_module.py b/dimos/robot/drone/camera_module.py index 8ba88fd028..248b1ceb6e 100644 --- a/dimos/robot/drone/camera_module.py +++ b/dimos/robot/drone/camera_module.py @@ -15,7 +15,7 @@ # Copyright 2025-2026 Dimensional Inc. -"""Camera module for drone with depth estimation.""" +"""Camera module for drone.""" import threading import time @@ -25,9 +25,8 @@ from dimos.core import In, Module, Out, rpc from dimos.msgs.geometry_msgs import PoseStamped -from dimos.msgs.sensor_msgs import Image, ImageFormat +from dimos.msgs.sensor_msgs import Image from dimos.msgs.std_msgs import Header -from dimos.perception.common.utils import colorize_depth from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -35,15 +34,13 @@ class DroneCameraModule(Module): """ - Camera module for drone that processes RGB images to generate depth using Metric3D. + Camera module for drone Subscribes to: - /video: RGB camera images from drone Publishes: - /drone/color_image: RGB camera images - - /drone/depth_image: Depth images from Metric3D - - /drone/depth_colorized: Colorized depth - /drone/camera_info: Camera calibration - /drone/camera_pose: Camera pose from TF """ @@ -53,8 +50,6 @@ class DroneCameraModule(Module): # Outputs color_image: Out[Image] - depth_image: Out[Image] - depth_colorized: Out[Image] camera_info: Out[CameraInfo] camera_pose: Out[PoseStamped] @@ -64,7 +59,6 @@ def __init__( world_frame_id: str = "world", camera_frame_id: str = "camera_link", base_frame_id: str = "base_link", - gt_depth_scale: float = 2.0, **kwargs: Any, ) -> None: """Initialize drone camera module. @@ -73,7 +67,6 @@ def __init__( camera_intrinsics: [fx, fy, cx, cy] camera_frame_id: TF frame for camera base_frame_id: TF frame for drone base - gt_depth_scale: Depth scale factor """ super().__init__(**kwargs) @@ -84,10 +77,6 @@ def __init__( self.camera_frame_id = camera_frame_id self.base_frame_id = base_frame_id self.world_frame_id = world_frame_id - self.gt_depth_scale = gt_depth_scale - - # Metric3D for depth - self.metric3d: Any = None # Lazy-loaded Metric3D model # Processing state self._running = False @@ -104,7 +93,6 @@ def start(self) -> None: logger.warning("Camera module already running") return - # Start processing thread for depth (which will init Metric3D and handle video) self._running = True self._stop_processing.clear() self._processing_thread = threading.Thread(target=self._processing_loop, daemon=True) @@ -121,22 +109,9 @@ def _on_video_frame(self, frame: Image) -> None: # Publish color image immediately self.color_image.publish(frame) - # Store for depth processing self._latest_frame = frame def _processing_loop(self) -> None: - """Process depth estimation in background.""" - # Initialize Metric3D in the background thread - if self.metric3d is None: - try: - from dimos.models.depth.metric3d import Metric3D - - self.metric3d = Metric3D(camera_intrinsics=self.camera_intrinsics) - logger.info("Metric3D initialized") - except Exception as e: - logger.warning(f"Metric3D not available: {e}") - self.metric3d = None - # Subscribe to video once connection is available subscribed = False while not subscribed and not self._stop_processing.is_set(): @@ -151,12 +126,10 @@ def _processing_loop(self) -> None: logger.debug(f"Waiting for video connection: {e}") time.sleep(0.1) - logger.info("Depth processing loop started") - _reported_error = False while not self._stop_processing.is_set(): - if self._latest_frame is not None and self.metric3d is not None: + if self._latest_frame is not None: try: frame = self._latest_frame self._latest_frame = None @@ -164,34 +137,9 @@ def _processing_loop(self) -> None: # Get numpy array from Image img_array = frame.data - # Generate depth - depth_array = self.metric3d.infer_depth(img_array) / self.gt_depth_scale - # Create header header = Header(self.camera_frame_id) - # Publish depth - depth_msg = Image( - data=depth_array, - format=ImageFormat.DEPTH, - frame_id=header.frame_id, - ts=header.ts, - ) - self.depth_image.publish(depth_msg) - - # Publish colorized depth - depth_colorized_array = colorize_depth( - depth_array, max_depth=10.0, overlay_stats=True - ) - if depth_colorized_array is not None: - depth_colorized_msg = Image( - data=depth_colorized_array, - format=ImageFormat.RGB, - frame_id=header.frame_id, - ts=header.ts, - ) - self.depth_colorized.publish(depth_colorized_msg) - # Publish camera info self._publish_camera_info(header, img_array.shape) @@ -201,12 +149,10 @@ def _processing_loop(self) -> None: except Exception as e: if not _reported_error: _reported_error = True - logger.error(f"Error processing depth: {e}") + logger.error(f"Error processing frame: {e}") else: time.sleep(0.01) - logger.info("Depth processing loop stopped") - def _publish_camera_info(self, header: Header, shape: tuple[int, ...]) -> None: """Publish camera calibration info.""" try: @@ -279,8 +225,4 @@ def stop(self) -> None: if self._processing_thread and self._processing_thread.is_alive(): self._processing_thread.join(timeout=2.0) - # Cleanup Metric3D - if self.metric3d: - self.metric3d.cleanup() - logger.info("Camera module stopped") diff --git a/dimos/robot/drone/drone.py b/dimos/robot/drone/drone.py index 8e72d56ed1..6b9500804f 100644 --- a/dimos/robot/drone/drone.py +++ b/dimos/robot/drone/drone.py @@ -51,7 +51,7 @@ class Drone(Robot): - """Generic MAVLink-based drone with video and depth capabilities.""" + """Generic MAVLink-based drone with video capabilities.""" def __init__( self, @@ -164,8 +164,6 @@ def _deploy_camera(self) -> None: # Configure LCM transports self.camera.color_image.transport = core.LCMTransport("/drone/color_image", Image) - self.camera.depth_image.transport = core.LCMTransport("/drone/depth_image", Image) - self.camera.depth_colorized.transport = core.LCMTransport("/drone/depth_colorized", Image) self.camera.camera_info.transport = core.LCMTransport("/drone/camera_info", CameraInfo) self.camera.camera_pose.transport = core.LCMTransport("/drone/camera_pose", PoseStamped) diff --git a/dimos/robot/unitree/depth_module.py b/dimos/robot/unitree/depth_module.py deleted file mode 100644 index 07f065caea..0000000000 --- a/dimos/robot/unitree/depth_module.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import threading -import time - -from dimos_lcm.sensor_msgs import CameraInfo -import numpy as np - -from dimos.core import In, Module, Out, rpc -from dimos.core.global_config import GlobalConfig -from dimos.msgs.sensor_msgs import Image, ImageFormat -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class DepthModule(Module): - """ - Depth module for Unitree Go2 that processes RGB images to generate depth using Metric3D. - - Subscribes to: - - /go2/color_image: RGB camera images from Unitree - - /go2/camera_info: Camera calibration information - - Publishes: - - /go2/depth_image: Depth images generated by Metric3D - """ - - # LCM inputs - color_image: In[Image] - camera_info: In[CameraInfo] - - # LCM outputs - depth_image: Out[Image] - - def __init__( # type: ignore[no-untyped-def] - self, - gt_depth_scale: float = 0.5, - cfg: GlobalConfig | None = None, - **kwargs, - ) -> None: - """ - Initialize Depth Module. - - Args: - gt_depth_scale: Ground truth depth scaling factor - """ - super().__init__(**kwargs) - - self.camera_intrinsics = None - self.gt_depth_scale = gt_depth_scale - self.metric3d = None - self._camera_info_received = False - - # Processing state - self._running = False - self._latest_frame = None - self._last_image = None - self._last_timestamp = None - self._last_depth = None - self._cannot_process_depth = False - - # Threading - self._processing_thread: threading.Thread | None = None - self._stop_processing = threading.Event() - - if cfg: - if cfg.simulation: - self.gt_depth_scale = 1.0 - - @rpc - def start(self) -> None: - super().start() - - if self._running: - logger.warning("Camera module already running") - return - - # Set running flag before starting - self._running = True - - # Subscribe to video and camera info inputs - self.color_image.subscribe(self._on_video) - self.camera_info.subscribe(self._on_camera_info) - - # Start processing thread - self._start_processing_thread() - - logger.info("Depth module started") - - @rpc - def stop(self) -> None: - if not self._running: - return - - self._running = False - self._stop_processing.set() - - # Wait for thread to finish - if self._processing_thread and self._processing_thread.is_alive(): - self._processing_thread.join(timeout=2.0) - - super().stop() - - def _on_camera_info(self, msg: CameraInfo) -> None: - """Process camera info to extract intrinsics.""" - if self.metric3d is not None: - return # Already initialized - - try: - # Extract intrinsics from camera matrix K - K = msg.K - fx = K[0] - fy = K[4] - cx = K[2] - cy = K[5] - - self.camera_intrinsics = [fx, fy, cx, cy] # type: ignore[assignment] - - # Initialize Metric3D with camera intrinsics - from dimos.models.depth.metric3d import Metric3D - - self.metric3d = Metric3D(camera_intrinsics=self.camera_intrinsics) # type: ignore[assignment] - self._camera_info_received = True - - logger.info( - f"Initialized Metric3D with intrinsics from camera_info: {self.camera_intrinsics}" - ) - - except Exception as e: - logger.error(f"Error processing camera info: {e}") - - def _on_video(self, msg: Image) -> None: - """Store latest video frame for processing.""" - if not self._running: - return - - # Simply store the latest frame - processing happens in main loop - self._latest_frame = msg # type: ignore[assignment] - logger.debug( - f"Received video frame: format={msg.format}, shape={msg.data.shape if hasattr(msg.data, 'shape') else 'unknown'}" - ) - - def _start_processing_thread(self) -> None: - """Start the processing thread.""" - self._stop_processing.clear() - self._processing_thread = threading.Thread(target=self._main_processing_loop, daemon=True) - self._processing_thread.start() - logger.info("Started depth processing thread") - - def _main_processing_loop(self) -> None: - """Main processing loop that continuously processes latest frames.""" - logger.info("Starting main processing loop") - - while not self._stop_processing.is_set(): - # Process latest frame if available - if self._latest_frame is not None: - try: - msg = self._latest_frame - self._latest_frame = None # Clear to avoid reprocessing - # Store for publishing - self._last_image = msg.data - self._last_timestamp = msg.ts if msg.ts else time.time() - # Process depth - self._process_depth(self._last_image) - - except Exception as e: - logger.error(f"Error in main processing loop: {e}", exc_info=True) - else: - # Small sleep to avoid busy waiting - time.sleep(0.001) - - logger.info("Main processing loop stopped") - - def _process_depth(self, img_array: np.ndarray) -> None: # type: ignore[type-arg] - """Process depth estimation using Metric3D.""" - if self._cannot_process_depth: - self._last_depth = None - return - - # Wait for camera info to initialize Metric3D - if self.metric3d is None: - logger.debug("Waiting for camera_info to initialize Metric3D") - return - - try: - logger.debug(f"Processing depth for image shape: {img_array.shape}") - - # Generate depth map - depth_array = self.metric3d.infer_depth(img_array) * self.gt_depth_scale - - self._last_depth = depth_array - logger.debug(f"Generated depth map shape: {depth_array.shape}") - - self._publish_depth() - - except Exception as e: - logger.error(f"Error processing depth: {e}") - self._cannot_process_depth = True - - def _publish_depth(self) -> None: - """Publish depth image.""" - if not self._running: - return - - try: - # Publish depth image - if self._last_depth is not None: - # Convert depth to uint16 (millimeters) for more efficient storage - # Clamp to valid range [0, 65.535] meters before converting - depth_clamped = np.clip(self._last_depth, 0, 65.535) - depth_uint16 = (depth_clamped * 1000).astype(np.uint16) - depth_msg = Image( - data=depth_uint16, - format=ImageFormat.DEPTH16, # Use DEPTH16 format for uint16 depth - frame_id="camera_link", - ts=self._last_timestamp, - ) - self.depth_image.publish(depth_msg) - logger.debug(f"Published depth image (uint16): shape={depth_uint16.shape}") - - except Exception as e: - logger.error(f"Error publishing depth data: {e}", exc_info=True) - - -depth_module = DepthModule.blueprint - - -__all__ = ["DepthModule", "depth_module"] diff --git a/dimos/robot/unitree_webrtc/__init__.py b/dimos/robot/unitree_webrtc/__init__.py index 4524bba226..451aa53128 100644 --- a/dimos/robot/unitree_webrtc/__init__.py +++ b/dimos/robot/unitree_webrtc/__init__.py @@ -20,7 +20,6 @@ _ALIAS_MODULES = { "demo_error_on_name_conflicts": "dimos.robot.unitree.demo_error_on_name_conflicts", - "depth_module": "dimos.robot.unitree.depth_module", "keyboard_teleop": "dimos.robot.unitree.keyboard_teleop", "mujoco_connection": "dimos.robot.unitree.mujoco_connection", "type": "dimos.robot.unitree.type", diff --git a/dimos/types/manipulation.py b/dimos/types/manipulation.py index 507b9e9b85..76ad7979f2 100644 --- a/dimos/types/manipulation.py +++ b/dimos/types/manipulation.py @@ -80,7 +80,7 @@ class ObjectData(TypedDict, total=False): # Basic detection information object_id: int # Unique ID for the object bbox: list[float] # Bounding box [x1, y1, x2, y2] - depth: float # Depth in meters from Metric3d + depth: float # Depth in meters confidence: float # Detection confidence class_id: int # Class ID from the detector label: str # Semantic label (e.g., 'cup', 'table') diff --git a/onnx/metric3d_vit_small.onnx b/onnx/metric3d_vit_small.onnx deleted file mode 100644 index bfddd41628..0000000000 --- a/onnx/metric3d_vit_small.onnx +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14805174265dd721ac3b396bd5ee7190c708cec41150ed298267f6c3126bc060 -size 151333865