From 7e0610ee3ed325cc8120462eceefd9b6ea737929 Mon Sep 17 00:00:00 2001
From: Paul Nechifor <paul@nechifor.net>
Date: Sun, 22 Feb 2026 00:11:53 +0200
Subject: [PATCH] fix(metric3d): remove metric3d to fix tests

---
 dimos/environment/__init__.py          |   0
 dimos/environment/environment.py       | 178 ------------------
 dimos/models/depth/__init__.py         |   0
 dimos/models/depth/metric3d.py         | 187 -------------------
 dimos/models/depth/test_metric3d.py    | 102 -----------
 dimos/robot/all_blueprints.py          |   1 -
 dimos/robot/drone/README.md            |   9 +-
 dimos/robot/drone/camera_module.py     |  68 +------
 dimos/robot/drone/drone.py             |   4 +-
 dimos/robot/unitree/depth_module.py    | 243 -------------------------
 dimos/robot/unitree_webrtc/__init__.py |   1 -
 dimos/types/manipulation.py            |   2 +-
 onnx/metric3d_vit_small.onnx           |   3 -
 13 files changed, 8 insertions(+), 790 deletions(-)
 delete mode 100644 dimos/environment/__init__.py
 delete mode 100644 dimos/environment/environment.py
 delete mode 100644 dimos/models/depth/__init__.py
 delete mode 100644 dimos/models/depth/metric3d.py
 delete mode 100644 dimos/models/depth/test_metric3d.py
 delete mode 100644 dimos/robot/unitree/depth_module.py
 delete mode 100644 onnx/metric3d_vit_small.onnx

diff --git a/dimos/environment/__init__.py b/dimos/environment/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/dimos/environment/environment.py b/dimos/environment/environment.py
deleted file mode 100644
index ba1923b765..0000000000
--- a/dimos/environment/environment.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Copyright 2025-2026 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from abc import ABC, abstractmethod
-
-import numpy as np
-
-
-class Environment(ABC):
-    def __init__(self) -> None:
-        self.environment_type = None
-        self.graph = None
-
-    @abstractmethod
-    def label_objects(self) -> list[str]:
-        """
-        Label all objects in the environment.
-
-        Returns:
-            A list of string labels representing the objects in the environment.
-        """
-        pass
-
-    @abstractmethod
-    def get_visualization(self, format_type):  # type: ignore[no-untyped-def]
-        """Return different visualization formats like images, NERFs, or other 3D file types."""
-        pass
-
-    @abstractmethod
-    def generate_segmentations(  # type: ignore[no-untyped-def]
-        self, model: str | None = None, objects: list[str] | None = None, *args, **kwargs
-    ) -> list[np.ndarray]:  # type: ignore[type-arg]
-        """
-        Generate object segmentations of objects[] using neural methods.
-
-        Args:
-            model (str, optional): The string of the desired segmentation model (SegmentAnything, etc.)
-            objects (list[str], optional): The list of strings of the specific objects to segment.
-            *args: Variable length argument list.
-            **kwargs: Arbitrary keyword arguments.
-
-        Returns:
-            list of numpy.ndarray: A list where each element is a numpy array
-            representing a binary mask for a segmented area of an object in the environment.
-
-        Note:
-            The specific arguments and their usage will depend on the subclass implementation.
-        """
-        pass
-
-    @abstractmethod
-    def get_segmentations(self) -> list[np.ndarray]:  # type: ignore[type-arg]
-        """
-        Get segmentations using a method like 'segment anything'.
-
-        Returns:
-            list of numpy.ndarray: A list where each element is a numpy array
-            representing a binary mask for a segmented area of an object in the environment.
-        """
-        pass
-
-    @abstractmethod
-    def generate_point_cloud(self, object: str | None = None, *args, **kwargs) -> np.ndarray:  # type: ignore[no-untyped-def, type-arg]
-        """
-        Generate a point cloud for the entire environment or a specific object.
-
-        Args:
-            object (str, optional): The string of the specific object to get the point cloud for.
-                                    If None, returns the point cloud for the entire environment.
-            *args: Variable length argument list.
-            **kwargs: Arbitrary keyword arguments.
-
-        Returns:
-            np.ndarray: A numpy array representing the generated point cloud.
-                        Shape: (n, 3) where n is the number of points and each point is [x, y, z].
-
-        Note:
-            The specific arguments and their usage will depend on the subclass implementation.
-        """
-        pass
-
-    @abstractmethod
-    def get_point_cloud(self, object: str | None = None) -> np.ndarray:  # type: ignore[type-arg]
-        """
-        Return point clouds of the entire environment or a specific object.
-
-        Args:
-            object (str, optional): The string of the specific object to get the point cloud for. If None, returns the point cloud for the entire environment.
-
-        Returns:
-            np.ndarray: A numpy array representing the point cloud.
-                        Shape: (n, 3) where n is the number of points and each point is [x, y, z].
-        """
-        pass
-
-    @abstractmethod
-    def generate_depth_map(  # type: ignore[no-untyped-def]
-        self,
-        stereo: bool | None = None,
-        monocular: bool | None = None,
-        model: str | None = None,
-        *args,
-        **kwargs,
-    ) -> np.ndarray:  # type: ignore[type-arg]
-        """
-        Generate a depth map using monocular or stereo camera methods.
-
-        Args:
-            stereo (bool, optional): Whether to stereo camera is avaliable for ground truth depth map generation.
-            monocular (bool, optional): Whether to use monocular camera for neural depth map generation.
-            model (str, optional): The string of the desired monocular depth model (Metric3D, ZoeDepth, etc.)
-            *args: Variable length argument list.
-            **kwargs: Arbitrary keyword arguments.
-
-        Returns:
-            np.ndarray: A 2D numpy array representing the generated depth map.
-                        Shape: (height, width) where each value represents the depth
-                        at that pixel location.
-
-        Note:
-            The specific arguments and their usage will depend on the subclass implementation.
-        """
-        pass
-
-    @abstractmethod
-    def get_depth_map(self) -> np.ndarray:  # type: ignore[type-arg]
-        """
-        Return a depth map of the environment.
-
-        Returns:
-            np.ndarray: A 2D numpy array representing the depth map.
-                        Shape: (height, width) where each value represents the depth
-                        at that pixel location. Typically, closer objects have smaller
-                        values and farther objects have larger values.
-
-        Note:
-            The exact range and units of the depth values may vary depending on the
-            specific implementation and the sensor or method used to generate the depth map.
-        """
-        pass
-
-    def initialize_from_images(self, images):  # type: ignore[no-untyped-def]
-        """Initialize the environment from a set of image frames or video."""
-        raise NotImplementedError("This method is not implemented for this environment type.")
-
-    def initialize_from_file(self, file_path):  # type: ignore[no-untyped-def]
-        """Initialize the environment from a spatial file type.
-
-        Supported file types include:
-        - GLTF/GLB (GL Transmission Format)
-        - FBX (Filmbox)
-        - OBJ (Wavefront Object)
-        - USD/USDA/USDC (Universal Scene Description)
-        - STL (Stereolithography)
-        - COLLADA (DAE)
-        - Alembic (ABC)
-        - PLY (Polygon File Format)
-        - 3DS (3D Studio)
-        - VRML/X3D (Virtual Reality Modeling Language)
-
-        Args:
-            file_path (str): Path to the spatial file.
-
-        Raises:
-            NotImplementedError: If the method is not implemented for this environment type.
-        """
-        raise NotImplementedError("This method is not implemented for this environment type.")
diff --git a/dimos/models/depth/__init__.py b/dimos/models/depth/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/dimos/models/depth/metric3d.py b/dimos/models/depth/metric3d.py
deleted file mode 100644
index a668ea321e..0000000000
--- a/dimos/models/depth/metric3d.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright 2025-2026 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass, field
-from functools import cached_property
-from typing import Any
-
-import cv2
-import torch
-
-from dimos.models.base import LocalModel, LocalModelConfig
-
-
-@dataclass
-class Metric3DConfig(LocalModelConfig):
-    """Configuration for Metric3D depth estimation model."""
-
-    camera_intrinsics: list[float] = field(default_factory=lambda: [500.0, 500.0, 320.0, 240.0])
-    """Camera intrinsics [fx, fy, cx, cy]."""
-
-    gt_depth_scale: float = 256.0
-    """Scale factor for ground truth depth."""
-
-    device: str = "cuda" if torch.cuda.is_available() else "cpu"
-    """Device to run the model on."""
-
-
-class Metric3D(LocalModel):
-    default_config = Metric3DConfig
-    config: Metric3DConfig
-
-    def __init__(self, **kwargs: object) -> None:
-        super().__init__(**kwargs)
-        self.intrinsic = self.config.camera_intrinsics
-        self.intrinsic_scaled: list[float] | None = None
-        self.gt_depth_scale = self.config.gt_depth_scale
-        self.pad_info: list[int] | None = None
-        self.rgb_origin: Any = None
-
-    @cached_property
-    def _model(self) -> Any:
-        model = torch.hub.load(  # type: ignore[no-untyped-call]
-            "yvanyin/metric3d", "metric3d_vit_small", pretrain=True
-        )
-        model = model.to(self.device)
-        model.eval()
-        return model
-
-    """
-    Input: Single image in RGB format
-    Output: Depth map
-    """
-
-    def update_intrinsic(self, intrinsic):  # type: ignore[no-untyped-def]
-        """
-        Update the intrinsic parameters dynamically.
-        Ensure that the input intrinsic is valid.
-        """
-        if len(intrinsic) != 4:
-            raise ValueError("Intrinsic must be a list or tuple with 4 values: [fx, fy, cx, cy]")
-        self.intrinsic = intrinsic
-        print(f"Intrinsics updated to: {self.intrinsic}")
-
-    def infer_depth(self, img, debug: bool = False):  # type: ignore[no-untyped-def]
-        if debug:
-            print(f"Input image: {img}")
-        try:
-            if isinstance(img, str):
-                print(f"Image type string: {type(img)}")
-                img_data = cv2.imread(img)
-                if img_data is None:
-                    raise ValueError(f"Failed to load image from {img}")
-                self.rgb_origin = img_data[:, :, ::-1]
-            else:
-                # print(f"Image type not string: {type(img)}, cv2 conversion assumed to be handled. If not, this will throw an error")
-                self.rgb_origin = img
-        except Exception as e:
-            print(f"Error parsing into infer_depth: {e}")
-
-        img = self.rescale_input(img, self.rgb_origin)  # type: ignore[no-untyped-call]
-
-        with torch.no_grad():
-            pred_depth, confidence, output_dict = self._model.inference({"input": img})
-
-        # Convert to PIL format
-        depth_image = self.unpad_transform_depth(pred_depth)  # type: ignore[no-untyped-call]
-
-        return depth_image.cpu().numpy()
-
-    def save_depth(self, pred_depth) -> None:  # type: ignore[no-untyped-def]
-        # Save the depth map to a file
-        pred_depth_np = pred_depth.cpu().numpy()
-        output_depth_file = "output_depth_map.png"
-        cv2.imwrite(output_depth_file, pred_depth_np)
-        print(f"Depth map saved to {output_depth_file}")
-
-    # Adjusts input size to fit pretrained ViT model
-    def rescale_input(self, rgb, rgb_origin):  # type: ignore[no-untyped-def]
-        #### ajust input size to fit pretrained model
-        # keep ratio resize
-        input_size = (616, 1064)  # for vit model
-        # input_size = (544, 1216) # for convnext model
-        h, w = rgb_origin.shape[:2]
-        scale = min(input_size[0] / h, input_size[1] / w)
-        rgb = cv2.resize(
-            rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR
-        )
-        # remember to scale intrinsic, hold depth
-        self.intrinsic_scaled = [
-            self.intrinsic[0] * scale,
-            self.intrinsic[1] * scale,
-            self.intrinsic[2] * scale,
-            self.intrinsic[3] * scale,
-        ]
-        # padding to input_size
-        padding = [123.675, 116.28, 103.53]
-        h, w = rgb.shape[:2]
-        pad_h = input_size[0] - h
-        pad_w = input_size[1] - w
-        pad_h_half = pad_h // 2
-        pad_w_half = pad_w // 2
-        rgb = cv2.copyMakeBorder(
-            rgb,
-            pad_h_half,
-            pad_h - pad_h_half,
-            pad_w_half,
-            pad_w - pad_w_half,
-            cv2.BORDER_CONSTANT,
-            value=padding,
-        )
-        self.pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
-
-        #### normalize
-        mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
-        std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
-        rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
-        rgb = torch.div((rgb - mean), std)
-        rgb = rgb[None, :, :, :].to(self.device)
-        return rgb
-
-    def unpad_transform_depth(self, pred_depth):  # type: ignore[no-untyped-def]
-        # un pad
-        pred_depth = pred_depth.squeeze()
-        pred_depth = pred_depth[
-            self.pad_info[0] : pred_depth.shape[0] - self.pad_info[1],  # type: ignore[index]
-            self.pad_info[2] : pred_depth.shape[1] - self.pad_info[3],  # type: ignore[index]
-        ]
-
-        # upsample to original size
-        pred_depth = torch.nn.functional.interpolate(
-            pred_depth[None, None, :, :],
-            self.rgb_origin.shape[:2],
-            mode="bilinear",
-        ).squeeze()
-        ###################### canonical camera space ######################
-
-        #### de-canonical transform
-        canonical_to_real_scale = (
-            self.intrinsic_scaled[0] / 1000.0  # type: ignore[index]
-        )  # 1000.0 is the focal length of canonical camera
-        pred_depth = pred_depth * canonical_to_real_scale  # now the depth is metric
-        pred_depth = torch.clamp(pred_depth, 0, 1000)
-        return pred_depth
-
-    def eval_predicted_depth(self, depth_file, pred_depth) -> None:  # type: ignore[no-untyped-def]
-        if depth_file is not None:
-            gt_depth_np = cv2.imread(depth_file, -1)
-            if gt_depth_np is None:
-                raise ValueError(f"Failed to load depth file from {depth_file}")
-            gt_depth_scaled = gt_depth_np / self.gt_depth_scale
-            gt_depth = torch.from_numpy(gt_depth_scaled).float().to(self.device)
-            assert gt_depth.shape == pred_depth.shape
-
-            mask = gt_depth > 1e-8  # type: ignore[operator]
-            abs_rel_err = (torch.abs(pred_depth[mask] - gt_depth[mask]) / gt_depth[mask]).mean()  # type: ignore[index]
-            print("abs_rel_err:", abs_rel_err.item())
diff --git a/dimos/models/depth/test_metric3d.py b/dimos/models/depth/test_metric3d.py
deleted file mode 100644
index 33e39f6a29..0000000000
--- a/dimos/models/depth/test_metric3d.py
+++ /dev/null
@@ -1,102 +0,0 @@
-from contextlib import contextmanager
-
-import numpy as np
-import pytest
-
-from dimos.models.depth.metric3d import Metric3D
-from dimos.msgs.sensor_msgs import Image
-from dimos.utils.data import get_data
-
-
-@contextmanager
-def skip_xformers_unsupported():
-    try:
-        yield
-    except NotImplementedError as e:
-        if "memory_efficient_attention" in str(e):
-            pytest.skip(f"xformers not supported on this GPU: {e}")
-        raise
-
-
-@pytest.fixture
-def sample_intrinsics() -> list[float]:
-    """Sample camera intrinsics [fx, fy, cx, cy]."""
-    return [500.0, 500.0, 320.0, 240.0]
-
-@pytest.mark.cuda
-@pytest.mark.gpu
-def test_metric3d_init(sample_intrinsics: list[float]) -> None:
-    """Test Metric3D initialization."""
-    model = Metric3D(camera_intrinsics=sample_intrinsics)
-    assert model.config.camera_intrinsics == sample_intrinsics
-    assert model.config.gt_depth_scale == 256.0
-    assert model.device == "cuda"
-
-
-@pytest.mark.gpu
-def test_metric3d_update_intrinsic(sample_intrinsics: list[float]) -> None:
-    """Test updating camera intrinsics."""
-    model = Metric3D(camera_intrinsics=sample_intrinsics)
-
-    new_intrinsics = [600.0, 600.0, 400.0, 300.0]
-    model.update_intrinsic(new_intrinsics)
-    assert model.intrinsic == new_intrinsics
-
-@pytest.mark.gpu
-def test_metric3d_update_intrinsic_invalid(sample_intrinsics: list[float]) -> None:
-    """Test that invalid intrinsics raise an error."""
-    model = Metric3D(camera_intrinsics=sample_intrinsics)
-
-    with pytest.raises(ValueError, match="Intrinsic must be a list"):
-        model.update_intrinsic([1.0, 2.0])  # Only 2 values
-
-
-@pytest.mark.cuda
-@pytest.mark.gpu
-def test_metric3d_infer_depth(sample_intrinsics: list[float]) -> None:
-    """Test depth inference on a sample image."""
-    model = Metric3D(camera_intrinsics=sample_intrinsics)
-    model.start()
-
-    # Load test image
-    image = Image.from_file(get_data("cafe.jpg")).to_rgb()
-    rgb_array = image.data
-
-    # Run inference
-    with skip_xformers_unsupported():
-        depth_map = model.infer_depth(rgb_array)
-
-    # Verify output
-    assert isinstance(depth_map, np.ndarray)
-    assert depth_map.shape[:2] == rgb_array.shape[:2]  # Same spatial dimensions
-    assert depth_map.dtype in [np.float32, np.float64]
-    assert depth_map.min() >= 0  # Depth should be non-negative
-
-    print(f"Depth map shape: {depth_map.shape}")
-    print(f"Depth range: [{depth_map.min():.2f}, {depth_map.max():.2f}]")
-
-    model.stop()
-
-
-@pytest.mark.cuda
-@pytest.mark.gpu
-def test_metric3d_multiple_inferences(sample_intrinsics: list[float]) -> None:
-    """Test multiple depth inferences."""
-    model = Metric3D(camera_intrinsics=sample_intrinsics)
-    model.start()
-
-    image = Image.from_file(get_data("cafe.jpg")).to_rgb()
-    rgb_array = image.data
-
-    # Run multiple inferences
-    depths = []
-    for _ in range(3):
-        with skip_xformers_unsupported():
-            depth = model.infer_depth(rgb_array)
-        depths.append(depth)
-
-    # Results should be consistent
-    for i in range(1, len(depths)):
-        assert np.allclose(depths[0], depths[i], rtol=1e-5)
-
-    model.stop()
diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py
index 19d7e7db29..bdfd98cd17 100644
--- a/dimos/robot/all_blueprints.py
+++ b/dimos/robot/all_blueprints.py
@@ -95,7 +95,6 @@
     "cost_mapper": "dimos.mapping.costmapper",
     "demo_calculator_skill": "dimos.agents.skills.demo_calculator_skill",
     "demo_robot": "dimos.agents.skills.demo_robot",
-    "depth_module": "dimos.robot.unitree.depth_module",
     "detection3d_module": "dimos.perception.detection.module3D",
     "detection_db_module": "dimos.perception.detection.moduleDB",
     "fastlio2_module": "dimos.hardware.sensors.lidar.fastlio2.module",
diff --git a/dimos/robot/drone/README.md b/dimos/robot/drone/README.md
index 6e8ceb4d63..100e2deadd 100644
--- a/dimos/robot/drone/README.md
+++ b/dimos/robot/drone/README.md
@@ -126,7 +126,7 @@ DJI Drone ← Wireless → DJI Controller ← USB → Android Device ← WiFi 
 ```
 drone.py                    # Main orchestrator
 ├── connection_module.py    # MAVLink communication & skills
-├── camera_module.py        # Video processing & depth estimation
+├── camera_module.py        # Video processing
 ├── tracking_module.py      # Visual servoing & object tracking
 ├── mavlink_connection.py   # Low-level MAVLink protocol
 └── dji_video_stream.py     # GStreamer video capture
@@ -242,13 +242,6 @@ drone.start()
 - **ROS/DimOS**: X=Forward, Y=Left, Z=Up
 - Automatic conversion handled internally
 
-### Depth Estimation
-Camera module can generate depth maps using Metric3D:
-```python
-# Depth published to /drone/depth and /drone/pointcloud
-# Requires GPU with 8GB+ VRAM
-```
-
 ### Foxglove Visualization
 Connect Foxglove Studio to `ws://localhost:8765` to see:
 - Live video with tracking overlay
diff --git a/dimos/robot/drone/camera_module.py b/dimos/robot/drone/camera_module.py
index 8ba88fd028..248b1ceb6e 100644
--- a/dimos/robot/drone/camera_module.py
+++ b/dimos/robot/drone/camera_module.py
@@ -15,7 +15,7 @@
 
 # Copyright 2025-2026 Dimensional Inc.
 
-"""Camera module for drone with depth estimation."""
+"""Camera module for drone."""
 
 import threading
 import time
@@ -25,9 +25,8 @@
 
 from dimos.core import In, Module, Out, rpc
 from dimos.msgs.geometry_msgs import PoseStamped
-from dimos.msgs.sensor_msgs import Image, ImageFormat
+from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.std_msgs import Header
-from dimos.perception.common.utils import colorize_depth
 from dimos.utils.logging_config import setup_logger
 
 logger = setup_logger()
@@ -35,15 +34,13 @@
 
 class DroneCameraModule(Module):
     """
-    Camera module for drone that processes RGB images to generate depth using Metric3D.
+    Camera module for drone
 
     Subscribes to:
         - /video: RGB camera images from drone
 
     Publishes:
         - /drone/color_image: RGB camera images
-        - /drone/depth_image: Depth images from Metric3D
-        - /drone/depth_colorized: Colorized depth
         - /drone/camera_info: Camera calibration
         - /drone/camera_pose: Camera pose from TF
     """
@@ -53,8 +50,6 @@ class DroneCameraModule(Module):
 
     # Outputs
     color_image: Out[Image]
-    depth_image: Out[Image]
-    depth_colorized: Out[Image]
     camera_info: Out[CameraInfo]
     camera_pose: Out[PoseStamped]
 
@@ -64,7 +59,6 @@ def __init__(
         world_frame_id: str = "world",
         camera_frame_id: str = "camera_link",
         base_frame_id: str = "base_link",
-        gt_depth_scale: float = 2.0,
         **kwargs: Any,
     ) -> None:
         """Initialize drone camera module.
@@ -73,7 +67,6 @@ def __init__(
             camera_intrinsics: [fx, fy, cx, cy]
             camera_frame_id: TF frame for camera
             base_frame_id: TF frame for drone base
-            gt_depth_scale: Depth scale factor
         """
         super().__init__(**kwargs)
 
@@ -84,10 +77,6 @@ def __init__(
         self.camera_frame_id = camera_frame_id
         self.base_frame_id = base_frame_id
         self.world_frame_id = world_frame_id
-        self.gt_depth_scale = gt_depth_scale
-
-        # Metric3D for depth
-        self.metric3d: Any = None  # Lazy-loaded Metric3D model
 
         # Processing state
         self._running = False
@@ -104,7 +93,6 @@ def start(self) -> None:
             logger.warning("Camera module already running")
             return
 
-        # Start processing thread for depth (which will init Metric3D and handle video)
         self._running = True
         self._stop_processing.clear()
         self._processing_thread = threading.Thread(target=self._processing_loop, daemon=True)
@@ -121,22 +109,9 @@ def _on_video_frame(self, frame: Image) -> None:
         # Publish color image immediately
         self.color_image.publish(frame)
 
-        # Store for depth processing
         self._latest_frame = frame
 
     def _processing_loop(self) -> None:
-        """Process depth estimation in background."""
-        # Initialize Metric3D in the background thread
-        if self.metric3d is None:
-            try:
-                from dimos.models.depth.metric3d import Metric3D
-
-                self.metric3d = Metric3D(camera_intrinsics=self.camera_intrinsics)
-                logger.info("Metric3D initialized")
-            except Exception as e:
-                logger.warning(f"Metric3D not available: {e}")
-                self.metric3d = None
-
         # Subscribe to video once connection is available
         subscribed = False
         while not subscribed and not self._stop_processing.is_set():
@@ -151,12 +126,10 @@ def _processing_loop(self) -> None:
                 logger.debug(f"Waiting for video connection: {e}")
                 time.sleep(0.1)
 
-        logger.info("Depth processing loop started")
-
         _reported_error = False
 
         while not self._stop_processing.is_set():
-            if self._latest_frame is not None and self.metric3d is not None:
+            if self._latest_frame is not None:
                 try:
                     frame = self._latest_frame
                     self._latest_frame = None
@@ -164,34 +137,9 @@ def _processing_loop(self) -> None:
                     # Get numpy array from Image
                     img_array = frame.data
 
-                    # Generate depth
-                    depth_array = self.metric3d.infer_depth(img_array) / self.gt_depth_scale
-
                     # Create header
                     header = Header(self.camera_frame_id)
 
-                    # Publish depth
-                    depth_msg = Image(
-                        data=depth_array,
-                        format=ImageFormat.DEPTH,
-                        frame_id=header.frame_id,
-                        ts=header.ts,
-                    )
-                    self.depth_image.publish(depth_msg)
-
-                    # Publish colorized depth
-                    depth_colorized_array = colorize_depth(
-                        depth_array, max_depth=10.0, overlay_stats=True
-                    )
-                    if depth_colorized_array is not None:
-                        depth_colorized_msg = Image(
-                            data=depth_colorized_array,
-                            format=ImageFormat.RGB,
-                            frame_id=header.frame_id,
-                            ts=header.ts,
-                        )
-                        self.depth_colorized.publish(depth_colorized_msg)
-
                     # Publish camera info
                     self._publish_camera_info(header, img_array.shape)
 
@@ -201,12 +149,10 @@ def _processing_loop(self) -> None:
                 except Exception as e:
                     if not _reported_error:
                         _reported_error = True
-                        logger.error(f"Error processing depth: {e}")
+                        logger.error(f"Error processing frame: {e}")
             else:
                 time.sleep(0.01)
 
-        logger.info("Depth processing loop stopped")
-
     def _publish_camera_info(self, header: Header, shape: tuple[int, ...]) -> None:
         """Publish camera calibration info."""
         try:
@@ -279,8 +225,4 @@ def stop(self) -> None:
         if self._processing_thread and self._processing_thread.is_alive():
             self._processing_thread.join(timeout=2.0)
 
-        # Cleanup Metric3D
-        if self.metric3d:
-            self.metric3d.cleanup()
-
         logger.info("Camera module stopped")
diff --git a/dimos/robot/drone/drone.py b/dimos/robot/drone/drone.py
index 8e72d56ed1..6b9500804f 100644
--- a/dimos/robot/drone/drone.py
+++ b/dimos/robot/drone/drone.py
@@ -51,7 +51,7 @@
 
 
 class Drone(Robot):
-    """Generic MAVLink-based drone with video and depth capabilities."""
+    """Generic MAVLink-based drone with video capabilities."""
 
     def __init__(
         self,
@@ -164,8 +164,6 @@ def _deploy_camera(self) -> None:
 
         # Configure LCM transports
         self.camera.color_image.transport = core.LCMTransport("/drone/color_image", Image)
-        self.camera.depth_image.transport = core.LCMTransport("/drone/depth_image", Image)
-        self.camera.depth_colorized.transport = core.LCMTransport("/drone/depth_colorized", Image)
         self.camera.camera_info.transport = core.LCMTransport("/drone/camera_info", CameraInfo)
         self.camera.camera_pose.transport = core.LCMTransport("/drone/camera_pose", PoseStamped)
 
diff --git a/dimos/robot/unitree/depth_module.py b/dimos/robot/unitree/depth_module.py
deleted file mode 100644
index 07f065caea..0000000000
--- a/dimos/robot/unitree/depth_module.py
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2025-2026 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import threading
-import time
-
-from dimos_lcm.sensor_msgs import CameraInfo
-import numpy as np
-
-from dimos.core import In, Module, Out, rpc
-from dimos.core.global_config import GlobalConfig
-from dimos.msgs.sensor_msgs import Image, ImageFormat
-from dimos.utils.logging_config import setup_logger
-
-logger = setup_logger()
-
-
-class DepthModule(Module):
-    """
-    Depth module for Unitree Go2 that processes RGB images to generate depth using Metric3D.
-
-    Subscribes to:
-        - /go2/color_image: RGB camera images from Unitree
-        - /go2/camera_info: Camera calibration information
-
-    Publishes:
-        - /go2/depth_image: Depth images generated by Metric3D
-    """
-
-    # LCM inputs
-    color_image: In[Image]
-    camera_info: In[CameraInfo]
-
-    # LCM outputs
-    depth_image: Out[Image]
-
-    def __init__(  # type: ignore[no-untyped-def]
-        self,
-        gt_depth_scale: float = 0.5,
-        cfg: GlobalConfig | None = None,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize Depth Module.
-
-        Args:
-            gt_depth_scale: Ground truth depth scaling factor
-        """
-        super().__init__(**kwargs)
-
-        self.camera_intrinsics = None
-        self.gt_depth_scale = gt_depth_scale
-        self.metric3d = None
-        self._camera_info_received = False
-
-        # Processing state
-        self._running = False
-        self._latest_frame = None
-        self._last_image = None
-        self._last_timestamp = None
-        self._last_depth = None
-        self._cannot_process_depth = False
-
-        # Threading
-        self._processing_thread: threading.Thread | None = None
-        self._stop_processing = threading.Event()
-
-        if cfg:
-            if cfg.simulation:
-                self.gt_depth_scale = 1.0
-
-    @rpc
-    def start(self) -> None:
-        super().start()
-
-        if self._running:
-            logger.warning("Camera module already running")
-            return
-
-        # Set running flag before starting
-        self._running = True
-
-        # Subscribe to video and camera info inputs
-        self.color_image.subscribe(self._on_video)
-        self.camera_info.subscribe(self._on_camera_info)
-
-        # Start processing thread
-        self._start_processing_thread()
-
-        logger.info("Depth module started")
-
-    @rpc
-    def stop(self) -> None:
-        if not self._running:
-            return
-
-        self._running = False
-        self._stop_processing.set()
-
-        # Wait for thread to finish
-        if self._processing_thread and self._processing_thread.is_alive():
-            self._processing_thread.join(timeout=2.0)
-
-        super().stop()
-
-    def _on_camera_info(self, msg: CameraInfo) -> None:
-        """Process camera info to extract intrinsics."""
-        if self.metric3d is not None:
-            return  # Already initialized
-
-        try:
-            # Extract intrinsics from camera matrix K
-            K = msg.K
-            fx = K[0]
-            fy = K[4]
-            cx = K[2]
-            cy = K[5]
-
-            self.camera_intrinsics = [fx, fy, cx, cy]  # type: ignore[assignment]
-
-            # Initialize Metric3D with camera intrinsics
-            from dimos.models.depth.metric3d import Metric3D
-
-            self.metric3d = Metric3D(camera_intrinsics=self.camera_intrinsics)  # type: ignore[assignment]
-            self._camera_info_received = True
-
-            logger.info(
-                f"Initialized Metric3D with intrinsics from camera_info: {self.camera_intrinsics}"
-            )
-
-        except Exception as e:
-            logger.error(f"Error processing camera info: {e}")
-
-    def _on_video(self, msg: Image) -> None:
-        """Store latest video frame for processing."""
-        if not self._running:
-            return
-
-        # Simply store the latest frame - processing happens in main loop
-        self._latest_frame = msg  # type: ignore[assignment]
-        logger.debug(
-            f"Received video frame: format={msg.format}, shape={msg.data.shape if hasattr(msg.data, 'shape') else 'unknown'}"
-        )
-
-    def _start_processing_thread(self) -> None:
-        """Start the processing thread."""
-        self._stop_processing.clear()
-        self._processing_thread = threading.Thread(target=self._main_processing_loop, daemon=True)
-        self._processing_thread.start()
-        logger.info("Started depth processing thread")
-
-    def _main_processing_loop(self) -> None:
-        """Main processing loop that continuously processes latest frames."""
-        logger.info("Starting main processing loop")
-
-        while not self._stop_processing.is_set():
-            # Process latest frame if available
-            if self._latest_frame is not None:
-                try:
-                    msg = self._latest_frame
-                    self._latest_frame = None  # Clear to avoid reprocessing
-                    # Store for publishing
-                    self._last_image = msg.data
-                    self._last_timestamp = msg.ts if msg.ts else time.time()
-                    # Process depth
-                    self._process_depth(self._last_image)
-
-                except Exception as e:
-                    logger.error(f"Error in main processing loop: {e}", exc_info=True)
-            else:
-                # Small sleep to avoid busy waiting
-                time.sleep(0.001)
-
-        logger.info("Main processing loop stopped")
-
-    def _process_depth(self, img_array: np.ndarray) -> None:  # type: ignore[type-arg]
-        """Process depth estimation using Metric3D."""
-        if self._cannot_process_depth:
-            self._last_depth = None
-            return
-
-        # Wait for camera info to initialize Metric3D
-        if self.metric3d is None:
-            logger.debug("Waiting for camera_info to initialize Metric3D")
-            return
-
-        try:
-            logger.debug(f"Processing depth for image shape: {img_array.shape}")
-
-            # Generate depth map
-            depth_array = self.metric3d.infer_depth(img_array) * self.gt_depth_scale
-
-            self._last_depth = depth_array
-            logger.debug(f"Generated depth map shape: {depth_array.shape}")
-
-            self._publish_depth()
-
-        except Exception as e:
-            logger.error(f"Error processing depth: {e}")
-            self._cannot_process_depth = True
-
-    def _publish_depth(self) -> None:
-        """Publish depth image."""
-        if not self._running:
-            return
-
-        try:
-            # Publish depth image
-            if self._last_depth is not None:
-                # Convert depth to uint16 (millimeters) for more efficient storage
-                # Clamp to valid range [0, 65.535] meters before converting
-                depth_clamped = np.clip(self._last_depth, 0, 65.535)
-                depth_uint16 = (depth_clamped * 1000).astype(np.uint16)
-                depth_msg = Image(
-                    data=depth_uint16,
-                    format=ImageFormat.DEPTH16,  # Use DEPTH16 format for uint16 depth
-                    frame_id="camera_link",
-                    ts=self._last_timestamp,
-                )
-                self.depth_image.publish(depth_msg)
-                logger.debug(f"Published depth image (uint16): shape={depth_uint16.shape}")
-
-        except Exception as e:
-            logger.error(f"Error publishing depth data: {e}", exc_info=True)
-
-
-depth_module = DepthModule.blueprint
-
-
-__all__ = ["DepthModule", "depth_module"]
diff --git a/dimos/robot/unitree_webrtc/__init__.py b/dimos/robot/unitree_webrtc/__init__.py
index 4524bba226..451aa53128 100644
--- a/dimos/robot/unitree_webrtc/__init__.py
+++ b/dimos/robot/unitree_webrtc/__init__.py
@@ -20,7 +20,6 @@
 
 _ALIAS_MODULES = {
     "demo_error_on_name_conflicts": "dimos.robot.unitree.demo_error_on_name_conflicts",
-    "depth_module": "dimos.robot.unitree.depth_module",
     "keyboard_teleop": "dimos.robot.unitree.keyboard_teleop",
     "mujoco_connection": "dimos.robot.unitree.mujoco_connection",
     "type": "dimos.robot.unitree.type",
diff --git a/dimos/types/manipulation.py b/dimos/types/manipulation.py
index 507b9e9b85..76ad7979f2 100644
--- a/dimos/types/manipulation.py
+++ b/dimos/types/manipulation.py
@@ -80,7 +80,7 @@ class ObjectData(TypedDict, total=False):
     # Basic detection information
     object_id: int  # Unique ID for the object
     bbox: list[float]  # Bounding box [x1, y1, x2, y2]
-    depth: float  # Depth in meters from Metric3d
+    depth: float  # Depth in meters
     confidence: float  # Detection confidence
     class_id: int  # Class ID from the detector
     label: str  # Semantic label (e.g., 'cup', 'table')
diff --git a/onnx/metric3d_vit_small.onnx b/onnx/metric3d_vit_small.onnx
deleted file mode 100644
index bfddd41628..0000000000
--- a/onnx/metric3d_vit_small.onnx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:14805174265dd721ac3b396bd5ee7190c708cec41150ed298267f6c3126bc060
-size 151333865