diff --git a/SKILL.md b/SKILL.md index 050f881..b9dce91 100644 --- a/SKILL.md +++ b/SKILL.md @@ -65,9 +65,12 @@ When used for **privacy mode**, the `depth_only` blend mode fully anonymizes the | Platform | Backend | Runtime | Model | |----------|---------|---------|-------| | **macOS** | CoreML | Apple Neural Engine | `apple/coreml-depth-anything-v2-small` (.mlpackage) | -| Linux/Windows | PyTorch | CUDA / CPU | `depth-anything/Depth-Anything-V2-Small` (.pth) | +| Windows/Linux | **TensorRT** | NVIDIA TRT FP16 | Auto-built `.trt` engine (from ONNX) | +| Windows/Linux | PyTorch | CUDA / CPU | `depth-anything/Depth-Anything-V2-Small` (.pth) | -On macOS, CoreML runs on the Neural Engine, leaving the GPU free for other tasks. The model is auto-downloaded from HuggingFace and stored at `~/.aegis-ai/models/feature-extraction/`. +On macOS, CoreML runs on the Neural Engine, leaving the GPU free for other tasks. +On Windows/Linux with NVIDIA GPUs, TensorRT FP16 provides 2-4x speedup over vanilla PyTorch CUDA. Engines are auto-built on first run and cached at `~/.aegis-ai/models/feature-extraction/trt_engines/`. +The model is auto-downloaded from HuggingFace and stored at `~/.aegis-ai/models/feature-extraction/`. 
@echo off
REM deploy.bat — Windows deployment for Depth Estimation (Privacy) skill
REM Creates venv, installs PyTorch + CUDA dependencies, verifies GPU detection.
REM
REM The Aegis deployment agent calls this on Windows instead of deploy.sh.
REM
REM Exit codes:
REM   0  environment is usable (TensorRT is optional and never changes the code)
REM   1  venv creation, dependency install, or the torch import check failed

setlocal enabledelayedexpansion

set "SCRIPT_DIR=%~dp0"
set "VENV_DIR=%SCRIPT_DIR%.venv"
set "PYTHON=%VENV_DIR%\Scripts\python.exe"

echo === Depth Estimation (Privacy) — Windows Setup ===

REM ── Create venv ───────────────────────────────────────────────────
REM Key on python.exe instead of the directory so a half-created venv is repaired.
if not exist "%PYTHON%" (
    echo Creating virtual environment...
    python -m venv "%VENV_DIR%"
    if errorlevel 1 (
        echo ERROR: Failed to create virtual environment. Is Python installed?
        exit /b 1
    )
)

REM Upgrade pip. This must go through "python -m pip": on Windows pip.exe
REM cannot replace itself while it is running, so "pip.exe install --upgrade pip"
REM aborts without upgrading anything.
"%PYTHON%" -m pip install --upgrade pip --quiet
if errorlevel 1 echo WARNING: Could not upgrade pip - continuing with the existing version.

echo.
echo === Windows — PyTorch backend (CUDA/CPU) ===
echo Installing PyTorch dependencies...
REM --ignore-requires-python is required by requirements.txt: the
REM depth-anything-v2 wheel declares python_requires but is pure Python.
"%PYTHON%" -m pip install --quiet --ignore-requires-python -r "%SCRIPT_DIR%requirements.txt"

if errorlevel 1 (
    echo ERROR: pip install failed. Check requirements.txt and network connectivity.
    exit /b 1
)

echo [OK] PyTorch dependencies installed

REM ── Verify installation ───────────────────────────────────────────
"%PYTHON%" -c "import torch, cv2, numpy, PIL; from depth_anything_v2.dpt import DepthAnythingV2; cuda = 'YES' if torch.cuda.is_available() else 'NO'; gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'; print(f'[OK] Verified: torch={torch.__version__}, CUDA={cuda}, GPU={gpu}')"

if errorlevel 1 (
    echo WARNING: Verification failed. Some packages may not be installed correctly.
    echo Trying minimal verification...
    "%PYTHON%" -c "import torch; print(f'torch={torch.__version__}, CUDA={torch.cuda.is_available()}')"
    if errorlevel 1 (
        echo ERROR: torch could not be imported. The environment is not usable.
        exit /b 1
    )
)

REM ── TensorRT verification (optional) ──────────────────────────────
echo.
echo === TensorRT Check ===
"%PYTHON%" -c "import tensorrt; print(f'[OK] TensorRT={tensorrt.__version__}')" 2>nul
if errorlevel 1 (
    echo [INFO] TensorRT not available — will use PyTorch CUDA backend
    echo [INFO] To enable TensorRT: pip install tensorrt
) else (
    echo [OK] TensorRT FP16 acceleration available
)

echo.
echo === Setup complete ===

endlocal
REM Explicit success code: without it the ERRORLEVEL of the optional TensorRT
REM probe above would leak out and make a successful setup look like a failure.
exit /b 0
estimation — balanced speed and accuracy", + "platforms": { + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Base", + "format": "pth", + "variants": { + "depth_anything_v2_vitb": { + "precision": "float32", + "size_mb": 390, + "description": "PyTorch ViT-B — CUDA/CPU/MPS compatible" + } + } + }, + "linux": { + "repository": "depth-anything/Depth-Anything-V2-Base", + "format": "pth", + "variants": { + "depth_anything_v2_vitb": { + "precision": "float32", + "size_mb": 390, + "description": "PyTorch ViT-B — CUDA/CPU compatible" + } + } + } + } + }, + "depth-anything-v2-large": { + "name": "Depth Anything V2 Large", + "type": "depth_estimation", + "input_size": [518, 392], + "description": "Monocular depth estimation — highest accuracy, more VRAM required", + "platforms": { + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Large", + "format": "pth", + "variants": { + "depth_anything_v2_vitl": { + "precision": "float32", + "size_mb": 1340, + "description": "PyTorch ViT-L — CUDA recommended (1.3 GB)" + } + } + }, + "linux": { + "repository": "depth-anything/Depth-Anything-V2-Large", + "format": "pth", + "variants": { + "depth_anything_v2_vitl": { + "precision": "float32", + "size_mb": 1340, + "description": "PyTorch ViT-L — CUDA recommended (1.3 GB)" + } + } + } + } + } + } +} diff --git a/skills/transformation/depth-estimation/requirements.txt b/skills/transformation/depth-estimation/requirements.txt index 2717a00..a31ba0d 100644 --- a/skills/transformation/depth-estimation/requirements.txt +++ b/skills/transformation/depth-estimation/requirements.txt @@ -1,10 +1,17 @@ # Depth Estimation — Privacy Transform Skill # CoreML-first on macOS (Neural Engine), PyTorch fallback on other platforms. # +# INSTALL WITH: pip install --ignore-requires-python -r requirements.txt +# The depth-anything-v2 PyPI wheel declares python_requires>=3.12 in its +# metadata, but is pure Python (py3-none-any) and works on Python 3.11+. 
+# # macOS: coremltools loads .mlpackage models — fast, leaves GPU free. # Other: PyTorch + depth-anything-v2 pip package + HF weights. # Common: opencv, numpy, pillow, huggingface_hub for model download. +# ── CUDA (NVIDIA systems get CUDA wheels, CPU-only falls back) ─────── +--extra-index-url https://download.pytorch.org/whl/cu126 + # ── CoreML (macOS only) ────────────────────────────────────────────── coremltools>=8.0; sys_platform == "darwin" @@ -20,3 +27,10 @@ numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 matplotlib>=3.7.0 + +# ── TensorRT acceleration (optional, NVIDIA only) ─────────────────── +# Provides ~2-4x speedup over vanilla PyTorch CUDA via FP16 inference. +# Requires NVIDIA GPU with Compute Capability >= 7.0 (Turing+). +# If not installed, the skill gracefully falls back to PyTorch. +tensorrt>=10.0; sys_platform != "darwin" +onnxruntime-gpu>=1.17.0; sys_platform != "darwin" diff --git a/skills/transformation/depth-estimation/scripts/benchmark.py b/skills/transformation/depth-estimation/scripts/benchmark.py new file mode 100644 index 0000000..bbcba12 --- /dev/null +++ b/skills/transformation/depth-estimation/scripts/benchmark.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +""" +Depth Estimation Benchmark — Cross-platform performance measurement. + +Measures inference latency for depth-estimation models across CoreML (macOS) +and PyTorch (CUDA/MPS/CPU). Outputs JSON results compatible with +Aegis DepthVisionStudio's benchmark UI. + +Usage: + python benchmark.py --variant depth_anything_v2_vits --runs 10 --device auto + python benchmark.py --variant DepthAnythingV2SmallF16 --runs 5 --compute-units cpu_and_ne + +The --compute-units flag is macOS/CoreML only (maps to coremltools.ComputeUnit). +On other platforms, use --device to select cuda/cpu/mps. 
+""" + +import sys +import os +import json +import time +import argparse +import tempfile +import statistics +from pathlib import Path + +# Add parent for transform imports +_script_dir = Path(__file__).resolve().parent +sys.path.insert(0, str(_script_dir)) + + +def _log(msg): + """Log to stderr (not captured by Aegis JSON parser).""" + print(f"[Benchmark] {msg}", file=sys.stderr, flush=True) + + +def _download_test_image(url, dest_path): + """Download a test image from URL.""" + try: + from urllib.request import urlretrieve + _log(f"Downloading test image: {url}") + urlretrieve(url, dest_path) + return True + except Exception as e: + _log(f"Download failed: {e}") + return False + + +def _get_test_image(test_image_url): + """Get or download a test image, returns path or None.""" + # Check for cached test image + cache_dir = Path.home() / ".aegis-ai" / "tmp" / "benchmark" + cache_dir.mkdir(parents=True, exist_ok=True) + cached = cache_dir / "test_image.jpg" + + if cached.exists(): + return str(cached) + + if test_image_url: + if _download_test_image(test_image_url, str(cached)): + return str(cached) + + # Generate a synthetic test image as fallback + try: + import numpy as np + import cv2 + _log("Generating synthetic 640x480 test image") + img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) + cv2.imwrite(str(cached), img) + return str(cached) + except ImportError: + _log("ERROR: cv2/numpy not available — cannot generate test image") + return None + + +def _resolve_device(device_pref, compute_units): + """ + Resolve the compute device from CLI args. + + Priority: + 1. Explicit --device (cuda, cpu, mps) + 2. Map --compute-units to device (CoreML-specific, macOS only) + 3. 
def run_benchmark(args):
    """Run the depth-estimation benchmark and print one JSON result on stdout.

    The test image is loaded, a model is loaded (CoreML on macOS when the
    variant is a CoreML variant and its ``.mlpackage`` is present, otherwise
    PyTorch), and ``args.runs`` timed inferences are executed. Each timed run
    covers preprocessing, inference, normalisation, colour mapping and the
    resize back to the source resolution.

    Every exit path prints exactly one JSON object: either the result
    dictionary or ``{"error": ...}``. Progress messages go to stderr via
    ``_log`` so they do not pollute the JSON stream.

    Args:
        args: Parsed CLI namespace providing ``variant``, ``runs``,
            ``colormap``, ``device``, ``compute_units`` and ``test_image_url``.

    Returns:
        None. Results are reported on stdout only.
    """
    import base64
    import platform as plat

    import cv2
    import numpy as np

    variant_id = args.variant
    num_runs = args.runs
    colormap_name = args.colormap
    device = _resolve_device(args.device, args.compute_units)
    test_image_url = args.test_image_url

    _log(f"Benchmark: variant={variant_id}, runs={num_runs}, device={device}, colormap={colormap_name}")

    # Colormap lookup. Use OpenCV's own constants: the previously hard-coded
    # integers were wrong for several names (inferno was 1 = BONE, plasma was
    # 13 = MAGMA, magma was 12 = PARULA, turbo was 18 = TWILIGHT), so the
    # benchmark preview did not match the requested colormap.
    colormap_map = {
        "inferno": cv2.COLORMAP_INFERNO,
        "viridis": cv2.COLORMAP_VIRIDIS,
        "plasma": cv2.COLORMAP_PLASMA,
        "magma": cv2.COLORMAP_MAGMA,
        "jet": cv2.COLORMAP_JET,
        "turbo": cv2.COLORMAP_TURBO,
        "hot": cv2.COLORMAP_HOT,
        "cool": cv2.COLORMAP_COOL,
    }
    colormap_id = colormap_map.get(colormap_name, cv2.COLORMAP_VIRIDIS)

    # Load test image
    test_image_path = _get_test_image(test_image_url)
    if not test_image_path:
        result = {"error": "Could not obtain test image"}
        print(json.dumps(result))
        return

    image = cv2.imread(test_image_path)
    if image is None:
        result = {"error": f"Failed to read test image: {test_image_path}"}
        print(json.dumps(result))
        return

    _log(f"Test image: {image.shape[1]}x{image.shape[0]}")

    # Determine backend and load model
    is_mac = plat.system() == "Darwin"
    backend = None
    model = None
    model_load_ms = 0.0

    # Try CoreML first on macOS (if variant looks like a CoreML model)
    coreml_variants = {"DepthAnythingV2SmallF16", "DepthAnythingV2SmallF16INT8", "DepthAnythingV2SmallF32"}
    if is_mac and variant_id in coreml_variants:
        try:
            import coremltools as ct
            from PIL import Image  # noqa: F401 — fail early if Pillow is missing

            models_dir = Path.home() / ".aegis-ai" / "models" / "feature-extraction"
            model_path = models_dir / f"{variant_id}.mlpackage"

            if model_path.exists():
                _log(f"Loading CoreML model: {model_path}")
                t0 = time.perf_counter()

                # Map compute_units string to coremltools enum
                cu_str = args.compute_units or "all"
                cu_enum_map = {
                    "all": ct.ComputeUnit.ALL,
                    "cpu": ct.ComputeUnit.CPU_ONLY,
                    "cpu_and_ne": ct.ComputeUnit.CPU_AND_NE,
                    "npu": ct.ComputeUnit.CPU_AND_NE,
                    "cpu_npu": ct.ComputeUnit.CPU_AND_NE,
                    "gpu": ct.ComputeUnit.CPU_AND_GPU,
                }
                cu_enum = cu_enum_map.get(cu_str, ct.ComputeUnit.ALL)

                model = ct.models.MLModel(str(model_path), compute_units=cu_enum)
                model_load_ms = (time.perf_counter() - t0) * 1000
                backend = "coreml"
                _log(f"CoreML model loaded in {model_load_ms:.0f}ms (compute_units={cu_str})")
            else:
                _log(f"CoreML model not found at {model_path}, falling back to PyTorch")
        except Exception as e:
            _log(f"CoreML load failed: {e}, trying PyTorch")

    # PyTorch fallback
    if model is None:
        try:
            import torch
            from depth_anything_v2.dpt import DepthAnythingV2
            from huggingface_hub import hf_hub_download

            # Map variant_id to PyTorch config
            pytorch_configs = {
                "depth_anything_v2_vits": {
                    "encoder": "vits", "features": 64,
                    "out_channels": [48, 96, 192, 384],
                    "repo": "depth-anything/Depth-Anything-V2-Small",
                    "filename": "depth_anything_v2_vits.pth",
                },
                "depth_anything_v2_vitb": {
                    "encoder": "vitb", "features": 128,
                    "out_channels": [96, 192, 384, 768],
                    "repo": "depth-anything/Depth-Anything-V2-Base",
                    "filename": "depth_anything_v2_vitb.pth",
                },
                "depth_anything_v2_vitl": {
                    "encoder": "vitl", "features": 256,
                    "out_channels": [256, 512, 1024, 1024],
                    "repo": "depth-anything/Depth-Anything-V2-Large",
                    "filename": "depth_anything_v2_vitl.pth",
                },
            }

            # Also accept CoreML variant names and map to PyTorch
            coreml_to_pytorch = {
                "DepthAnythingV2SmallF16": "depth_anything_v2_vits",
                "DepthAnythingV2SmallF16INT8": "depth_anything_v2_vits",
                "DepthAnythingV2SmallF32": "depth_anything_v2_vits",
            }

            pytorch_variant = coreml_to_pytorch.get(variant_id, variant_id)
            cfg = pytorch_configs.get(pytorch_variant)
            if not cfg:
                result = {"error": f"Unknown variant: {variant_id}. Available: {list(pytorch_configs.keys())}"}
                print(json.dumps(result))
                return

            _log(f"Loading PyTorch model: {pytorch_variant} on {device}")
            t0 = time.perf_counter()

            weights_path = hf_hub_download(cfg["repo"], cfg["filename"])
            model = DepthAnythingV2(
                encoder=cfg["encoder"],
                features=cfg["features"],
                out_channels=cfg["out_channels"],
            )
            model.load_state_dict(torch.load(weights_path, map_location=device, weights_only=True))
            model.to(device)
            model.eval()

            model_load_ms = (time.perf_counter() - t0) * 1000
            backend = "pytorch"
            _log(f"PyTorch model loaded in {model_load_ms:.0f}ms on {device}")

        except Exception as e:
            result = {"error": f"Failed to load model: {e}"}
            print(json.dumps(result))
            return

    # ── Run benchmark ──────────────────────────────────────────────

    timings = []
    errors = 0
    extraction_data = None

    for i in range(num_runs):
        try:
            t0 = time.perf_counter()

            if backend == "coreml":
                from PIL import Image as PILImage
                # CoreML inference
                input_w, input_h = 518, 392
                rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                resized = cv2.resize(rgb, (input_w, input_h))
                pil_img = PILImage.fromarray(resized, mode="RGB")
                prediction = model.predict({"image": pil_img})
                output_key = list(prediction.keys())[0]
                depth_map = np.array(prediction[output_key])
                if depth_map.ndim > 2:
                    depth_map = np.squeeze(depth_map)
            else:
                # PyTorch inference
                import torch
                rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                with torch.no_grad():
                    depth_map = model.infer_image(rgb)

            # Normalize to 0..255 and colorize; the epsilon guards against a
            # constant depth map (d_max == d_min).
            d_min, d_max = depth_map.min(), depth_map.max()
            depth_norm = ((depth_map - d_min) / (d_max - d_min + 1e-8) * 255).astype(np.uint8)
            depth_colored = cv2.applyColorMap(depth_norm, colormap_id)
            depth_colored = cv2.resize(depth_colored, (image.shape[1], image.shape[0]))

            elapsed_ms = (time.perf_counter() - t0) * 1000
            timings.append(elapsed_ms)

            # Capture extraction result from last run (outside the timed span)
            if i == num_runs - 1:
                _, buf = cv2.imencode(".jpg", depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
                extraction_data = base64.b64encode(buf).decode("ascii")

            _log(f"  Run {i+1}/{num_runs}: {elapsed_ms:.1f}ms")

        except Exception as e:
            _log(f"  Run {i+1}/{num_runs}: ERROR — {e}")
            errors += 1

    # ── Build results ──────────────────────────────────────────────

    if not timings:
        result = {"error": f"All {num_runs} runs failed"}
        print(json.dumps(result))
        return

    avg = statistics.mean(timings)
    fps = 1000.0 / avg if avg > 0 else 0

    result = {
        "model_id": variant_id,
        "variant_id": variant_id,
        "backend": backend,
        "device": device,
        "num_runs": num_runs,
        "successful_runs": len(timings),
        "errors": errors,
        "avg_time_ms": round(avg, 2),
        "min_time_ms": round(min(timings), 2),
        "max_time_ms": round(max(timings), 2),
        # stdev needs at least two samples
        "std_time_ms": round(statistics.stdev(timings), 2) if len(timings) > 1 else 0.0,
        "fps": round(fps, 2),
        "model_load_ms": round(model_load_ms, 1),
        "image_size": f"{image.shape[1]}x{image.shape[0]}",
        "colormap": colormap_name,
    }

    # Include depth map preview from last run
    if extraction_data:
        result["extraction_result"] = {
            "success": True,
            "feature_type": "depth_estimation",
            "feature_data": extraction_data,
            "processing_time": round(timings[-1], 2),
            "metadata": {
                "backend": backend,
                "device": device,
                "colormap": colormap_name,
            },
        }

    _log(f"Results: {avg:.1f}ms avg, {fps:.1f} FPS, {len(timings)}/{num_runs} successful")

    # Output JSON on stdout (Aegis parses the last line)
    print(json.dumps(result))
transform.py --config config.json """ @@ -70,6 +72,9 @@ # Where Aegis DepthVisionStudio stores downloaded models MODELS_DIR = Path.home() / ".aegis-ai" / "models" / "feature-extraction" +# TensorRT engine cache directory (engines are GPU-specific) +TRT_CACHE_DIR = MODELS_DIR / "trt_engines" + # PyTorch model configs (fallback on non-macOS) PYTORCH_CONFIGS = { "depth-anything-v2-small": { @@ -110,6 +115,13 @@ def __init__(self): self.opacity = 0.5 self.blend_mode = "depth_only" # Default for privacy: depth_only anonymizes self._coreml_input_size = COREML_INPUT_SIZE + # TensorRT state (populated by _load_tensorrt) + self._trt_context = None + self._trt_input_name = None + self._trt_output_name = None + self._trt_input_tensor = None + self._trt_output_tensor = None + self._trt_stream = None def parse_extra_args(self, parser: argparse.ArgumentParser): parser.add_argument("--model", type=str, default="depth-anything-v2-small", @@ -117,6 +129,9 @@ def parse_extra_args(self, parser: argparse.ArgumentParser): "depth-anything-v2-large"]) parser.add_argument("--variant", type=str, default=DEFAULT_COREML_VARIANT, help="CoreML variant ID (macOS only)") + parser.add_argument("--backend", type=str, default="auto", + choices=["auto", "tensorrt", "pytorch", "coreml"], + help="Inference backend (auto = TRT if available, else PyTorch)") parser.add_argument("--colormap", type=str, default="inferno", choices=list(COLORMAP_MAP.keys())) parser.add_argument("--blend-mode", type=str, default="depth_only", @@ -137,6 +152,15 @@ def load_model(self, config: dict) -> dict: except Exception as e: _log(f"CoreML load failed ({e}), falling back to PyTorch", self._tag) + # Try TensorRT on Windows/Linux with CUDA (if available) + backend_pref = config.get("backend", "auto") + if backend_pref in ("auto", "tensorrt") and self.device == "cuda": + try: + info = self._load_tensorrt(model_name, config) + return info + except Exception as e: + _log(f"TensorRT load failed ({e}), falling back to PyTorch", 
self._tag) + # Fallback: PyTorch return self._load_pytorch(model_name, config) @@ -196,6 +220,155 @@ def _download_coreml_model(self, variant_id: str): _log(f"CoreML model download failed: {e}", self._tag) raise + # ── TensorRT backend (Windows/Linux NVIDIA) ─────────────────────── + + def _load_tensorrt(self, model_name: str, config: dict) -> dict: + """Load or build a TensorRT FP16 engine for fastest NVIDIA inference.""" + import torch + import tensorrt as trt + + _log(f"Attempting TensorRT FP16 for {model_name}", self._tag) + + cfg = PYTORCH_CONFIGS.get(model_name) + if not cfg: + raise ValueError(f"Unknown model: {model_name}") + + # Engine filename includes GPU name to avoid cross-GPU issues + gpu_tag = torch.cuda.get_device_name(0).replace(" ", "_").lower() + engine_path = TRT_CACHE_DIR / f"{cfg['filename'].replace('.pth', '')}_fp16_{gpu_tag}.trt" + + if engine_path.exists(): + _log(f"Loading cached TRT engine: {engine_path}", self._tag) + engine = self._deserialize_engine(engine_path) + else: + _log("No cached engine — building from ONNX (this takes 30-120s)...", self._tag) + engine = self._build_trt_engine(model_name, cfg, engine_path) + + if engine is None: + raise RuntimeError("TensorRT engine build/load failed") + + # Create execution context and pre-allocate buffers + self._trt_context = engine.create_execution_context() + self._trt_input_name = engine.get_tensor_name(0) + self._trt_output_name = engine.get_tensor_name(1) + + # Pre-allocate a reference input to set shapes (1, 3, 518, 518) + input_shape = engine.get_tensor_shape(self._trt_input_name) + fixed_shape = tuple(1 if d == -1 else d for d in input_shape) + self._trt_context.set_input_shape(self._trt_input_name, fixed_shape) + + # Pre-allocate GPU tensors + self._trt_input_tensor = torch.zeros(fixed_shape, dtype=torch.float32, device="cuda") + actual_out_shape = self._trt_context.get_tensor_shape(self._trt_output_name) + self._trt_output_tensor = torch.empty(list(actual_out_shape), 
dtype=torch.float32, device="cuda") + + # Set tensor addresses + self._trt_context.set_tensor_address(self._trt_input_name, self._trt_input_tensor.data_ptr()) + self._trt_context.set_tensor_address(self._trt_output_name, self._trt_output_tensor.data_ptr()) + self._trt_stream = torch.cuda.current_stream().cuda_stream + + self.backend = "tensorrt" + _log(f"TensorRT FP16 engine ready: {engine_path.name}", self._tag) + return { + "model": model_name, + "device": "cuda", + "blend_mode": self.blend_mode, + "colormap": config.get("colormap", "inferno"), + "backend": "tensorrt", + "engine": engine_path.name, + } + + def _build_trt_engine(self, model_name: str, cfg: dict, engine_path: Path): + """Export PyTorch → ONNX → build TRT FP16 engine → serialize.""" + import torch + import tensorrt as trt + from depth_anything_v2.dpt import DepthAnythingV2 + from huggingface_hub import hf_hub_download + + # Load PyTorch model temporarily for ONNX export + weights_path = hf_hub_download(cfg["repo"], cfg["filename"]) + pt_model = DepthAnythingV2( + encoder=cfg["encoder"], features=cfg["features"], + out_channels=cfg["out_channels"], + ) + pt_model.load_state_dict(torch.load(weights_path, map_location="cuda", weights_only=True)) + pt_model.to("cuda").eval() + + # Create dummy input and export to ONNX + dummy = torch.randn(1, 3, 518, 518, device="cuda") + onnx_path = TRT_CACHE_DIR / f"{cfg['filename'].replace('.pth', '')}.onnx" + TRT_CACHE_DIR.mkdir(parents=True, exist_ok=True) + + _log(f"Exporting ONNX: {onnx_path.name}", self._tag) + torch.onnx.export( + pt_model, dummy, str(onnx_path), + input_names=["input"], output_names=["depth"], + dynamic_axes={"input": {0: "batch"}, "depth": {0: "batch"}}, + opset_version=17, + ) + + # Free PyTorch model — no longer needed + del pt_model + torch.cuda.empty_cache() + + # Build TRT engine + logger = trt.Logger(trt.Logger.WARNING) + builder = trt.Builder(logger) + network = builder.create_network(1 << 
int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + parser = trt.OnnxParser(network, logger) + + _log("Parsing ONNX for TensorRT...", self._tag) + with open(str(onnx_path), "rb") as f: + if not parser.parse(f.read()): + for i in range(parser.num_errors): + _log(f" ONNX parse error: {parser.get_error(i)}", self._tag) + return None + + config = builder.create_builder_config() + config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1GB + config.set_flag(trt.BuilderFlag.FP16) + + # Handle dynamic batch dimension + inp = network.get_input(0) + if any(d == -1 for d in inp.shape): + profile = builder.create_optimization_profile() + fixed = tuple(1 if d == -1 else d for d in inp.shape) + profile.set_shape(inp.name, fixed, fixed, fixed) + config.add_optimization_profile(profile) + + _log("Building TRT FP16 engine (this is slow, ~30-120s)...", self._tag) + serialized = builder.build_serialized_network(network, config) + if serialized is None: + _log("TRT engine build failed!", self._tag) + return None + + # TRT 10.15+ returns IHostMemory, not raw bytes — convert + engine_bytes = bytes(serialized) + + # Serialize to disk for future starts + engine_path.parent.mkdir(parents=True, exist_ok=True) + with open(str(engine_path), "wb") as f: + f.write(engine_bytes) + _log(f"Engine serialized: {engine_path} ({len(engine_bytes) / 1e6:.1f} MB)", self._tag) + + # Clean up ONNX (no longer needed) + try: + onnx_path.unlink() + except OSError: + pass + + runtime = trt.Runtime(logger) + return runtime.deserialize_cuda_engine(engine_bytes) + + @staticmethod + def _deserialize_engine(engine_path: Path): + """Load a previously serialized TRT engine from disk.""" + import tensorrt as trt + logger = trt.Logger(trt.Logger.WARNING) + runtime = trt.Runtime(logger) + with open(str(engine_path), "rb") as f: + return runtime.deserialize_cuda_engine(f.read()) + # ── PyTorch backend (fallback) ──────────────────────────────────── def _load_pytorch(self, model_name: str, config: 
dict) -> dict: @@ -242,6 +415,8 @@ def transform_frame(self, image, metadata: dict): if self.backend == "coreml": depth_colored = self._infer_coreml(image) + elif self.backend == "tensorrt": + depth_colored = self._infer_tensorrt(image) else: depth_colored = self._infer_pytorch(image) @@ -254,6 +429,43 @@ def transform_frame(self, image, metadata: dict): return output + def _infer_tensorrt(self, image): + """Run TensorRT FP16 inference and return colorized depth map.""" + import torch + import cv2 + import numpy as np + + original_h, original_w = image.shape[:2] + rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Preprocess: resize → normalize → NCHW tensor (same as PyTorch path) + resized = cv2.resize(rgb, (518, 518), interpolation=cv2.INTER_LINEAR) + img_float = resized.astype(np.float32) / 255.0 + # ImageNet normalization + mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) + std = np.array([0.229, 0.224, 0.225], dtype=np.float32) + img_float = (img_float - mean) / std + img_nchw = np.transpose(img_float, (2, 0, 1))[np.newaxis] # (1, 3, 518, 518) + + # Copy to pre-allocated GPU tensor + self._trt_input_tensor.copy_(torch.from_numpy(img_nchw)) + + # Execute + self._trt_context.execute_async_v3(self._trt_stream) + torch.cuda.synchronize() + + # Read output + depth = self._trt_output_tensor.cpu().numpy() + depth = np.squeeze(depth) + + # Normalize → colormap → resize + d_min, d_max = depth.min(), depth.max() + depth_norm = ((depth - d_min) / (d_max - d_min + 1e-8) * 255).astype(np.uint8) + depth_colored = cv2.applyColorMap(depth_norm, self.colormap_id) + depth_colored = cv2.resize(depth_colored, (original_w, original_h)) + + return depth_colored + def _infer_coreml(self, image): """Run CoreML inference and return colorized depth map (BGR, original size).""" import cv2 diff --git a/skills/transformation/depth-estimation/scripts/transform_base.py b/skills/transformation/depth-estimation/scripts/transform_base.py index 48f251a..73178e5 100644 --- 
#!/usr/bin/env python3
"""
Native TensorRT Benchmark for Depth Anything V2.

Builds a TensorRT engine from ONNX, benchmarks FP32 and FP16,
and compares against vanilla PyTorch CUDA and ONNX CUDA.

Progress is logged to stderr; a JSON summary is printed to stdout.
"""
import json
import os
import statistics
import sys
import time
from pathlib import Path

# Make sibling modules (e.g. depth_anything_v2) importable when run as a script.
_script_dir = Path(__file__).resolve().parent
sys.path.insert(0, str(_script_dir))

_TEST_IMAGE_URL = "https://ultralytics.com/images/bus.jpg"


def _log(msg):
    """Write a tagged progress message to stderr (stdout is reserved for JSON)."""
    print(f"[TRT] {msg}", file=sys.stderr, flush=True)


def get_test_image():
    """Return the path of the cached benchmark image, downloading it if needed.

    The download goes to a temporary ``.part`` file and is moved into place
    only on success, so an interrupted transfer can never leave a truncated
    file that later runs would mistake for a valid cache entry.
    """
    cache = Path.home() / ".aegis-ai" / "tmp" / "benchmark" / "test_image.jpg"
    cache.parent.mkdir(parents=True, exist_ok=True)
    if cache.exists():
        return str(cache)

    from urllib.request import urlretrieve

    _log("Downloading test image...")
    partial = cache.with_name(cache.name + ".part")
    try:
        urlretrieve(_TEST_IMAGE_URL, str(partial))
        os.replace(str(partial), str(cache))
    finally:
        # Only present if the download or the rename failed.
        if partial.exists():
            partial.unlink()
    return str(cache)


def stats(times, label):
    """Summarize a list of per-run latencies (milliseconds).

    Returns a dict with run count, avg/min/max/std in ms (rounded to two
    decimals) and the derived FPS. ``times`` must be non-empty;
    ``statistics.mean`` raises ``StatisticsError`` otherwise.
    """
    avg = statistics.mean(times)
    spread = statistics.stdev(times) if len(times) > 1 else 0
    return {
        "label": label,
        "runs": len(times),
        "avg_ms": round(avg, 2),
        "min_ms": round(min(times), 2),
        "max_ms": round(max(times), 2),
        "std_ms": round(spread, 2) if len(times) > 1 else 0,
        "fps": round(1000 / avg, 2) if avg > 0 else 0,
    }


def build_trt_engine(onnx_path, fp16=False):
    """Build a TensorRT engine from an ONNX model.

    Args:
        onnx_path: Path to the ONNX file.
        fp16: Enable the FP16 builder flag when True.

    Returns:
        ``(engine, build_time_ms)`` on success, ``None`` if parsing, building
        or deserialization fails.
    """
    import tensorrt as trt

    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)

    # Request an explicit-batch network where the flag exists; newer TensorRT
    # releases deprecate it because every network is explicit-batch.
    explicit_batch = getattr(trt.NetworkDefinitionCreationFlag, "EXPLICIT_BATCH", None)
    flags = (1 << int(explicit_batch)) if explicit_batch is not None else 0
    network = builder.create_network(flags)
    parser = trt.OnnxParser(network, logger)

    _log(f" Parsing ONNX: {onnx_path}")
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                _log(f" ONNX Parse Error: {parser.get_error(i)}")
            return None

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB

    # Dynamic dimensions need an optimization profile. Every dynamic axis is
    # pinned to 1 (min = opt = max), i.e. the engine is specialized for batch=1.
    inp = network.get_input(0)
    if any(d == -1 for d in inp.shape):
        profile = builder.create_optimization_profile()
        fixed_shape = tuple(1 if d == -1 else d for d in inp.shape)
        profile.set_shape(inp.name, fixed_shape, fixed_shape, fixed_shape)
        config.add_optimization_profile(profile)
        _log(f" Set optimization profile: {fixed_shape}")

    if fp16:
        config.set_flag(trt.BuilderFlag.FP16)
        _log(" Building TRT engine (FP16)...")
    else:
        _log(" Building TRT engine (FP32)...")

    t0 = time.perf_counter()
    serialized = builder.build_serialized_network(network, config)
    build_time = (time.perf_counter() - t0) * 1000

    if serialized is None:
        _log(" Engine build failed!")
        return None

    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized)
    if engine is None:
        # Without this check a (None, build_time) tuple would look like success.
        _log(" Engine deserialization failed!")
        return None

    _log(f" Engine built in {build_time:.0f}ms")
    return engine, build_time


def bench_trt_engine(engine, input_data, num_runs=15, warmup=5, label="TRT"):
    """Benchmark a TensorRT engine with a single input and a single output.

    Args:
        engine: Deserialized TensorRT engine.
        input_data: NumPy array fed to the engine's input tensor.
        num_runs: Number of timed executions.
        warmup: Number of untimed executions run first.
        label: Tag used in per-run log lines.

    Returns:
        List of per-run latencies in milliseconds, or ``None`` if execution
        fails (the error and traceback are logged to stderr).
    """
    import tensorrt as trt
    import numpy as np

    try:
        # torch is used only for device memory, streams and synchronization.
        import torch

        context = engine.create_execution_context()

        # Resolve I/O tensors by mode instead of assuming index 0 is the
        # input and index 1 the output.
        input_name = output_name = None
        for idx in range(engine.num_io_tensors):
            name = engine.get_tensor_name(idx)
            if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                if input_name is None:
                    input_name = name
            elif output_name is None:
                output_name = name
        if input_name is None or output_name is None:
            raise RuntimeError("engine must expose at least one input and one output tensor")

        _log(f" Input: {input_name} {list(engine.get_tensor_shape(input_name))}")
        _log(f" Output: {output_name} {list(engine.get_tensor_shape(output_name))}")

        # TensorRT reads raw device pointers, so the buffer must be contiguous.
        input_tensor = torch.from_numpy(np.ascontiguousarray(input_data)).cuda()

        # Fix dynamic dims on the context, then ask for the resolved output shape.
        context.set_input_shape(input_name, tuple(input_data.shape))
        actual_out_shape = context.get_tensor_shape(output_name)
        # Scratch buffer only — the benchmark never reads the output values.
        output_tensor = torch.empty(list(actual_out_shape), dtype=torch.float32, device="cuda")

        context.set_tensor_address(input_name, input_tensor.data_ptr())
        context.set_tensor_address(output_name, output_tensor.data_ptr())

        stream = torch.cuda.current_stream().cuda_stream

        for _ in range(warmup):
            context.execute_async_v3(stream)
            torch.cuda.synchronize()

        times = []
        for i in range(num_runs):
            torch.cuda.synchronize()
            t0 = time.perf_counter()
            context.execute_async_v3(stream)
            torch.cuda.synchronize()
            elapsed = (time.perf_counter() - t0) * 1000
            times.append(elapsed)
            _log(f" [{label}] Run {i+1}/{num_runs}: {elapsed:.1f}ms")

        return times

    except Exception as e:
        _log(f" Engine execution error: {e}")
        import traceback
        traceback.print_exc(file=sys.stderr)
        return None


def _bench_trt_variant(onnx_path, input_np, fp16, label, runs, warmup):
    """Build and benchmark one TensorRT precision variant; return its result dict."""
    try:
        built = build_trt_engine(str(onnx_path), fp16=fp16)
        if not built:
            return {"label": label, "error": "engine build failed"}
        engine, build_ms = built
        times = bench_trt_engine(engine, input_np, runs, warmup, label.replace("TensorRT", "TRT"))
        if not times:
            return {"label": label, "error": "execution failed"}
        result = stats(times, label)
        result["build_ms"] = round(build_ms, 0)
        return result
    except Exception as e:
        _log(f" FAILED: {e}")
        return {"label": label, "error": str(e)[:100]}


def main():
    """Run the PyTorch / ONNX Runtime / TensorRT comparison and print JSON results."""
    import torch, cv2

    if not torch.cuda.is_available():
        _log("ERROR: CUDA is not available; this benchmark requires an NVIDIA GPU.")
        sys.exit(1)

    device = "cuda"
    N, W = 15, 5  # timed runs, warmup runs

    _log(f"PyTorch {torch.__version__}, CUDA {torch.version.cuda}")
    _log(f"GPU: {torch.cuda.get_device_name(0)}")

    import tensorrt as trt
    _log(f"TensorRT: {trt.__version__}")

    # Load image and model
    image_path = get_test_image()
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        _log(f"ERROR: could not decode test image: {image_path}")
        sys.exit(1)
    _log(f"Image: {image.shape[1]}x{image.shape[0]}")

    from depth_anything_v2.dpt import DepthAnythingV2
    from huggingface_hub import hf_hub_download
    cfg = {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384],
           "repo": "depth-anything/Depth-Anything-V2-Small", "filename": "depth_anything_v2_vits.pth"}
    weights = hf_hub_download(cfg["repo"], cfg["filename"])
    model = DepthAnythingV2(encoder=cfg["encoder"], features=cfg["features"], out_channels=cfg["out_channels"])
    model.load_state_dict(torch.load(weights, map_location=device, weights_only=True))
    model.to(device).eval()

    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    img_tensor, _ = model.image2tensor(rgb, input_size=518)
    img_tensor = img_tensor.to(device)
    _log(f"Tensor: {img_tensor.shape}")

    results = []

    # 1. PyTorch CUDA baseline
    _log("\n== 1. PyTorch CUDA ==")
    for _ in range(W):
        with torch.no_grad():
            model.forward(img_tensor)
            torch.cuda.synchronize()
    times = []
    for i in range(N):
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        with torch.no_grad():
            model.forward(img_tensor)
        torch.cuda.synchronize()
        times.append((time.perf_counter() - t0) * 1000)
        _log(f" [PyTorch] Run {i+1}/{N}: {times[-1]:.1f}ms")
    results.append(stats(times, "PyTorch CUDA"))

    # 2. ONNX CUDA
    _log("\n== 2. ONNX CUDA ==")
    onnx_path = Path.home() / ".aegis-ai" / "tmp" / "benchmark" / "dav2_small.onnx"
    onnx_path.parent.mkdir(parents=True, exist_ok=True)
    input_np = img_tensor.cpu().numpy()

    # Export separately from the ONNX Runtime benchmark so that a failed
    # export is known and the TensorRT stages do not run against a stale or
    # missing file.
    export_error = None
    try:
        _log(" Exporting ONNX...")
        torch.onnx.export(model, img_tensor, str(onnx_path),
                          input_names=["input"], output_names=["depth"],
                          dynamic_axes={"input": {0: "batch"}, "depth": {0: "batch"}}, opset_version=17)
    except Exception as e:
        _log(f" FAILED: {e}")
        export_error = str(e)[:100]

    if export_error is not None:
        results.append({"label": "ONNX CUDA", "error": export_error})
    else:
        try:
            import onnxruntime as ort
            sess = ort.InferenceSession(str(onnx_path), providers=["CUDAExecutionProvider"])
            # onnxruntime may fall back to CPU when the CUDA provider cannot
            # load; do not report CPU timings under a CUDA label.
            if "CUDAExecutionProvider" not in sess.get_providers():
                raise RuntimeError(f"CUDAExecutionProvider unavailable (active: {sess.get_providers()})")
            in_name = sess.get_inputs()[0].name
            for _ in range(W):
                sess.run(None, {in_name: input_np})
            times = []
            for i in range(N):
                t0 = time.perf_counter()
                sess.run(None, {in_name: input_np})
                times.append((time.perf_counter() - t0) * 1000)
                _log(f" [ONNX CUDA] Run {i+1}/{N}: {times[-1]:.1f}ms")
            results.append(stats(times, "ONNX CUDA"))
        except Exception as e:
            _log(f" FAILED: {e}")
            results.append({"label": "ONNX CUDA", "error": str(e)[:100]})

    # 3./4. TensorRT FP32 and FP16
    for step, (trt_label, use_fp16) in enumerate((("TensorRT FP32", False), ("TensorRT FP16", True)), start=3):
        _log(f"\n== {step}. {trt_label} ==")
        if export_error is not None:
            results.append({"label": trt_label, "error": "ONNX export failed"})
        else:
            results.append(_bench_trt_variant(onnx_path, input_np, use_fp16, trt_label, N, W))

    # Summary (speedup is relative to the PyTorch CUDA baseline)
    _log("\n" + "=" * 70)
    _log(f"{'Backend':<22} {'Avg(ms)':>8} {'Min(ms)':>8} {'FPS':>7} {'Speedup':>8}")
    _log("-" * 70)
    base = results[0].get("avg_ms", 1)
    for r in results:
        if "error" in r:
            _log(f"{r['label']:<22} {'FAIL':>8} {r['error'][:45]}")
        else:
            su = base / r["avg_ms"] if r["avg_ms"] > 0 else 0
            _log(f"{r['label']:<22} {r['avg_ms']:>8.1f} {r['min_ms']:>8.1f} {r['fps']:>7.1f} {su:>7.2f}x")

    print(json.dumps({"gpu": torch.cuda.get_device_name(0), "results": results}, indent=2))


if __name__ == "__main__":
    main()