diff --git a/SKILL.md b/SKILL.md
index 050f881..b9dce91 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -65,9 +65,12 @@ When used for **privacy mode**, the `depth_only` blend mode fully anonymizes the
 | Platform | Backend | Runtime | Model |
 |----------|---------|---------|-------|
 | **macOS** | CoreML | Apple Neural Engine | `apple/coreml-depth-anything-v2-small` (.mlpackage) |
-| Linux/Windows | PyTorch | CUDA / CPU | `depth-anything/Depth-Anything-V2-Small` (.pth) |
+| Windows/Linux | **TensorRT** | NVIDIA TRT FP16 | Auto-built `.trt` engine (from ONNX) |
+| Windows/Linux | PyTorch | CUDA / CPU | `depth-anything/Depth-Anything-V2-Small` (.pth) |
 
-On macOS, CoreML runs on the Neural Engine, leaving the GPU free for other tasks. The model is auto-downloaded from HuggingFace and stored at `~/.aegis-ai/models/feature-extraction/`.
+On macOS, CoreML runs on the Neural Engine, leaving the GPU free for other tasks.
+On Windows/Linux with NVIDIA GPUs, TensorRT FP16 provides a 2–4x speedup over vanilla PyTorch CUDA. Engines are auto-built from ONNX on first run (roughly 30–120 s) and then cached per GPU model at `~/.aegis-ai/models/feature-extraction/trt_engines/`.
+The model is auto-downloaded from HuggingFace and stored at `~/.aegis-ai/models/feature-extraction/`.
 
 ## What You Get
 
diff --git a/skills.json b/skills.json
index 3440a5e..8ff6c87 100644
--- a/skills.json
+++ b/skills.json
@@ -137,7 +137,7 @@
       "id": "depth-estimation",
       "name": "Depth Estimation (Privacy)",
       "description": "Privacy-first depth map transforms — anonymize camera feeds with Depth Anything v2 while preserving spatial awareness.",
-      "version": "1.1.0",
+      "version": "1.2.0",
       "category": "privacy",
       "path": "skills/transformation/depth-estimation",
       "tags": [
diff --git a/skills/transformation/depth-estimation/deploy.bat b/skills/transformation/depth-estimation/deploy.bat
new file mode 100644
index 0000000..b553907
--- /dev/null
+++ b/skills/transformation/depth-estimation/deploy.bat
@@ -0,0 +1,66 @@
+@echo off
+REM deploy.bat — Windows deployment for Depth Estimation (Privacy) skill
+REM Creates venv, installs PyTorch + CUDA dependencies, verifies GPU and (optional) TensorRT detection.
+REM
+REM The Aegis deployment agent calls this on Windows instead of deploy.sh.
+
+setlocal enabledelayedexpansion
+
+set "SCRIPT_DIR=%~dp0"
+set "VENV_DIR=%SCRIPT_DIR%.venv"
+set "MODELS_DIR=%USERPROFILE%\.aegis-ai\models\feature-extraction"
+
+echo === Depth Estimation (Privacy) — Windows Setup ===
+
+REM ── Create venv ───────────────────────────────────────────────────
+if not exist "%VENV_DIR%" (
+    echo Creating virtual environment...
+    python -m venv "%VENV_DIR%"
+    if errorlevel 1 (
+        echo ERROR: Failed to create virtual environment. Is Python installed?
+        exit /b 1
+    )
+)
+
+set "PIP=%VENV_DIR%\Scripts\pip.exe"
+set "PYTHON=%VENV_DIR%\Scripts\python.exe"
+
+REM Upgrade pip through "python -m pip": on Windows pip.exe cannot replace itself while running
+"%PYTHON%" -m pip install --upgrade pip --quiet
+REM requirements.txt must be installed with --ignore-requires-python (see the note in its header)
+echo.
+echo === Windows — PyTorch backend (CUDA/CPU) ===
+echo Installing PyTorch dependencies...
+"%PIP%" install --quiet --ignore-requires-python -r "%SCRIPT_DIR%requirements.txt"
+
+if errorlevel 1 (
+    echo ERROR: pip install failed. Check requirements.txt and network connectivity.
+    exit /b 1
+)
+
+echo [OK] PyTorch dependencies installed
+
+REM ── Verify installation ───────────────────────────────────────────
+"%PYTHON%" -c "import torch, cv2, numpy, PIL; from depth_anything_v2.dpt import DepthAnythingV2; cuda = 'YES' if torch.cuda.is_available() else 'NO'; gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'; print(f'[OK] Verified: torch={torch.__version__}, CUDA={cuda}, GPU={gpu}')"
+
+if errorlevel 1 (
+    echo WARNING: Verification failed. Some packages may not be installed correctly.
+    echo Trying minimal verification...
+    "%PYTHON%" -c "import torch; print(f'torch={torch.__version__}, CUDA={torch.cuda.is_available()}')"
+)
+
+REM ── TensorRT verification (optional) ──────────────────────────────
+echo.
+echo === TensorRT Check ===
+"%PYTHON%" -c "import tensorrt; print(f'[OK] TensorRT={tensorrt.__version__}')" 2>nul
+if errorlevel 1 (
+    echo [INFO] TensorRT not available — will use PyTorch CUDA backend
+    echo [INFO] To enable TensorRT: pip install tensorrt
+) else (
+    echo [OK] TensorRT FP16 acceleration available
+)
+
+echo.
+echo === Setup complete ===
+
+endlocal
diff --git a/skills/transformation/depth-estimation/models.json b/skills/transformation/depth-estimation/models.json
new file mode 100644
index 0000000..6881abf
--- /dev/null
+++ b/skills/transformation/depth-estimation/models.json
@@ -0,0 +1,123 @@
+{
+  "studio": {
+    "title": "3D Depth Vision Studio",
+    "subtitle": "Convert 2D video to 3D depth maps • Privacy-first scene understanding",
+    "icon": "layers"
+  },
+  "models_dir": "~/.aegis-ai/models/feature-extraction",
+  "models": {
+    "depth-anything-v2-small": {
+      "name": "Depth Anything V2 Small",
+      "type": "depth_estimation",
+      "input_size": [518, 392],
+      "description": "Monocular depth estimation — fast, lightweight model",
+      "platforms": {
+        "darwin": {
+          "repository": "apple/coreml-depth-anything-v2-small",
+          "format": "mlpackage",
+          "variants": {
+            "DepthAnythingV2SmallF16": {
+              "precision": "float16",
+              "size_mb": 49.8,
+              "description": "Float16 — Optimized for Neural Engine"
+            },
+            "DepthAnythingV2SmallF32": {
+              "precision": "float32",
+              "size_mb": 99.2,
+              "description": "Float32 — Highest precision"
+            }
+          }
+        },
+        "win32": {
+          "repository": "depth-anything/Depth-Anything-V2-Small",
+          "format": "pth",
+          "variants": {
+            "depth_anything_v2_vits": {
+              "precision": "float32",
+              "size_mb": 99,
+              "description": "PyTorch ViT-S — CUDA/CPU compatible"
+            },
+            "depth_anything_v2_vits_trt_fp16": {
+              "precision": "float16",
+              "format": "trt",
+              "size_mb": 25,
+              "description": "TensorRT FP16 — Fastest (requires NVIDIA GPU + TensorRT)",
+              "requires": "tensorrt"
+            }
+          }
+        },
+        "linux": {
+          "repository": "depth-anything/Depth-Anything-V2-Small",
+          "format": "pth",
+          "variants": {
+            "depth_anything_v2_vits": {
+              "precision": "float32",
+              "size_mb": 99,
+              "description": "PyTorch ViT-S — CUDA/CPU compatible"
+            }
+          }
+        }
+      }
+    },
+    "depth-anything-v2-base": {
+      "name": "Depth Anything V2 Base",
+      "type": "depth_estimation",
+      "input_size": [518, 392],
+      "description": "Monocular depth estimation — balanced speed and accuracy",
+      "platforms": {
+        "win32": {
+          "repository": "depth-anything/Depth-Anything-V2-Base",
+          "format": "pth",
+          "variants": {
+            "depth_anything_v2_vitb": {
+              "precision": "float32",
+              "size_mb": 390,
+              "description": "PyTorch ViT-B — CUDA/CPU compatible"
+            }
+          }
+        },
+        "linux": {
+          "repository": "depth-anything/Depth-Anything-V2-Base",
+          "format": "pth",
+          "variants": {
+            "depth_anything_v2_vitb": {
+              "precision": "float32",
+              "size_mb": 390,
+              "description": "PyTorch ViT-B — CUDA/CPU compatible"
+            }
+          }
+        }
+      }
+    },
+    "depth-anything-v2-large": {
+      "name": "Depth Anything V2 Large",
+      "type": "depth_estimation",
+      "input_size": [518, 392],
+      "description": "Monocular depth estimation — highest accuracy, more VRAM required",
+      "platforms": {
+        "win32": {
+          "repository": "depth-anything/Depth-Anything-V2-Large",
+          "format": "pth",
+          "variants": {
+            "depth_anything_v2_vitl": {
+              "precision": "float32",
+              "size_mb": 1340,
+              "description": "PyTorch ViT-L — CUDA recommended (1.3 GB)"
+            }
+          }
+        },
+        "linux": {
+          "repository": "depth-anything/Depth-Anything-V2-Large",
+          "format": "pth",
+          "variants": {
+            "depth_anything_v2_vitl": {
+              "precision": "float32",
+              "size_mb": 1340,
+              "description": "PyTorch ViT-L — CUDA recommended (1.3 GB)"
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/skills/transformation/depth-estimation/requirements.txt b/skills/transformation/depth-estimation/requirements.txt
index 2717a00..a31ba0d 100644
--- a/skills/transformation/depth-estimation/requirements.txt
+++ b/skills/transformation/depth-estimation/requirements.txt
@@ -1,10 +1,17 @@
 # Depth Estimation — Privacy Transform Skill
 # CoreML-first on macOS (Neural Engine), PyTorch fallback on other platforms.
 #
+# INSTALL WITH: pip install --ignore-requires-python -r requirements.txt
+# The depth-anything-v2 PyPI wheel declares python_requires>=3.12 in its
+# metadata, but is pure Python (py3-none-any) and works on Python 3.11+.
+#
 # macOS: coremltools loads .mlpackage models — fast, leaves GPU free.
 # Other: PyTorch + depth-anything-v2 pip package + HF weights.
 # Common: opencv, numpy, pillow, huggingface_hub for model download.
 
+# ── CUDA (NVIDIA systems get CUDA wheels, CPU-only falls back) ───────
+--extra-index-url https://download.pytorch.org/whl/cu126
+
 # ── CoreML (macOS only) ──────────────────────────────────────────────
 coremltools>=8.0; sys_platform == "darwin"
 
@@ -20,3 +27,10 @@ numpy>=1.24.0
 opencv-python-headless>=4.8.0
 Pillow>=10.0.0
 matplotlib>=3.7.0
+
+# ── TensorRT acceleration (optional, NVIDIA only) ───────────────────
+# Provides ~2-4x speedup over vanilla PyTorch CUDA via FP16 inference.
+# Requires NVIDIA GPU with Compute Capability >= 7.0 (Volta or newer, e.g. Turing/Ampere).
+# If not installed, the skill gracefully falls back to PyTorch.
+tensorrt>=10.0; sys_platform != "darwin"
+onnxruntime-gpu>=1.17.0; sys_platform != "darwin"
diff --git a/skills/transformation/depth-estimation/scripts/benchmark.py b/skills/transformation/depth-estimation/scripts/benchmark.py
new file mode 100644
index 0000000..bbcba12
--- /dev/null
+++ b/skills/transformation/depth-estimation/scripts/benchmark.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+"""
+Depth Estimation Benchmark — Cross-platform performance measurement.
+
+Measures inference latency for depth-estimation models across CoreML (macOS)
+and PyTorch (CUDA/MPS/CPU). Outputs JSON results compatible with
+Aegis DepthVisionStudio's benchmark UI.
+
+Usage:
+  python benchmark.py --variant depth_anything_v2_vits --runs 10 --device auto
+  python benchmark.py --variant DepthAnythingV2SmallF16 --runs 5 --compute-units cpu_and_ne
+
+The --compute-units flag is macOS/CoreML only (maps to coremltools.ComputeUnit).
+On other platforms, use --device to select cuda/cpu/mps.
+"""
+
+import sys
+import os
+import json
+import time
+import argparse
+import tempfile
+import statistics
+from pathlib import Path
+
+# Put this script's own directory (not its parent) on sys.path so sibling modules such as transform.py import
+_script_dir = Path(__file__).resolve().parent
+sys.path.insert(0, str(_script_dir))
+
+
+def _log(msg):
+    """Write a "[Benchmark]"-prefixed line to stderr, keeping stdout free for the JSON result."""
+    print("[Benchmark] " + str(msg), file=sys.stderr, flush=True)
+
+
+def _download_test_image(url, dest_path):
+    """Download url to dest_path atomically; return True on success, False on any failure."""
+    try:
+        from urllib.request import urlretrieve
+        _log(f"Downloading test image: {url}")
+        # Fetch into "<dest>.part", rename when complete: a partial download must never land at dest_path
+        os.replace(urlretrieve(url, f"{dest_path}.part")[0], dest_path)
+    except Exception as e:
+        _log(f"Download failed: {e}")
+    return os.path.isfile(dest_path)
+
+
+def _get_test_image(test_image_url):
+    """Return the path of a cached, downloaded or synthetic test image, or None if none can be made."""
+    # Check for cached test image (an empty file, e.g. from an aborted write, does not count)
+    cache_dir = Path.home() / ".aegis-ai" / "tmp" / "benchmark"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    cached = cache_dir / "test_image.jpg"
+
+    if cached.is_file() and cached.stat().st_size > 0:
+        return str(cached)
+
+    if test_image_url and _download_test_image(test_image_url, str(cached)):
+        return str(cached)
+
+    # Generate a synthetic test image as fallback
+    try:
+        import numpy as np
+        import cv2
+    except ImportError:
+        _log("ERROR: cv2/numpy not available — cannot generate test image")
+        return None
+    _log("Generating synthetic 640x480 test image")
+    # randint's upper bound is exclusive, so 256 is needed to cover the full uint8 range
+    img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
+    written = cv2.imwrite(str(cached), img)  # False when the file could not be written
+    return str(cached) if written else None
+
+
+def _resolve_device(device_pref, compute_units):
+    """
+    Resolve the torch device string from CLI args.
+
+    Priority:
+      1. Explicit --device (cuda, cpu, mps) is returned unchanged
+      2. --compute-units hint, mapped per platform (GPU is MPS on macOS, CUDA elsewhere)
+      3. Auto-detect: CUDA, then MPS, then "mps" on macOS without torch, else "cpu"
+    """
+    import platform
+
+    is_mac = platform.system() == "Darwin"
+    if device_pref and device_pref != "auto":
+        return device_pref
+
+    # Map compute_units → device hint (for Aegis UI compatibility)
+    if compute_units and compute_units != "all":
+        ne_device = "mps" if is_mac else None  # Neural Engine hints only apply on macOS
+        cu_map = {
+            "gpu": "mps" if is_mac else "cuda",  # macOS has no CUDA; its GPU is reached via MPS
+            "cpu": "cpu",
+            "npu": ne_device, "cpu_npu": ne_device, "cpu_and_ne": ne_device,
+        }
+        mapped = cu_map.get(compute_units)
+        if mapped:  # unknown or inapplicable hints fall through to auto-detection
+            return mapped
+
+    # Auto-detect
+    try:
+        import torch
+        if torch.cuda.is_available():
+            return "cuda"
+        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return "mps"
+    except ImportError:
+        pass
+
+    if is_mac:
+        return "mps"
+
+    return "cpu"
+
+
+def run_benchmark(args):
+    """Load the requested model, time args.runs inferences, and print one JSON result (or error) line."""
+    import cv2
+    import numpy as np
+
+    variant_id = args.variant
+    num_runs = args.runs
+    colormap_name = args.colormap
+    device = _resolve_device(args.device, args.compute_units)
+    test_image_url = args.test_image_url
+
+    _log(f"Benchmark: variant={variant_id}, runs={num_runs}, device={device}, colormap={colormap_name}")
+
+    # Colormap lookup via OpenCV's COLORMAP_* constants; raw ids are easy to mistype (1 is BONE, 18 TWILIGHT)
+    colormap_map = {
+        name: getattr(cv2, f"COLORMAP_{name.upper()}")
+        for name in ("inferno", "viridis", "plasma", "magma", "jet", "turbo", "hot", "cool")
+    }
+    colormap_id = colormap_map.get(colormap_name, cv2.COLORMAP_VIRIDIS)
+
+    # Load test image
+    test_image_path = _get_test_image(test_image_url)
+    if not test_image_path:
+        result = {"error": "Could not obtain test image"}
+        print(json.dumps(result))
+        return
+
+    image = cv2.imread(test_image_path)
+    if image is None:
+        result = {"error": f"Failed to read test image: {test_image_path}"}
+        print(json.dumps(result))
+        return
+
+    _log(f"Test image: {image.shape[1]}x{image.shape[0]}")
+
+    # Determine backend and load model
+    import platform as plat
+    is_mac = plat.system() == "Darwin"
+    backend = None
+    model = None
+    model_load_ms = 0.0
+
+    # Try CoreML first on macOS (if variant looks like a CoreML model)
+    coreml_variants = {"DepthAnythingV2SmallF16", "DepthAnythingV2SmallF16INT8", "DepthAnythingV2SmallF32"}
+    if is_mac and variant_id in coreml_variants:
+        try:
+            import coremltools as ct
+            from PIL import Image
+
+            models_dir = Path.home() / ".aegis-ai" / "models" / "feature-extraction"
+            model_path = models_dir / f"{variant_id}.mlpackage"
+
+            if model_path.exists():
+                _log(f"Loading CoreML model: {model_path}")
+                t0 = time.perf_counter()
+
+                # Map compute_units string to coremltools enum
+                cu_str = args.compute_units or "all"
+                cu_enum_map = {
+                    "all": ct.ComputeUnit.ALL,
+                    "cpu": ct.ComputeUnit.CPU_ONLY,
+                    "cpu_and_ne": ct.ComputeUnit.CPU_AND_NE,
+                    "npu": ct.ComputeUnit.CPU_AND_NE,
+                    "cpu_npu": ct.ComputeUnit.CPU_AND_NE,
+                    "gpu": ct.ComputeUnit.CPU_AND_GPU,
+                }
+                cu_enum = cu_enum_map.get(cu_str, ct.ComputeUnit.ALL)
+
+                model = ct.models.MLModel(str(model_path), compute_units=cu_enum)
+                model_load_ms = (time.perf_counter() - t0) * 1000
+                backend = "coreml"
+                _log(f"CoreML model loaded in {model_load_ms:.0f}ms (compute_units={cu_str})")
+            else:
+                _log(f"CoreML model not found at {model_path}, falling back to PyTorch")
+        except Exception as e:
+            _log(f"CoreML load failed: {e}, trying PyTorch")
+
+    # PyTorch fallback
+    if model is None:
+        try:
+            import torch
+            from depth_anything_v2.dpt import DepthAnythingV2
+            from huggingface_hub import hf_hub_download
+
+            # Map variant_id to PyTorch config
+            pytorch_configs = {
+                "depth_anything_v2_vits": {
+                    "encoder": "vits", "features": 64,
+                    "out_channels": [48, 96, 192, 384],
+                    "repo": "depth-anything/Depth-Anything-V2-Small",
+                    "filename": "depth_anything_v2_vits.pth",
+                },
+                "depth_anything_v2_vitb": {
+                    "encoder": "vitb", "features": 128,
+                    "out_channels": [96, 192, 384, 768],
+                    "repo": "depth-anything/Depth-Anything-V2-Base",
+                    "filename": "depth_anything_v2_vitb.pth",
+                },
+                "depth_anything_v2_vitl": {
+                    "encoder": "vitl", "features": 256,
+                    "out_channels": [256, 512, 1024, 1024],
+                    "repo": "depth-anything/Depth-Anything-V2-Large",
+                    "filename": "depth_anything_v2_vitl.pth",
+                },
+            }
+
+            # Also accept CoreML variant names and map to PyTorch
+            coreml_to_pytorch = {
+                "DepthAnythingV2SmallF16": "depth_anything_v2_vits",
+                "DepthAnythingV2SmallF16INT8": "depth_anything_v2_vits",
+                "DepthAnythingV2SmallF32": "depth_anything_v2_vits",
+            }
+
+            pytorch_variant = coreml_to_pytorch.get(variant_id, variant_id)
+            cfg = pytorch_configs.get(pytorch_variant)
+            if not cfg:
+                result = {"error": f"Unknown variant: {variant_id}. Available: {list(pytorch_configs.keys())}"}
+                print(json.dumps(result))
+                return
+
+            _log(f"Loading PyTorch model: {pytorch_variant} on {device}")
+            t0 = time.perf_counter()
+
+            weights_path = hf_hub_download(cfg["repo"], cfg["filename"])
+            model = DepthAnythingV2(
+                encoder=cfg["encoder"],
+                features=cfg["features"],
+                out_channels=cfg["out_channels"],
+            )
+            model.load_state_dict(torch.load(weights_path, map_location=device, weights_only=True))
+            model.to(device)
+            model.eval()
+
+            model_load_ms = (time.perf_counter() - t0) * 1000
+            backend = "pytorch"
+            _log(f"PyTorch model loaded in {model_load_ms:.0f}ms on {device}")
+
+        except Exception as e:
+            result = {"error": f"Failed to load model: {e}"}
+            print(json.dumps(result))
+            return
+
+    # ── Run benchmark ──────────────────────────────────────────────
+
+    timings = []
+    errors = 0
+    extraction_data = None
+
+    for i in range(num_runs):
+        try:
+            t0 = time.perf_counter()
+
+            if backend == "coreml":
+                from PIL import Image as PILImage
+                # CoreML inference
+                input_w, input_h = 518, 392
+                rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                resized = cv2.resize(rgb, (input_w, input_h))
+                pil_img = PILImage.fromarray(resized, mode="RGB")
+                prediction = model.predict({"image": pil_img})
+                output_key = list(prediction.keys())[0]
+                depth_map = np.array(prediction[output_key])
+                if depth_map.ndim > 2:
+                    depth_map = np.squeeze(depth_map)
+            else:
+                # PyTorch inference
+                import torch
+                rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                with torch.no_grad():
+                    depth_map = model.infer_image(rgb)
+
+            # Normalize and colorize
+            d_min, d_max = depth_map.min(), depth_map.max()
+            depth_norm = ((depth_map - d_min) / (d_max - d_min + 1e-8) * 255).astype(np.uint8)
+            depth_colored = cv2.applyColorMap(depth_norm, colormap_id)
+            depth_colored = cv2.resize(depth_colored, (image.shape[1], image.shape[0]))
+
+            elapsed_ms = (time.perf_counter() - t0) * 1000
+            timings.append(elapsed_ms)
+
+            # Capture extraction result from last run
+            if i == num_runs - 1:
+                import base64
+                _, buf = cv2.imencode(".jpg", depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
+                extraction_data = base64.b64encode(buf).decode("ascii")
+
+            _log(f"  Run {i+1}/{num_runs}: {elapsed_ms:.1f}ms")
+
+        except Exception as e:
+            _log(f"  Run {i+1}/{num_runs}: ERROR — {e}")
+            errors += 1
+
+    # ── Build results ──────────────────────────────────────────────
+
+    if not timings:
+        result = {"error": f"All {num_runs} runs failed"}
+        print(json.dumps(result))
+        return
+
+    avg = statistics.mean(timings)
+    fps = 1000.0 / avg if avg > 0 else 0
+
+    result = {
+        "model_id": variant_id,
+        "variant_id": variant_id,
+        "backend": backend,
+        "device": device,
+        "num_runs": num_runs,
+        "successful_runs": len(timings),
+        "errors": errors,
+        "avg_time_ms": round(avg, 2),
+        "min_time_ms": round(min(timings), 2),
+        "max_time_ms": round(max(timings), 2),
+        "std_time_ms": round(statistics.stdev(timings), 2) if len(timings) > 1 else 0.0,
+        "fps": round(fps, 2),
+        "model_load_ms": round(model_load_ms, 1),
+        "image_size": f"{image.shape[1]}x{image.shape[0]}",
+        "colormap": colormap_name,
+    }
+
+    # Include depth map preview from last run
+    if extraction_data:
+        result["extraction_result"] = {
+            "success": True,
+            "feature_type": "depth_estimation",
+            "feature_data": extraction_data,
+            "processing_time": round(timings[-1], 2),
+            "metadata": {
+                "backend": backend,
+                "device": device,
+                "colormap": colormap_name,
+            },
+        }
+
+    _log(f"Results: {avg:.1f}ms avg, {fps:.1f} FPS, {len(timings)}/{num_runs} successful")
+
+    # Output JSON on stdout (Aegis parses the last line)
+    print(json.dumps(result))
+
+
+def main():
+    """Parse the benchmark command-line options and pass them to run_benchmark."""
+    colormaps = ["inferno", "viridis", "plasma", "magma", "jet", "turbo", "hot", "cool"]
+    cli = argparse.ArgumentParser(description="Depth Estimation Benchmark")
+    add = cli.add_argument
+
+    add("--variant", type=str, required=True,
+        help="Model variant ID (e.g. depth_anything_v2_vits, DepthAnythingV2SmallF16)")
+    add("--runs", type=int, default=10, help="Number of benchmark runs (default: 10)")
+    add("--colormap", type=str, default="viridis", choices=colormaps)
+    add("--device", type=str, default="auto", choices=["auto", "cpu", "cuda", "mps"],
+        help="Compute device (default: auto-detect)")
+    add("--compute-units", type=str, default="all",
+        help="Compute units for CoreML (all, cpu, cpu_and_ne, gpu, npu, cpu_npu)")
+    add("--test-image-url", type=str, default="https://ultralytics.com/images/bus.jpg",
+        help="URL of test image to download")
+
+    args = cli.parse_args()
+    run_benchmark(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/transformation/depth-estimation/scripts/transform.py b/skills/transformation/depth-estimation/scripts/transform.py
index c4013c3..6fcd0da 100644
--- a/skills/transformation/depth-estimation/scripts/transform.py
+++ b/skills/transformation/depth-estimation/scripts/transform.py
@@ -4,6 +4,7 @@
 
 Backend selection:
   macOS  → CoreML (.mlpackage via coremltools) — runs on Neural Engine
+  Win/Linux + NVIDIA + TensorRT → TensorRT FP16 (.trt engine) — fastest
   Other  → PyTorch (depth_anything_v2 pip package + HF weights) — runs on CUDA/MPS/CPU
 
 Implements the TransformSkillBase interface to provide real-time depth map
@@ -12,6 +13,7 @@
 
 Usage:
   python transform.py --model depth-anything-v2-small --device auto
+  python transform.py --model depth-anything-v2-small --backend tensorrt
   python transform.py --config config.json
 """
 
@@ -70,6 +72,9 @@
 # Where Aegis DepthVisionStudio stores downloaded models
 MODELS_DIR = Path.home() / ".aegis-ai" / "models" / "feature-extraction"
 
+# TensorRT engine cache directory (engines are GPU-specific)
+TRT_CACHE_DIR = MODELS_DIR / "trt_engines"
+
 # PyTorch model configs (fallback on non-macOS)
 PYTORCH_CONFIGS = {
     "depth-anything-v2-small": {
@@ -110,6 +115,13 @@ def __init__(self):
         self.opacity = 0.5
         self.blend_mode = "depth_only"  # Default for privacy: depth_only anonymizes
         self._coreml_input_size = COREML_INPUT_SIZE
+        # TensorRT state (populated by _load_tensorrt)
+        self._trt_context = None
+        self._trt_input_name = None
+        self._trt_output_name = None
+        self._trt_input_tensor = None
+        self._trt_output_tensor = None
+        self._trt_stream = None
 
     def parse_extra_args(self, parser: argparse.ArgumentParser):
         parser.add_argument("--model", type=str, default="depth-anything-v2-small",
@@ -117,6 +129,9 @@ def parse_extra_args(self, parser: argparse.ArgumentParser):
                                      "depth-anything-v2-large"])
         parser.add_argument("--variant", type=str, default=DEFAULT_COREML_VARIANT,
                             help="CoreML variant ID (macOS only)")
+        parser.add_argument("--backend", type=str, default="auto",
+                            choices=["auto", "tensorrt", "pytorch", "coreml"],
+                            help="Inference backend (auto = TRT if available, else PyTorch)")
         parser.add_argument("--colormap", type=str, default="inferno",
                             choices=list(COLORMAP_MAP.keys()))
         parser.add_argument("--blend-mode", type=str, default="depth_only",
@@ -137,6 +152,15 @@ def load_model(self, config: dict) -> dict:
             except Exception as e:
                 _log(f"CoreML load failed ({e}), falling back to PyTorch", self._tag)
 
+        # Try TensorRT on Windows/Linux with CUDA (if available)
+        backend_pref = config.get("backend", "auto")
+        if backend_pref in ("auto", "tensorrt") and self.device == "cuda":
+            try:
+                info = self._load_tensorrt(model_name, config)
+                return info
+            except Exception as e:
+                _log(f"TensorRT load failed ({e}), falling back to PyTorch", self._tag)
+
         # Fallback: PyTorch
         return self._load_pytorch(model_name, config)
 
@@ -196,6 +220,155 @@ def _download_coreml_model(self, variant_id: str):
             _log(f"CoreML model download failed: {e}", self._tag)
             raise
 
+    # ── TensorRT backend (Windows/Linux NVIDIA) ───────────────────────
+
+    def _load_tensorrt(self, model_name: str, config: dict) -> dict:
+        """Load the cached TensorRT FP16 engine (rebuilding it if missing or unusable) and bind I/O buffers."""
+        import torch
+        import tensorrt as trt  # noqa: F401 — raises ImportError early when TensorRT is not installed
+
+        _log(f"Attempting TensorRT FP16 for {model_name}", self._tag)
+
+        cfg = PYTORCH_CONFIGS.get(model_name)
+        if not cfg:
+            raise ValueError(f"Unknown model: {model_name}")
+
+        # Engine filename includes GPU name to avoid cross-GPU issues
+        gpu_tag = torch.cuda.get_device_name(0).replace(" ", "_").lower()
+        engine_path = TRT_CACHE_DIR / f"{cfg['filename'].replace('.pth', '')}_fp16_{gpu_tag}.trt"
+
+        engine = None
+        if engine_path.exists():
+            _log(f"Loading cached TRT engine: {engine_path}", self._tag)
+            engine = self._deserialize_engine(engine_path)
+        if engine is None:  # nothing cached, or the cached file no longer deserializes → rebuild
+            _log("No usable cached engine — building from ONNX (this takes 30-120s)...", self._tag)
+            engine = self._build_trt_engine(model_name, cfg, engine_path)
+        if engine is None:
+            raise RuntimeError("TensorRT engine build/load failed")
+
+        # Create execution context and pre-allocate buffers
+        self._trt_context = engine.create_execution_context()
+        self._trt_input_name = engine.get_tensor_name(0)
+        self._trt_output_name = engine.get_tensor_name(1)
+
+        # Pre-allocate a reference input to set shapes (1, 3, 518, 518)
+        input_shape = engine.get_tensor_shape(self._trt_input_name)
+        fixed_shape = tuple(1 if d == -1 else d for d in input_shape)
+        self._trt_context.set_input_shape(self._trt_input_name, fixed_shape)
+
+        # Pre-allocate GPU tensors
+        self._trt_input_tensor = torch.zeros(fixed_shape, dtype=torch.float32, device="cuda")
+        actual_out_shape = self._trt_context.get_tensor_shape(self._trt_output_name)
+        self._trt_output_tensor = torch.empty(list(actual_out_shape), dtype=torch.float32, device="cuda")
+
+        # Set tensor addresses
+        self._trt_context.set_tensor_address(self._trt_input_name, self._trt_input_tensor.data_ptr())
+        self._trt_context.set_tensor_address(self._trt_output_name, self._trt_output_tensor.data_ptr())
+        self._trt_stream = torch.cuda.current_stream().cuda_stream
+
+        self.backend = "tensorrt"
+        _log(f"TensorRT FP16 engine ready: {engine_path.name}", self._tag)
+        return {
+            "model": model_name,
+            "device": "cuda",
+            "blend_mode": self.blend_mode,
+            "colormap": config.get("colormap", "inferno"),
+            "backend": "tensorrt",
+            "engine": engine_path.name,
+        }
+
+    def _build_trt_engine(self, model_name: str, cfg: dict, engine_path: Path):
+        """Export PyTorch → ONNX → build TRT FP16 engine → write it to engine_path; returns the engine, or None on parse/build failure."""
+        import torch
+        import tensorrt as trt
+        from depth_anything_v2.dpt import DepthAnythingV2
+        from huggingface_hub import hf_hub_download
+
+        # Load PyTorch model temporarily for ONNX export (model_name itself is not used in this method)
+        weights_path = hf_hub_download(cfg["repo"], cfg["filename"])
+        pt_model = DepthAnythingV2(
+            encoder=cfg["encoder"], features=cfg["features"],
+            out_channels=cfg["out_channels"],
+        )
+        pt_model.load_state_dict(torch.load(weights_path, map_location="cuda", weights_only=True))
+        pt_model.to("cuda").eval()
+
+        # Create dummy input and export to ONNX (the intermediate file lives in the TRT cache directory)
+        dummy = torch.randn(1, 3, 518, 518, device="cuda")
+        onnx_path = TRT_CACHE_DIR / f"{cfg['filename'].replace('.pth', '')}.onnx"
+        TRT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+        _log(f"Exporting ONNX: {onnx_path.name}", self._tag)
+        torch.onnx.export(
+            pt_model, dummy, str(onnx_path),
+            input_names=["input"], output_names=["depth"],
+            dynamic_axes={"input": {0: "batch"}, "depth": {0: "batch"}},  # only the batch axis is dynamic
+            opset_version=17,
+        )
+
+        # Free PyTorch model — no longer needed once the ONNX file exists
+        del pt_model
+        torch.cuda.empty_cache()
+
+        # Build TRT engine: logger → builder → network → ONNX parser
+        logger = trt.Logger(trt.Logger.WARNING)
+        builder = trt.Builder(logger)
+        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+        parser = trt.OnnxParser(network, logger)
+
+        _log("Parsing ONNX for TensorRT...", self._tag)
+        with open(str(onnx_path), "rb") as f:
+            if not parser.parse(f.read()):
+                for i in range(parser.num_errors):
+                    _log(f"  ONNX parse error: {parser.get_error(i)}", self._tag)
+                return None  # NOTE(review): returns before the ONNX cleanup below, so the .onnx file stays on disk
+
+        config = builder.create_builder_config()
+        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB
+        config.set_flag(trt.BuilderFlag.FP16)
+
+        # Handle dynamic batch dimension: pin every -1 dim to 1 for min/opt/max of the profile
+        inp = network.get_input(0)
+        if any(d == -1 for d in inp.shape):
+            profile = builder.create_optimization_profile()
+            fixed = tuple(1 if d == -1 else d for d in inp.shape)
+            profile.set_shape(inp.name, fixed, fixed, fixed)
+            config.add_optimization_profile(profile)
+
+        _log("Building TRT FP16 engine (this is slow, ~30-120s)...", self._tag)
+        serialized = builder.build_serialized_network(network, config)
+        if serialized is None:
+            _log("TRT engine build failed!", self._tag)
+            return None  # NOTE(review): same as above — the .onnx file is not removed on this path
+
+        # TRT 10.15+ returns IHostMemory, not raw bytes — convert so it can be written and deserialized
+        engine_bytes = bytes(serialized)
+
+        # Serialize to disk for future starts (a later load finds the file at engine_path)
+        engine_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(engine_path), "wb") as f:
+            f.write(engine_bytes)
+        _log(f"Engine serialized: {engine_path} ({len(engine_bytes) / 1e6:.1f} MB)", self._tag)
+
+        # Clean up ONNX (no longer needed); best-effort — a failed delete is ignored
+        try:
+            onnx_path.unlink()
+        except OSError:
+            pass
+
+        runtime = trt.Runtime(logger)
+        return runtime.deserialize_cuda_engine(engine_bytes)
+
+    @staticmethod
+    def _deserialize_engine(engine_path: Path):
+        """Read the serialized TRT engine file at engine_path and return the deserialized engine."""
+        import tensorrt as trt
+        trt_logger = trt.Logger(trt.Logger.WARNING)
+        trt_runtime = trt.Runtime(trt_logger)
+        engine_blob = Path(engine_path).read_bytes()
+        return trt_runtime.deserialize_cuda_engine(engine_blob)
+
     # ── PyTorch backend (fallback) ────────────────────────────────────
 
     def _load_pytorch(self, model_name: str, config: dict) -> dict:
@@ -242,6 +415,8 @@ def transform_frame(self, image, metadata: dict):
 
         if self.backend == "coreml":
             depth_colored = self._infer_coreml(image)
+        elif self.backend == "tensorrt":
+            depth_colored = self._infer_tensorrt(image)
         else:
             depth_colored = self._infer_pytorch(image)
 
@@ -254,6 +429,43 @@ def transform_frame(self, image, metadata: dict):
 
         return output
 
+    def _infer_tensorrt(self, image):
+        """Produce a colorized depth map for ``image`` via the TensorRT engine.
+
+        The frame is converted BGR→RGB, resized to 518x518, divided by 255,
+        normalized with the ImageNet mean/std and copied into
+        ``self._trt_input_tensor``. After the execution context has run on
+        ``self._trt_stream``, the result is read from
+        ``self._trt_output_tensor``, min-max scaled to 0-255, colorized with
+        ``self.colormap_id`` and resized back to the frame's original size.
+
+        Args:
+            image: Frame as a NumPy array; it is passed to
+                ``cv2.COLOR_BGR2RGB``, so a 3-channel BGR image is assumed.
+
+        Returns:
+            The colormapped depth image at the input frame's width and height.
+        """
+        import torch
+        import cv2
+        import numpy as np
+
+        frame_h, frame_w = image.shape[:2]
+
+        # Preprocess: BGR → RGB, resize to the 518x518 network input, scale by 1/255
+        rgb_resized = cv2.resize(
+            cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
+            (518, 518),
+            interpolation=cv2.INTER_LINEAR,
+        )
+        pixels = rgb_resized.astype(np.float32) / 255.0
+
+        # ImageNet per-channel normalization
+        imagenet_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
+        imagenet_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
+        normalized = (pixels - imagenet_mean) / imagenet_std
+
+        # HWC → CHW, then add a leading batch axis: (1, 3, 518, 518)
+        batch = np.expand_dims(normalized.transpose(2, 0, 1), axis=0)
+
+        # Copy into the existing input tensor, launch the engine, wait for the GPU
+        self._trt_input_tensor.copy_(torch.from_numpy(batch))
+        self._trt_context.execute_async_v3(self._trt_stream)
+        torch.cuda.synchronize()
+
+        # Bring the result to the host and drop singleton dimensions
+        depth_map = np.squeeze(self._trt_output_tensor.cpu().numpy())
+
+        # Min-max scale to 0-255 (the epsilon avoids 0/0 on a constant map),
+        # then colorize and resize back to the original frame size
+        lowest = depth_map.min()
+        highest = depth_map.max()
+        scaled = (depth_map - lowest) / (highest - lowest + 1e-8) * 255
+        colorized = cv2.applyColorMap(scaled.astype(np.uint8), self.colormap_id)
+        return cv2.resize(colorized, (frame_w, frame_h))
+
     def _infer_coreml(self, image):
         """Run CoreML inference and return colorized depth map (BGR, original size)."""
         import cv2
diff --git a/skills/transformation/depth-estimation/scripts/transform_base.py b/skills/transformation/depth-estimation/scripts/transform_base.py
index 48f251a..73178e5 100644
--- a/skills/transformation/depth-estimation/scripts/transform_base.py
+++ b/skills/transformation/depth-estimation/scripts/transform_base.py
@@ -415,7 +415,13 @@ def _load_config(self, args) -> dict:
             if config_path.exists():
                 with open(config_path) as f:
                     return json.load(f)
-        return {"device": args.device}
+        # Merge all CLI args into config (--model, --colormap, --blend-mode, etc.)
+        config = {}
+        for k, v in vars(args).items():
+            if k != "config" and v is not None:
+                # Convert hyphens to underscores for consistency (e.g. blend-mode → blend_mode)
+                config[k.replace("-", "_")] = v
+        return config
 
     @staticmethod
     def _detect_hardware(device_pref: str = "auto") -> HardwareEnv:
diff --git a/skills/transformation/depth-estimation/scripts/trt_benchmark.py b/skills/transformation/depth-estimation/scripts/trt_benchmark.py
new file mode 100644
index 0000000..f91de2d
--- /dev/null
+++ b/skills/transformation/depth-estimation/scripts/trt_benchmark.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""
+Native TensorRT Benchmark for Depth Anything V2.
+
+Builds TensorRT engines from an ONNX export, benchmarks them in FP32 and FP16,
+and compares them against vanilla PyTorch CUDA and ONNX Runtime (CUDA provider).
+"""
+import sys, os, time, json, statistics
+from pathlib import Path
+
+# Put this script's own directory first on sys.path so imports resolve against it.
+_script_dir = Path(__file__).resolve().parent
+sys.path.insert(0, str(_script_dir))
+
+def _log(msg):
+    """Write ``msg`` to stderr with a ``[TRT]`` prefix and flush immediately."""
+    sys.stderr.write(f"[TRT] {msg}\n")
+    sys.stderr.flush()
+
+def get_test_image():
+    """Return the local path of the benchmark test image, downloading it if absent.
+
+    The image is cached at ``~/.aegis-ai/tmp/benchmark/test_image.jpg``. The
+    download is written to a sibling ``.part`` file and moved into place only
+    once it has completed, so an interrupted transfer cannot leave a truncated
+    file that later runs would mistake for a valid cache entry.
+
+    Returns:
+        The image path as a ``str``.
+
+    Raises:
+        OSError: If the cache directory cannot be created or the download
+            fails (``urllib.error.URLError`` is an ``OSError`` subclass).
+    """
+    cache = Path.home() / ".aegis-ai" / "tmp" / "benchmark" / "test_image.jpg"
+    cache.parent.mkdir(parents=True, exist_ok=True)
+    if cache.exists():
+        return str(cache)
+
+    from urllib.request import urlretrieve
+    _log("Downloading test image...")
+    partial = cache.with_name(cache.name + ".part")
+    try:
+        urlretrieve("https://ultralytics.com/images/bus.jpg", str(partial))
+        # Atomic move: the cache path only ever holds a complete file.
+        os.replace(str(partial), str(cache))
+    finally:
+        # After a successful replace the temp file is gone; this only fires on failure.
+        if partial.exists():
+            partial.unlink()
+    return str(cache)
+
+def stats(times, label):
+    """Summarize per-run timings as a result dictionary.
+
+    Args:
+        times: Non-empty sequence of run durations; the keys and the FPS
+            formula (``1000 / avg``) treat them as milliseconds.
+        label: Name stored under ``"label"``.
+
+    Returns:
+        Dict with ``label``, ``runs``, ``avg_ms``, ``min_ms``, ``max_ms``,
+        ``std_ms`` and ``fps``; numeric values are rounded to 2 decimals.
+        ``std_ms`` is 0 for a single run and ``fps`` is 0 when the mean is
+        not positive.
+    """
+    mean_ms = statistics.mean(times)
+    # The sample standard deviation needs at least two data points.
+    spread_ms = round(statistics.stdev(times), 2) if len(times) > 1 else 0
+    throughput = round(1000 / mean_ms, 2) if mean_ms > 0 else 0
+    return {
+        "label": label,
+        "runs": len(times),
+        "avg_ms": round(mean_ms, 2),
+        "min_ms": round(min(times), 2),
+        "max_ms": round(max(times), 2),
+        "std_ms": spread_ms,
+        "fps": throughput,
+    }
+
+def build_trt_engine(onnx_path, fp16=False):
+    """Build a TensorRT engine from an ONNX model.
+
+    Args:
+        onnx_path: Path of the ONNX file to parse.
+        fp16: When true, set ``BuilderFlag.FP16`` on the builder config;
+            otherwise the config keeps its defaults (logged as FP32).
+
+    Returns:
+        ``(engine, build_time_ms)`` on success, where ``build_time_ms`` covers
+        only the ``build_serialized_network`` call. ``None`` if ONNX parsing,
+        the engine build, or engine deserialization fails.
+    """
+    import tensorrt as trt
+    logger = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(logger)
+    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+    parser = trt.OnnxParser(network, logger)
+
+    _log(f"  Parsing ONNX: {onnx_path}")
+    with open(onnx_path, "rb") as f:
+        if not parser.parse(f.read()):
+            for i in range(parser.num_errors):
+                _log(f"  ONNX Parse Error: {parser.get_error(i)}")
+            return None
+
+    config = builder.create_builder_config()
+    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB
+
+    # Dynamic (-1) dimensions need an optimization profile. Every dynamic
+    # dimension is pinned to 1, with min = opt = max, so the engine is built
+    # for exactly one input shape.
+    inp = network.get_input(0)
+    if any(d == -1 for d in inp.shape):
+        profile = builder.create_optimization_profile()
+        fixed_shape = tuple(1 if d == -1 else d for d in inp.shape)
+        profile.set_shape(inp.name, fixed_shape, fixed_shape, fixed_shape)
+        config.add_optimization_profile(profile)
+        _log(f"  Set optimization profile: {fixed_shape}")
+
+    if fp16:
+        config.set_flag(trt.BuilderFlag.FP16)
+        _log("  Building TRT engine (FP16)...")
+    else:
+        _log("  Building TRT engine (FP32)...")
+
+    t0 = time.perf_counter()
+    serialized = builder.build_serialized_network(network, config)
+    build_time = (time.perf_counter() - t0) * 1000
+
+    if serialized is None:
+        _log("  Engine build failed!")
+        return None
+
+    runtime = trt.Runtime(logger)
+    engine = runtime.deserialize_cuda_engine(serialized)
+    if engine is None:
+        # Without this check the caller would receive a truthy (None, ms) tuple
+        # and treat the failed engine as usable.
+        _log("  Engine deserialization failed!")
+        return None
+    _log(f"  Engine built in {build_time:.0f}ms")
+    return engine, build_time
+
+def bench_trt_engine(engine, input_data, num_runs=15, warmup=5, label="TRT"):
+    """Time repeated executions of a TensorRT engine on one fixed input.
+
+    Torch CUDA tensors serve as the device buffers and their raw pointers are
+    registered with the execution context. Each timed run is bracketed by
+    ``torch.cuda.synchronize()`` so the measurement covers the whole execution.
+
+    Args:
+        engine: TensorRT engine; tensor 0 is used as the input and tensor 1
+            as the output.
+        input_data: NumPy array fed to the input tensor.
+        num_runs: Number of timed executions.
+        warmup: Number of untimed executions performed first.
+        label: Tag used in the per-run log lines.
+
+    Returns:
+        List of per-run wall-clock latencies in milliseconds, or ``None`` if
+        any exception is raised during setup or execution (the error and its
+        traceback are written to stderr).
+    """
+    import numpy as np
+
+    try:
+        # torch is imported inside the try so a missing install is reported
+        # as a failed run rather than an unhandled error.
+        import torch
+
+        context = engine.create_execution_context()
+
+        input_name = engine.get_tensor_name(0)
+        output_name = engine.get_tensor_name(1)
+        _log(f"  Input: {input_name} {list(engine.get_tensor_shape(input_name))}")
+        _log(f"  Output: {output_name} {list(engine.get_tensor_shape(output_name))}")
+
+        # TensorRT reads the buffer through its raw device pointer, so the
+        # data must be laid out contiguously before it is uploaded.
+        input_tensor = torch.from_numpy(np.ascontiguousarray(input_data)).cuda()
+
+        # Fix any dynamic dimensions, then query the concrete output shape.
+        context.set_input_shape(input_name, input_data.shape)
+        actual_out_shape = context.get_tensor_shape(output_name)
+        # NOTE(review): the output buffer is allocated as float32 — confirm
+        # this matches the engine's output dtype.
+        output_tensor = torch.empty(list(actual_out_shape), dtype=torch.float32, device='cuda')
+
+        context.set_tensor_address(input_name, input_tensor.data_ptr())
+        context.set_tensor_address(output_name, output_tensor.data_ptr())
+
+        # Run on torch's current CUDA stream.
+        stream = torch.cuda.current_stream().cuda_stream
+
+        for _ in range(warmup):
+            context.execute_async_v3(stream)
+            torch.cuda.synchronize()
+
+        times = []
+        for i in range(num_runs):
+            torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            context.execute_async_v3(stream)
+            torch.cuda.synchronize()
+            elapsed = (time.perf_counter() - t0) * 1000
+            times.append(elapsed)
+            _log(f"  [{label}] Run {i+1}/{num_runs}: {elapsed:.1f}ms")
+
+        return times
+
+    except Exception as e:
+        _log(f"  Engine execution error: {e}")
+        import traceback
+        traceback.print_exc(file=sys.stderr)
+        return None
+
+def _bench_trt_precision(onnx_path, img_tensor, precision, num_runs, warmup):
+    """Build and benchmark one TensorRT precision ("FP32" or "FP16"); return its result dict."""
+    label = f"TensorRT {precision}"
+    try:
+        engine_result = build_trt_engine(str(onnx_path), fp16=(precision == "FP16"))
+        if not engine_result:
+            return {"label": label, "error": "engine build failed"}
+        engine, build_ms = engine_result
+        inp = img_tensor.cpu().numpy()
+        times = bench_trt_engine(engine, inp, num_runs, warmup, f"TRT {precision}")
+        if not times:
+            return {"label": label, "error": "execution failed"}
+        result = stats(times, label)
+        result["build_ms"] = round(build_ms, 0)
+        return result
+    except Exception as e:
+        _log(f"  FAILED: {e}")
+        return {"label": label, "error": str(e)[:100]}
+
+
+def main():
+    """Benchmark Depth Anything V2 Small across four backends and report results.
+
+    Runs, in order: PyTorch CUDA (the speedup baseline), ONNX Runtime with the
+    CUDA execution provider, TensorRT FP32 and TensorRT FP16. Progress and a
+    summary table go to stderr via ``_log``; a JSON document with the GPU name
+    and all results is printed to stdout. A backend that fails is recorded as
+    ``{"label": ..., "error": ...}`` instead of aborting the run; only the
+    PyTorch baseline and the setup steps are allowed to raise.
+    """
+    import torch
+    import cv2
+
+    device = "cuda"
+    num_runs, warmup = 15, 5
+
+    _log(f"PyTorch {torch.__version__}, CUDA {torch.version.cuda}")
+    _log(f"GPU: {torch.cuda.get_device_name(0)}")
+
+    import tensorrt as trt
+    _log(f"TensorRT: {trt.__version__}")
+
+    # Load image and model
+    image_path = get_test_image()
+    image = cv2.imread(image_path)
+    if image is None:
+        # cv2.imread signals an unreadable file by returning None.
+        raise RuntimeError(f"Could not read test image: {image_path}")
+    _log(f"Image: {image.shape[1]}x{image.shape[0]}")
+
+    from depth_anything_v2.dpt import DepthAnythingV2
+    from huggingface_hub import hf_hub_download
+    cfg = {"encoder":"vits","features":64,"out_channels":[48,96,192,384],
+           "repo":"depth-anything/Depth-Anything-V2-Small","filename":"depth_anything_v2_vits.pth"}
+    weights = hf_hub_download(cfg["repo"], cfg["filename"])
+    model = DepthAnythingV2(encoder=cfg["encoder"], features=cfg["features"], out_channels=cfg["out_channels"])
+    model.load_state_dict(torch.load(weights, map_location=device, weights_only=True))
+    model.to(device).eval()
+
+    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    img_tensor, _ = model.image2tensor(rgb, input_size=518)
+    img_tensor = img_tensor.to(device)
+    _log(f"Tensor: {img_tensor.shape}")
+
+    results = []
+
+    # 1. PyTorch CUDA baseline
+    _log("\n== 1. PyTorch CUDA ==")
+    with torch.no_grad():
+        for _ in range(warmup):
+            model.forward(img_tensor)
+            torch.cuda.synchronize()
+    times = []
+    for i in range(num_runs):
+        # Synchronize on both sides so only this forward pass is timed.
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            model.forward(img_tensor)
+        torch.cuda.synchronize()
+        times.append((time.perf_counter() - t0) * 1000)
+        _log(f"  [PyTorch] Run {i+1}/{num_runs}: {times[-1]:.1f}ms")
+    results.append(stats(times, "PyTorch CUDA"))
+
+    # 2. ONNX CUDA (the exported file is also the input of the TensorRT builds)
+    _log("\n== 2. ONNX CUDA ==")
+    onnx_path = Path.home() / ".aegis-ai" / "tmp" / "benchmark" / "dav2_small.onnx"
+    onnx_path.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        _log("  Exporting ONNX...")
+        torch.onnx.export(model, img_tensor, str(onnx_path),
+            input_names=["input"], output_names=["depth"],
+            dynamic_axes={"input":{0:"batch"}, "depth":{0:"batch"}}, opset_version=17)
+        import onnxruntime as ort
+        sess = ort.InferenceSession(str(onnx_path), providers=["CUDAExecutionProvider"])
+        # ONNX Runtime falls back to the CPU provider when CUDA cannot be
+        # loaded; refuse to report such timings under the "ONNX CUDA" label.
+        active_providers = sess.get_providers()
+        if "CUDAExecutionProvider" not in active_providers:
+            raise RuntimeError(
+                "CUDAExecutionProvider unavailable; session uses " + ", ".join(active_providers))
+        in_name = sess.get_inputs()[0].name
+        inp = img_tensor.cpu().numpy()
+        for _ in range(warmup):
+            sess.run(None, {in_name: inp})
+        times = []
+        for i in range(num_runs):
+            t0 = time.perf_counter()
+            sess.run(None, {in_name: inp})
+            times.append((time.perf_counter() - t0) * 1000)
+            _log(f"  [ONNX CUDA] Run {i+1}/{num_runs}: {times[-1]:.1f}ms")
+        results.append(stats(times, "ONNX CUDA"))
+    except Exception as e:
+        _log(f"  FAILED: {e}")
+        results.append({"label":"ONNX CUDA","error":str(e)[:100]})
+
+    # 3. TensorRT FP32 and 4. TensorRT FP16
+    for step, precision in ((3, "FP32"), (4, "FP16")):
+        _log(f"\n== {step}. TensorRT {precision} ==")
+        results.append(_bench_trt_precision(onnx_path, img_tensor, precision, num_runs, warmup))
+
+    # Summary (speedups are relative to the PyTorch baseline in results[0])
+    _log("\n" + "="*70)
+    _log(f"{'Backend':<22} {'Avg(ms)':>8} {'Min(ms)':>8} {'FPS':>7} {'Speedup':>8}")
+    _log("-"*70)
+    base = results[0].get("avg_ms",1)
+    for r in results:
+        if "error" in r:
+            _log(f"{r['label']:<22} {'FAIL':>8}  {r['error'][:45]}")
+        else:
+            su = base/r["avg_ms"] if r["avg_ms"]>0 else 0
+            _log(f"{r['label']:<22} {r['avg_ms']:>8.1f} {r['min_ms']:>8.1f} {r['fps']:>7.1f} {su:>7.2f}x")
+
+    print(json.dumps({"gpu": torch.cuda.get_device_name(0), "results": results}, indent=2))
+
+# Run the benchmark only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()