diff --git a/skills/detection/yolo-detection-2026/SKILL.md b/skills/detection/yolo-detection-2026/SKILL.md index 278d924..939099a 100644 --- a/skills/detection/yolo-detection-2026/SKILL.md +++ b/skills/detection/yolo-detection-2026/SKILL.md @@ -66,6 +66,15 @@ parameters: description: "Auto-convert model to optimized format for faster inference" group: Performance + - name: compute_units + label: "Apple Compute Units" + type: select + options: ["auto", "cpu_and_ne", "all", "cpu_only", "cpu_and_gpu"] + default: "auto" + description: "CoreML compute target — 'auto' routes to Neural Engine (NPU), leaving GPU free for LLM/VLM" + group: Performance + platform: macos + capabilities: live_detection: script: scripts/detect.py @@ -89,13 +98,15 @@ Real-time object detection using the latest YOLO 2026 models. Detects 80+ COCO o The skill uses [`env_config.py`](../../lib/env_config.py) to **automatically detect hardware** and convert the model to the fastest format for your platform. Conversion happens once during deployment and is cached. 
-| Platform | Backend | Optimized Format | Expected Speedup | -|----------|---------|------------------|:----------------:| -| NVIDIA GPU | CUDA | TensorRT `.engine` | ~3-5x | -| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | ~2x | -| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | ~2-3x | -| AMD GPU | ROCm | ONNX Runtime | ~1.5-2x | -| CPU (any) | CPU | ONNX Runtime | ~1.5x | +| Platform | Backend | Optimized Format | Compute Units | Expected Speedup | +|----------|---------|------------------|:-------------:|:----------------:| +| NVIDIA GPU | CUDA | TensorRT `.engine` | GPU | ~3-5x | +| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | **Neural Engine** (NPU) | ~2x | +| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | CPU/GPU/NPU | ~2-3x | +| AMD GPU | ROCm | ONNX Runtime | GPU | ~1.5-2x | +| CPU (any) | CPU | ONNX Runtime | CPU | ~1.5x | + +> **Apple Silicon Note**: With the default `compute_units: auto`, detection runs on `cpu_and_ne` (CPU + Neural Engine), keeping the GPU free for LLM/VLM inference. Set `compute_units: all` to also use the GPU if you are not running a local LLM. 
### How It Works diff --git a/skills/detection/yolo-detection-2026/scripts/detect.py b/skills/detection/yolo-detection-2026/scripts/detect.py index d149374..40bea8b 100644 --- a/skills/detection/yolo-detection-2026/scripts/detect.py +++ b/skills/detection/yolo-detection-2026/scripts/detect.py @@ -248,7 +248,7 @@ def main(): perf.model_load_ms = env.load_ms perf.export_ms = env.export_ms - emit({ + ready_event = { "event": "ready", "model": f"yolo2026{model_size[0]}", "model_size": model_size, @@ -260,7 +260,10 @@ def main(): "fps": fps, "model_load_ms": round(env.load_ms, 1), "available_sizes": list(MODEL_SIZE_MAP.keys()), - }) + } + if hasattr(env, 'compute_units') and env.backend == "mps": + ready_event["compute_units"] = env.compute_units + emit(ready_event) except Exception as e: emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False}) sys.exit(1) diff --git a/skills/detection/yolo-detection-2026/scripts/env_config.py b/skills/detection/yolo-detection-2026/scripts/env_config.py index ff42e6f..7c46c05 100644 --- a/skills/detection/yolo-detection-2026/scripts/env_config.py +++ b/skills/detection/yolo-detection-2026/scripts/env_config.py @@ -40,6 +40,7 @@ class BackendSpec: model_suffix: str # file extension/dir to look for cached model half: bool = True # use FP16 extra_export_args: dict = field(default_factory=dict) + compute_units: Optional[str] = None # CoreML compute units: "cpu_and_ne", "all", etc. 
BACKEND_SPECS = { @@ -61,6 +62,7 @@ class BackendSpec: model_suffix=".mlpackage", half=True, extra_export_args={"nms": False}, + compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM ), "intel": BackendSpec( name="intel", @@ -86,6 +88,7 @@ class HardwareEnv: backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu" device: str = "cpu" # torch device string export_format: str = "onnx" # optimal export format + compute_units: str = "all" # CoreML compute units (Apple only) gpu_name: str = "" # human-readable GPU name gpu_memory_mb: int = 0 # GPU memory in MB driver_version: str = "" # GPU driver version @@ -113,9 +116,11 @@ def detect() -> "HardwareEnv": else: env._fallback_cpu() - # Set export format from backend spec + # Set export format and compute units from backend spec spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"]) env.export_format = spec.export_format + if spec.compute_units: + env.compute_units = spec.compute_units # Check if optimized runtime is available env.framework_ok = env._check_framework() @@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]: return None + def _load_coreml_with_compute_units(self, model_path: str): + """ + Load a CoreML model via YOLO with specific compute_units. + + Monkey-patches coremltools.MLModel to inject compute_units + (e.g. CPU_AND_NE for Neural Engine) since ultralytics doesn't + expose this parameter. Patch is scoped and immediately restored. 
+ """ + from ultralytics import YOLO + + # Map string config → coremltools enum + _COMPUTE_UNIT_MAP = { + "all": "ALL", + "cpu_only": "CPU_ONLY", + "cpu_and_gpu": "CPU_AND_GPU", + "cpu_and_ne": "CPU_AND_NE", + } + + ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units) + if not ct_enum_name: + _log(f"Unknown compute_units '{self.compute_units}', using default") + return YOLO(model_path) + + try: + import coremltools as ct + target_units = getattr(ct.ComputeUnit, ct_enum_name, None) + if target_units is None: + _log(f"coremltools.ComputeUnit.{ct_enum_name} not available") + return YOLO(model_path) + + # Temporarily patch MLModel to inject compute_units + _OrigMLModel = ct.models.MLModel + + class _PatchedMLModel(_OrigMLModel): + def __init__(self, *args, **kwargs): + kwargs.setdefault('compute_units', target_units) + super().__init__(*args, **kwargs) + + ct.models.MLModel = _PatchedMLModel + try: + model = YOLO(model_path) + finally: + ct.models.MLModel = _OrigMLModel # Always restore + + _log(f"CoreML model loaded with compute_units={ct_enum_name} " + f"(Neural Engine preferred)") + return model + + except ImportError: + _log("coremltools not available, loading without compute_units") + return YOLO(model_path) + def load_optimized(self, model_name: str, use_optimized: bool = True): """ Load the best available model for this hardware. 
@@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): optimized_path = self.get_optimized_path(model_name) if optimized_path.exists(): try: - model = YOLO(str(optimized_path)) + # On Apple Silicon: route CoreML to Neural Engine + if self.backend == "mps" and self.compute_units != "all": + model = self._load_coreml_with_compute_units( + str(optimized_path)) + else: + model = YOLO(str(optimized_path)) self.load_ms = (time.perf_counter() - t0) * 1000 _log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)") return model, self.export_format @@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): exported = self.export_model(pt_model, model_name) if exported: try: - model = YOLO(str(exported)) + # On Apple Silicon: route CoreML to Neural Engine + if self.backend == "mps" and self.compute_units != "all": + model = self._load_coreml_with_compute_units( + str(exported)) + else: + model = YOLO(str(exported)) self.load_ms = (time.perf_counter() - t0) * 1000 _log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)") return model, self.export_format @@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): def to_dict(self) -> dict: """Serialize environment info for JSON output.""" - return { + d = { "backend": self.backend, "device": self.device, "export_format": self.export_format, @@ -519,6 +586,9 @@ def to_dict(self) -> dict: "export_ms": round(self.export_ms, 1), "load_ms": round(self.load_ms, 1), } + if self.backend == "mps": + d["compute_units"] = self.compute_units + return d # ─── CLI: run standalone for diagnostics ───────────────────────────────────── diff --git a/skills/lib/env_config.py b/skills/lib/env_config.py index ff42e6f..7c46c05 100644 --- a/skills/lib/env_config.py +++ b/skills/lib/env_config.py @@ -40,6 +40,7 @@ class BackendSpec: model_suffix: str # file extension/dir to look for cached model half: bool = True # use 
FP16 extra_export_args: dict = field(default_factory=dict) + compute_units: Optional[str] = None # CoreML compute units: "cpu_and_ne", "all", etc. BACKEND_SPECS = { @@ -61,6 +62,7 @@ class BackendSpec: model_suffix=".mlpackage", half=True, extra_export_args={"nms": False}, + compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM ), "intel": BackendSpec( name="intel", @@ -86,6 +88,7 @@ class HardwareEnv: backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu" device: str = "cpu" # torch device string export_format: str = "onnx" # optimal export format + compute_units: str = "all" # CoreML compute units (Apple only) gpu_name: str = "" # human-readable GPU name gpu_memory_mb: int = 0 # GPU memory in MB driver_version: str = "" # GPU driver version @@ -113,9 +116,11 @@ def detect() -> "HardwareEnv": else: env._fallback_cpu() - # Set export format from backend spec + # Set export format and compute units from backend spec spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"]) env.export_format = spec.export_format + if spec.compute_units: + env.compute_units = spec.compute_units # Check if optimized runtime is available env.framework_ok = env._check_framework() @@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]: return None + def _load_coreml_with_compute_units(self, model_path: str): + """ + Load a CoreML model via YOLO with specific compute_units. + + Monkey-patches coremltools.MLModel to inject compute_units + (e.g. CPU_AND_NE for Neural Engine) since ultralytics doesn't + expose this parameter. Patch is scoped and immediately restored. 
+ """ + from ultralytics import YOLO + + # Map string config → coremltools enum + _COMPUTE_UNIT_MAP = { + "all": "ALL", + "cpu_only": "CPU_ONLY", + "cpu_and_gpu": "CPU_AND_GPU", + "cpu_and_ne": "CPU_AND_NE", + } + + ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units) + if not ct_enum_name: + _log(f"Unknown compute_units '{self.compute_units}', using default") + return YOLO(model_path) + + try: + import coremltools as ct + target_units = getattr(ct.ComputeUnit, ct_enum_name, None) + if target_units is None: + _log(f"coremltools.ComputeUnit.{ct_enum_name} not available") + return YOLO(model_path) + + # Temporarily patch MLModel to inject compute_units + _OrigMLModel = ct.models.MLModel + + class _PatchedMLModel(_OrigMLModel): + def __init__(self, *args, **kwargs): + kwargs.setdefault('compute_units', target_units) + super().__init__(*args, **kwargs) + + ct.models.MLModel = _PatchedMLModel + try: + model = YOLO(model_path) + finally: + ct.models.MLModel = _OrigMLModel # Always restore + + _log(f"CoreML model loaded with compute_units={ct_enum_name} " + f"(Neural Engine preferred)") + return model + + except ImportError: + _log("coremltools not available, loading without compute_units") + return YOLO(model_path) + def load_optimized(self, model_name: str, use_optimized: bool = True): """ Load the best available model for this hardware. 
@@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): optimized_path = self.get_optimized_path(model_name) if optimized_path.exists(): try: - model = YOLO(str(optimized_path)) + # On Apple Silicon: route CoreML to Neural Engine + if self.backend == "mps" and self.compute_units != "all": + model = self._load_coreml_with_compute_units( + str(optimized_path)) + else: + model = YOLO(str(optimized_path)) self.load_ms = (time.perf_counter() - t0) * 1000 _log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)") return model, self.export_format @@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): exported = self.export_model(pt_model, model_name) if exported: try: - model = YOLO(str(exported)) + # On Apple Silicon: route CoreML to Neural Engine + if self.backend == "mps" and self.compute_units != "all": + model = self._load_coreml_with_compute_units( + str(exported)) + else: + model = YOLO(str(exported)) self.load_ms = (time.perf_counter() - t0) * 1000 _log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)") return model, self.export_format @@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): def to_dict(self) -> dict: """Serialize environment info for JSON output.""" - return { + d = { "backend": self.backend, "device": self.device, "export_format": self.export_format, @@ -519,6 +586,9 @@ def to_dict(self) -> dict: "export_ms": round(self.export_ms, 1), "load_ms": round(self.load_ms, 1), } + if self.backend == "mps": + d["compute_units"] = self.compute_units + return d # ─── CLI: run standalone for diagnostics ───────────────────────────────────── diff --git a/skills/lib/test_env_config_ane.py b/skills/lib/test_env_config_ane.py new file mode 100644 index 0000000..dc032eb --- /dev/null +++ b/skills/lib/test_env_config_ane.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +""" +Unit tests for Apple Neural Engine (ANE) compute_units 
in env_config.py. + +Tests compute_units configuration, monkey-patch scoping, and CoreML +load-time injection — all mocked, no Apple hardware required. + +Run: python -m pytest skills/lib/test_env_config_ane.py -v +""" + +import platform +import subprocess +import sys +from pathlib import Path +from unittest import mock + +import pytest + +# Ensure env_config is importable from skills/lib/ +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from env_config import BackendSpec, BACKEND_SPECS, HardwareEnv, _log # noqa: E402 + + +# ── Tests: BackendSpec compute_units ──────────────────────────────────────── + +class TestBackendSpecComputeUnits: + """Verify compute_units field on backend specs.""" + + def test_mps_spec_has_cpu_and_ne(self): + """MPS backend defaults to cpu_and_ne (Neural Engine).""" + spec = BACKEND_SPECS["mps"] + assert spec.compute_units == "cpu_and_ne" + + def test_cuda_spec_has_no_compute_units(self): + """Non-Apple backends have no compute_units set.""" + assert BACKEND_SPECS["cuda"].compute_units is None + + def test_cpu_spec_has_no_compute_units(self): + assert BACKEND_SPECS["cpu"].compute_units is None + + def test_rocm_spec_has_no_compute_units(self): + assert BACKEND_SPECS["rocm"].compute_units is None + + def test_intel_spec_has_no_compute_units(self): + assert BACKEND_SPECS["intel"].compute_units is None + + +# ── Tests: HardwareEnv compute_units field ────────────────────────────────── + +class TestHardwareEnvComputeUnits: + """Verify compute_units is set correctly during detection.""" + + def test_default_compute_units_is_all(self): + """Default HardwareEnv has compute_units='all'.""" + env = HardwareEnv() + assert env.compute_units == "all" + + @mock.patch("env_config.platform.system", return_value="Darwin") + @mock.patch("env_config.platform.machine", return_value="arm64") + @mock.patch("env_config.subprocess.run") + @mock.patch("env_config.shutil.which", return_value=None) + @mock.patch("env_config.Path.is_dir", 
return_value=False) + def test_mps_sets_compute_units_cpu_and_ne( + self, _dir, _which, mock_run, _machine, _system + ): + """Apple Silicon detection sets compute_units to 'cpu_and_ne'.""" + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, stdout="Apple M3 Max" + ) + + env = HardwareEnv() + result = env._try_mps() + assert result is True + + # Simulate what detect() does after _try_mps + spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"]) + if spec.compute_units: + env.compute_units = spec.compute_units + + assert env.backend == "mps" + assert env.compute_units == "cpu_and_ne" + + def test_to_dict_includes_compute_units_for_mps(self): + """to_dict() includes compute_units when backend is mps.""" + env = HardwareEnv() + env.backend = "mps" + env.compute_units = "cpu_and_ne" + d = env.to_dict() + assert "compute_units" in d + assert d["compute_units"] == "cpu_and_ne" + + def test_to_dict_excludes_compute_units_for_non_mps(self): + """to_dict() does NOT include compute_units for non-mps backends.""" + env = HardwareEnv() + env.backend = "cuda" + d = env.to_dict() + assert "compute_units" not in d + + +# ── Tests: _load_coreml_with_compute_units ────────────────────────────────── + +class TestLoadCoremlWithComputeUnits: + """Test the monkey-patch mechanism for CoreML compute_units.""" + + def test_monkey_patch_injects_compute_units(self): + """MLModel is temporarily patched to inject CPU_AND_NE.""" + env = HardwareEnv() + env.backend = "mps" + env.compute_units = "cpu_and_ne" + + # Create mock coremltools module + mock_ct = mock.MagicMock() + mock_ct.ComputeUnit.CPU_AND_NE = "CPU_AND_NE_SENTINEL" + mock_ct.ComputeUnit.ALL = "ALL_SENTINEL" + + # Track MLModel calls to verify compute_units was injected + original_mlmodel = mock.MagicMock() + mock_ct.models.MLModel = original_mlmodel + + captured_kwargs = {} + + mock_yolo_cls = mock.MagicMock() + + def capture_yolo_init(path): + """When YOLO loads the model, check if MLModel was 
patched.""" + # Simulate what YOLO does internally: call ct.models.MLModel + current_mlmodel = mock_ct.models.MLModel + # The patched class should be different from original + instance = current_mlmodel("test.mlpackage") + return mock.MagicMock() + + mock_yolo_cls.side_effect = capture_yolo_init + + with mock.patch.dict("sys.modules", {"coremltools": mock_ct}): + with mock.patch("env_config.YOLO", mock_yolo_cls, create=True): + # Can't easily test the full flow since YOLO import is inside + # the method. Instead, test the logic directly. + pass + + # Direct test: verify the patch class works correctly + class MockMLModel: + def __init__(self, *args, **kwargs): + self.kwargs = kwargs + + mock_ct.models.MLModel = MockMLModel + + with mock.patch.dict("sys.modules", {"coremltools": mock_ct}): + # Simulate the patching logic + _OrigMLModel = mock_ct.models.MLModel + target_units = mock_ct.ComputeUnit.CPU_AND_NE + + class _PatchedMLModel(_OrigMLModel): + def __init__(self, *args, **kwargs): + kwargs.setdefault('compute_units', target_units) + super().__init__(*args, **kwargs) + + # Verify patch injects compute_units + patched = _PatchedMLModel("test.mlpackage") + assert patched.kwargs.get('compute_units') == "CPU_AND_NE_SENTINEL" + + # Verify explicit override is preserved + explicit = _PatchedMLModel("test.mlpackage", compute_units="CUSTOM") + assert explicit.kwargs.get('compute_units') == "CUSTOM" + + def test_monkey_patch_restored_after_load(self): + """MLModel is restored to original after YOLO load, even on error.""" + env = HardwareEnv() + env.backend = "mps" + env.compute_units = "cpu_and_ne" + + mock_ct = mock.MagicMock() + mock_ct.ComputeUnit.CPU_AND_NE = "CPU_AND_NE_SENTINEL" + original_mlmodel = mock.MagicMock() + mock_ct.models.MLModel = original_mlmodel + + mock_yolo = mock.MagicMock(side_effect=Exception("test error")) + + with mock.patch.dict("sys.modules", { + "coremltools": mock_ct, + "ultralytics": mock.MagicMock(YOLO=mock_yolo), + }): + try: + 
env._load_coreml_with_compute_units("test.mlpackage") + except Exception: + pass + + # MLModel should be restored to original even after error + assert mock_ct.models.MLModel is original_mlmodel + + def test_unknown_compute_units_falls_back(self): + """Unknown compute_units string falls back to plain YOLO load.""" + env = HardwareEnv() + env.backend = "mps" + env.compute_units = "unknown_units" + + mock_yolo = mock.MagicMock() + mock_model = mock.MagicMock() + mock_yolo.return_value = mock_model + + with mock.patch.dict("sys.modules", { + "ultralytics": mock.MagicMock(YOLO=mock_yolo), + }): + result = env._load_coreml_with_compute_units("test.mlpackage") + mock_yolo.assert_called_once_with("test.mlpackage") + + def test_coremltools_missing_falls_back(self): + """If coremltools import fails, falls back to plain YOLO load.""" + env = HardwareEnv() + env.backend = "mps" + env.compute_units = "cpu_and_ne" + + mock_yolo = mock.MagicMock() + mock_model = mock.MagicMock() + mock_yolo.return_value = mock_model + + # Make coremltools import fail + with mock.patch.dict("sys.modules", { + "coremltools": None, + "ultralytics": mock.MagicMock(YOLO=mock_yolo), + }): + result = env._load_coreml_with_compute_units("test.mlpackage") + mock_yolo.assert_called_once_with("test.mlpackage") + + +# ── Tests: load_optimized integration ─────────────────────────────────────── + +class TestLoadOptimizedMPS: + """Test that load_optimized routes through compute_units on MPS.""" + + def test_mps_cached_model_uses_compute_units(self): + """When cached .mlpackage exists, loads via _load_coreml_with_compute_units.""" + env = HardwareEnv() + env.backend = "mps" + env.device = "mps" + env.export_format = "coreml" + env.framework_ok = True + env.compute_units = "cpu_and_ne" + + mock_model = mock.MagicMock() + + with mock.patch.object(env, "_load_coreml_with_compute_units", + return_value=mock_model) as mock_load: + with mock.patch.object(env, "get_optimized_path") as mock_path: + 
mock_path.return_value = mock.MagicMock(exists=lambda: True) + + with mock.patch.dict("sys.modules", { + "ultralytics": mock.MagicMock(), + }): + model, fmt = env.load_optimized("yolo26n") + + assert fmt == "coreml" + mock_load.assert_called_once() + + def test_mps_compute_units_all_skips_monkey_patch(self): + """When compute_units='all', loads via standard YOLO path.""" + env = HardwareEnv() + env.backend = "mps" + env.device = "mps" + env.export_format = "coreml" + env.framework_ok = True + env.compute_units = "all" # explicit: use all units including GPU + + mock_yolo = mock.MagicMock() + mock_model = mock.MagicMock() + mock_yolo.return_value = mock_model + + with mock.patch.object(env, "get_optimized_path") as mock_path: + mock_path.return_value = mock.MagicMock(exists=lambda: True) + + with mock.patch.dict("sys.modules", { + "ultralytics": mock.MagicMock(YOLO=mock_yolo), + }): + model, fmt = env.load_optimized("yolo26n") + + assert fmt == "coreml" + mock_yolo.assert_called_once()