async def run(self) -> None:
    """Run the scan from start to finish.

    Marks the scan as running, stamps ``started_at`` with the current UTC
    time unless a value is already present, awaits the scheduling loop and
    finally calls ``_finalize()``.
    """
    from datetime import datetime, timezone

    scan = self.scan
    scan.status = ScanStatus.RUNNING
    # Keep a pre-existing start time; only fill it in when it is missing.
    if scan.started_at is None:
        scan.started_at = datetime.now(tz=timezone.utc)

    await self._schedule_loop()
    self._finalize()
+ completed_tools: list[str] = [] + failed_tools: list[str] = [] + for task_id, task in self._tasks.items(): + if task_id in self._completed: + completed_tools.append(task.name) + elif task_id in self._failed: + failed_tools.append(task.name) + + # Deduplicate while preserving order + self.scan.tools_completed = list(dict.fromkeys(completed_tools)) + self.scan.tools_failed = list(dict.fromkeys(failed_tools)) + + # Stamp completed_at if not already set + if self.scan.completed_at is None: + self.scan.completed_at = datetime.now(timezone.utc) + # ------------------------------------------------------------------ # Pipeline processing # ------------------------------------------------------------------ diff --git a/packages/cli/src/opentools/scanner/parsing/parsers/nikto.py b/packages/cli/src/opentools/scanner/parsing/parsers/nikto.py new file mode 100644 index 0000000..c1ba8f5 --- /dev/null +++ b/packages/cli/src/opentools/scanner/parsing/parsers/nikto.py @@ -0,0 +1,135 @@ +"""Nikto JSON output parser.""" + +from __future__ import annotations + +import hashlib +import uuid +from datetime import datetime, timezone +from typing import Iterator + +import orjson + +from opentools.scanner.models import ( + EvidenceQuality, + LocationPrecision, + RawFinding, +) + + +_HIGH_PATTERNS = ( + "default password", + "default credential", + "directory listing", + "directory indexing", + "remote code execution", + "command injection", +) +_LOW_PATTERNS = ( + "server banner", + "version disclosure", + "x-powered-by", + "server leaks", +) + + +def _classify_severity(msg: str) -> str: + msg_lower = msg.lower() + for pattern in _HIGH_PATTERNS: + if pattern in msg_lower: + return "high" + for pattern in _LOW_PATTERNS: + if pattern in msg_lower: + return "low" + return "medium" + + +class NiktoParser: + """Parses Nikto JSON output (`-Format json`) into RawFindings. + + Each vulnerability entry becomes a finding with endpoint-level precision. 
+ Severity is heuristically assigned based on message patterns. + """ + + name = "nikto" + version = "1.0.0" + confidence_tier = 0.6 + + def validate(self, data: bytes) -> bool: + """Accept either a target-result dict or an array of them.""" + try: + parsed = orjson.loads(data) + except (orjson.JSONDecodeError, UnicodeDecodeError): + return False + if isinstance(parsed, dict) and "vulnerabilities" in parsed: + return True + if isinstance(parsed, list): + return any( + isinstance(item, dict) and "vulnerabilities" in item + for item in parsed + ) + return False + + def parse( + self, + data: bytes, + scan_id: str, + scan_task_id: str, + ) -> Iterator[RawFinding]: + try: + parsed = orjson.loads(data) + except orjson.JSONDecodeError: + return + + # Normalize to a list of target-result dicts. + if isinstance(parsed, dict): + targets = [parsed] + elif isinstance(parsed, list): + targets = [t for t in parsed if isinstance(t, dict)] + else: + return + + for target_result in targets: + yield from self._parse_target(target_result, scan_id, scan_task_id) + + def _parse_target( + self, + parsed: dict, + scan_id: str, + scan_task_id: str, + ) -> Iterator[RawFinding]: + host = parsed.get("host") or parsed.get("ip") or "unknown" + port = str(parsed.get("port", "") or "") + + for vuln in parsed.get("vulnerabilities", []) or []: + if not isinstance(vuln, dict): + continue + msg = vuln.get("msg", "Unknown finding") or "Unknown finding" + url = vuln.get("url", "/") or "/" + method = vuln.get("method", "GET") or "GET" + nikto_id = str(vuln.get("id", "") or "") + osvdb = str(vuln.get("OSVDB", "") or "") + + location_url = f"{host}:{port}{url}" if port else f"{host}{url}" + evidence_str = f"nikto:{nikto_id}:{location_url}:{msg}" + evidence_hash = hashlib.sha256(evidence_str.encode()).hexdigest() + + yield RawFinding( + id=str(uuid.uuid4()), + scan_task_id=scan_task_id, + scan_id=scan_id, + tool="nikto", + raw_severity=_classify_severity(msg), + title=msg, + description=msg, + 
class NucleiParser:
    """Parses Nuclei JSONL output (one JSON object per line) into RawFindings.

    Each matched template becomes a finding with URL-level location precision.
    Severity is mapped directly from Nuclei's severity field.
    """

    name = "nuclei"
    version = "1.0.0"
    confidence_tier = 0.8

    def validate(self, data: bytes) -> bool:
        """At least one non-empty line must parse as a JSON object with 'info'."""
        for line in data.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                obj = orjson.loads(line)
            except orjson.JSONDecodeError:
                continue
            if isinstance(obj, dict) and "info" in obj:
                return True
        return False

    @staticmethod
    def _normalize_cwe(raw) -> str | None:
        """Return the first CWE id in *raw* as ``"CWE-<n>"``, or None.

        *raw* may be a list (the first element is used) or a single scalar.
        Empty and null values yield None rather than a bogus ``"CWE-NONE"``.
        """
        if isinstance(raw, (list, tuple)):
            raw = raw[0] if raw else None
        if raw is None:
            return None
        cwe = str(raw).strip().upper()
        if not cwe:
            return None
        if not cwe.startswith("CWE-"):
            # Tolerate spellings such as "79", "CWE79" or "CWE:79".
            cwe = f"CWE-{cwe.lstrip('CWE-:')}"
        return cwe

    def parse(
        self,
        data: bytes,
        scan_id: str,
        scan_task_id: str,
    ) -> Iterator[RawFinding]:
        """Yield one RawFinding per JSON-object line in *data*.

        Blank lines, lines that are not valid JSON and lines whose JSON value
        is not an object are skipped. Malformed nested fields (``info`` or
        ``classification`` not being objects) are treated as absent, so one
        odd record cannot abort the rest of the stream.
        """
        for line in data.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                result = orjson.loads(line)
            except orjson.JSONDecodeError:
                continue
            if not isinstance(result, dict):
                continue

            info = result.get("info")
            if not isinstance(info, dict):
                info = {}
            template_id = result.get("template-id") or "unknown"
            # A null severity falls back to "info" instead of the string "none".
            severity = str(info.get("severity") or "info").lower()
            host = result.get("host", "")
            matched_at = result.get("matched-at", "") or host

            classification = info.get("classification")
            raw_cwe = (
                classification.get("cwe-id")
                if isinstance(classification, dict)
                else None
            )
            cwe = self._normalize_cwe(raw_cwe)

            title = info.get("name") or template_id
            description_parts = []
            if info.get("description"):
                description_parts.append(str(info["description"]).strip())
            if matched_at:
                description_parts.append(f"Matched at: {matched_at}")
            description = "\n".join(description_parts) if description_parts else None

            evidence_str = f"nuclei:{template_id}:{matched_at}"
            evidence_hash = hashlib.sha256(evidence_str.encode()).hexdigest()
            location_fp = matched_at or host or template_id

            yield RawFinding(
                id=str(uuid.uuid4()),
                scan_task_id=scan_task_id,
                scan_id=scan_id,
                tool="nuclei",
                raw_severity=severity,
                title=title,
                description=description,
                file_path=None,
                line_start=None,
                line_end=None,
                url=matched_at or host or None,
                evidence=str(result.get("matcher-name", "")) or matched_at or None,
                evidence_quality=EvidenceQuality.STRUCTURED,
                evidence_hash=evidence_hash,
                cwe=cwe,
                location_fingerprint=location_fp,
                location_precision=LocationPrecision.ENDPOINT,
                parser_version=self.version,
                parser_confidence=self.confidence_tier,
                discovered_at=datetime.now(timezone.utc),
            )
class WaybackurlsParser:
    """Parse waybackurls output (one URL per line) into RawFindings.

    Every distinct ``http://`` or ``https://`` line becomes one informational
    finding; any other line is ignored.
    """

    name = "waybackurls"
    version = "1.0.0"
    confidence_tier = 0.4

    def validate(self, data: bytes) -> bool:
        """Any non-empty input with at least one URL-shaped line."""
        return any(
            raw_line.strip().startswith((b"http://", b"https://"))
            for raw_line in data.splitlines()
        )

    def parse(
        self,
        data: bytes,
        scan_id: str,
        scan_task_id: str,
    ) -> Iterator[RawFinding]:
        """Yield one finding per unique URL, in first-seen order.

        Lines are decoded as UTF-8 with undecodable bytes replaced, so
        decoding never raises and no exception handler is needed.
        """
        seen: set[str] = set()
        for raw_line in data.splitlines():
            url = raw_line.decode("utf-8", errors="replace").strip()
            if not url.startswith(("http://", "https://")) or url in seen:
                continue
            seen.add(url)

            evidence_hash = hashlib.sha256(f"waybackurls:{url}".encode()).hexdigest()

            yield RawFinding(
                id=str(uuid.uuid4()),
                scan_task_id=scan_task_id,
                scan_id=scan_id,
                tool="waybackurls",
                raw_severity="info",
                # The title is truncated to keep it short; the description
                # and url fields carry the full value.
                title=f"Historical URL: {url[:80]}",
                description=f"Discovered via Wayback Machine archives: {url}",
                file_path=None,
                url=url,
                evidence=url,
                evidence_quality=EvidenceQuality.HEURISTIC,
                evidence_hash=evidence_hash,
                cwe=None,
                location_fingerprint=url,
                location_precision=LocationPrecision.ENDPOINT,
                parser_version=self.version,
                parser_confidence=self.confidence_tier,
                discovered_at=datetime.now(timezone.utc),
            )
class WhatWebParser:
    """Parse WhatWeb JSON output into informational RawFindings.

    Each detected plugin of each target becomes one finding, except for a
    small set of structural plugins that are skipped as noise.
    """

    name = "whatweb"
    version = "1.0.0"
    confidence_tier = 0.9

    # Structural plugins that are skipped as noise.
    _SKIPPED_PLUGINS = frozenset({"HTTPServer", "IP", "Country", "RedirectLocation"})

    def validate(self, data: bytes) -> bool:
        """Accept if at least one target block parses as a JSON array of dicts.

        WhatWeb outputs one JSON array per invocation; when multiple targets
        are scanned back-to-back, the stdout contains multiple arrays
        concatenated line-by-line which is not valid single-document JSON.
        """
        return any(True for _ in self._iter_targets(data))

    @staticmethod
    def _split_top_level_arrays(text: str) -> Iterator[str]:
        """Yield the source text of each balanced top-level ``[...]`` block.

        Brackets inside double-quoted strings are ignored, and a backslash
        escapes the character after it. A closing bracket with no matching
        opener is skipped, so stray output cannot push the depth counter
        negative and hide every array that follows.
        """
        depth = 0
        start = -1
        in_string = False
        escape = False
        for i, ch in enumerate(text):
            if escape:
                escape = False
                continue
            if ch == "\\":
                escape = True
                continue
            if ch == '"':
                in_string = not in_string
                continue
            if in_string:
                continue
            if ch == "[":
                if depth == 0:
                    start = i
                depth += 1
            elif ch == "]":
                if depth == 0:
                    # Unmatched closer: ignore it.
                    continue
                depth -= 1
                if depth == 0:
                    yield text[start : i + 1]
                    start = -1

    @staticmethod
    def _iter_targets(data: bytes) -> Iterator[dict]:
        """Yield each target dict across possibly-concatenated JSON arrays.

        The whole blob is first tried as a single JSON array. If that fails,
        or the document is not an array, each balanced top-level ``[...]``
        block is parsed on its own and blocks that are not valid JSON are
        skipped.
        """
        text = data.decode("utf-8", errors="replace")

        # Fast path: single array
        try:
            parsed = orjson.loads(text)
        except orjson.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, list):
                for item in parsed:
                    if isinstance(item, dict):
                        yield item
                return

        # Slow path: parse each top-level block independently
        for block in WhatWebParser._split_top_level_arrays(text):
            try:
                parsed = orjson.loads(block)
            except orjson.JSONDecodeError:
                continue
            if isinstance(parsed, list):
                for item in parsed:
                    if isinstance(item, dict):
                        yield item

    def parse(
        self,
        data: bytes,
        scan_id: str,
        scan_task_id: str,
    ) -> Iterator[RawFinding]:
        """Yield one informational RawFinding per detected plugin per target."""
        for target in self._iter_targets(data):
            url = target.get("target") or target.get("uri") or "unknown"
            plugins = target.get("plugins")
            if not isinstance(plugins, dict):
                # Missing or malformed plugin map: nothing to report.
                continue

            for plugin_name, details in plugins.items():
                # Skip noisy structural plugins
                if plugin_name in self._SKIPPED_PLUGINS:
                    continue

                versions: list[str] = []
                accounts: list[str] = []
                if isinstance(details, dict):
                    raw_versions = details.get("version") or details.get("string") or []
                    if isinstance(raw_versions, list):
                        versions = [str(v) for v in raw_versions]
                    elif raw_versions:
                        versions = [str(raw_versions)]
                    if isinstance(details.get("account"), list):
                        accounts = [str(a) for a in details["account"]]

                version_str = f" {versions[0]}" if versions else ""
                title = f"Detected: {plugin_name}{version_str}"
                description_parts = [f"WhatWeb identified {plugin_name} at {url}"]
                if versions:
                    description_parts.append(f"Version(s): {', '.join(versions)}")
                if accounts:
                    description_parts.append(f"Account: {', '.join(accounts)}")
                description = ". ".join(description_parts)

                evidence_str = f"whatweb:{plugin_name}:{url}:{','.join(versions)}"
                evidence_hash = hashlib.sha256(evidence_str.encode()).hexdigest()

                yield RawFinding(
                    id=str(uuid.uuid4()),
                    scan_task_id=scan_task_id,
                    scan_id=scan_id,
                    tool="whatweb",
                    raw_severity="info",
                    title=title,
                    description=description,
                    file_path=None,
                    url=url,
                    evidence=str(versions) if versions else plugin_name,
                    evidence_quality=EvidenceQuality.STRUCTURED,
                    evidence_hash=evidence_hash,
                    cwe=None,
                    location_fingerprint=f"{url}#{plugin_name}",
                    location_precision=LocationPrecision.ENDPOINT,
                    parser_version=self.version,
                    parser_confidence=self.confidence_tier,
                    discovered_at=datetime.now(timezone.utc),
                )
b/packages/cli/src/opentools/scanner/profiles/web_quick.yaml index 0c0d494..3237611 100644 --- a/packages/cli/src/opentools/scanner/profiles/web_quick.yaml +++ b/packages/cli/src/opentools/scanner/profiles/web_quick.yaml @@ -9,7 +9,7 @@ phases: tools: - tool: whatweb task_type: shell - command_template: "whatweb --color=never --log-json=- {target}" + command_template: "docker exec whatweb-mcp sh -c \"whatweb --color=never --log-json=/tmp/whatweb.json {target} > /dev/null 2>&1; cat /tmp/whatweb.json\"" parser: whatweb priority: 10 tier: fast @@ -17,7 +17,7 @@ phases: preferred_output_format: json - tool: waybackurls task_type: shell - command_template: "echo {target_host} | waybackurls" + command_template: "docker exec waybackurls-mcp sh -c \"echo {target_host} | waybackurls\"" parser: waybackurls priority: 20 tier: fast @@ -27,7 +27,7 @@ phases: tools: - tool: nuclei task_type: shell - command_template: "nuclei -u {target} -json" + command_template: "docker exec nuclei-mcp nuclei -u {target} -jsonl -silent -severity critical,high,medium,low,info" parser: nuclei priority: 30 tier: normal @@ -35,7 +35,7 @@ phases: preferred_output_format: json - tool: nikto task_type: shell - command_template: "nikto -h {target} -Format json" + command_template: "docker exec nikto-mcp sh -c \"nikto -h {target} -Format json -output /tmp/nikto.json -maxtime 180 >/dev/null 2>&1; cat /tmp/nikto.json\"" parser: nikto priority: 40 tier: normal diff --git a/packages/cli/src/opentools/scanner/scan_cli.py b/packages/cli/src/opentools/scanner/scan_cli.py index 2e2063e..8128ef3 100644 --- a/packages/cli/src/opentools/scanner/scan_cli.py +++ b/packages/cli/src/opentools/scanner/scan_cli.py @@ -223,7 +223,28 @@ async def scan_run( for t in tasks: await store.save_task(t) - result = await api.execute(scan, tasks) + result = await api.execute(scan, tasks, store=store) + + # Populate finding_count from the pipeline-persisted deduplicated + # findings. 
async def save_scan(self, scan: Scan) -> None:
    """Upsert a scan record (JSON blob).

    Idempotent — safe to call both during plan (initial persist) and
    after execute (terminal state persist).

    Uses ``INSERT ... ON CONFLICT(id) DO UPDATE`` rather than
    ``INSERT OR REPLACE``: REPLACE resolves a conflict by deleting the
    existing row and inserting a new one, whereas the upsert form updates
    the existing row in place.
    NOTE(review): assumes ``scan.id`` is the table's PRIMARY KEY or has a
    UNIQUE constraint, and that the bundled SQLite is 3.24 or newer — confirm
    against the schema.
    """
    conn = self._require_conn()
    await conn.execute(
        "INSERT INTO scan (id, data) VALUES (?, ?) "
        "ON CONFLICT(id) DO UPDATE SET data = excluded.data",
        (scan.id, scan.model_dump_json()),
    )
    await conn.commit()


async def save_task(self, task: ScanTask) -> None:
    """Upsert a task record (JSON blob).

    Idempotent — safe to call before execution (to persist planned state)
    and after (to persist terminal state with stdout/stderr/exit_code).

    Updates the existing row in place on an id conflict, for the same
    reason as ``save_scan``.
    NOTE(review): assumes ``scan_task.id`` is the table's PRIMARY KEY or has
    a UNIQUE constraint — confirm against the schema.
    """
    conn = self._require_conn()
    await conn.execute(
        "INSERT INTO scan_task (id, scan_id, data) VALUES (?, ?, ?) "
        "ON CONFLICT(id) DO UPDATE SET "
        "scan_id = excluded.scan_id, data = excluded.data",
        (task.id, task.scan_id, task.model_dump_json()),
    )
    await conn.commit()