From ffd196e941942b046c934d5778eab55dada603a7 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 27 Apr 2026 11:56:06 +0300 Subject: [PATCH 01/12] ssh: pass configured key to paramiko, fix host_keys arg Two related bugs in SSHClient._connect: 1. ssh.connect() never received key_filename, so paramiko fell back to the SSH agent and hardcoded defaults (~/.ssh/id_rsa, id_ed25519, ...), silently ignoring servers[]['key'] from settings.py. With no agent running, this manifests as "Incorrect padding" when the default id_rsa is in OpenSSH format. 2. load_system_host_keys(filename=self.key) was passing the private-key path as a known_hosts file, corrupting the host_keys store. The call was masked by AutoAddPolicy() but is meaningless. Drop the argument so paramiko loads the real system known_hosts. Same fix as origin/AzureServer (a53d6baa), applied surgically. --- arc/job/ssh.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arc/job/ssh.py b/arc/job/ssh.py index 9431c4c1fe..9ea31da745 100644 --- a/arc/job/ssh.py +++ b/arc/job/ssh.py @@ -370,16 +370,16 @@ def _connect(self) -> tuple[paramiko.sftp_client.SFTPClient, paramiko.SSHClient] """ ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh.load_system_host_keys(filename=self.key) + ssh.load_system_host_keys() try: # If the server accepts the connection but the SSH daemon doesn't respond in # 15 seconds (default in paramiko) due to network congestion, faulty switches, # etc..., common solution is enlarging the timeout variable. - ssh.connect(hostname=self.address, username=self.un, banner_timeout=200) + ssh.connect(hostname=self.address, username=self.un, banner_timeout=200, key_filename=self.key) except: # This sometimes gives "SSHException: Error reading SSH protocol banner[Error 104] Connection reset by peer" # Try again: - ssh.connect(hostname=self.address, username=self.un, banner_timeout=200) + ssh.connect(hostname=self.address, username=self.un, banner_timeout=200, key_filename=self.key) sftp = ssh.open_sftp() return sftp, ssh From 3c351015dd8534efe5f7f369535660f7a58df41c Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 27 Apr 2026 11:56:15 +0300 Subject: [PATCH 02/12] tckdb: add integration module and ARC.py wiring Adds arc/tckdb/ package with: - config.py: TCKDBConfig dataclass parsed from input YAML 'tckdb' block - adapter.py: HTTP adapter for posting payloads to a TCKDB instance - payload_writer.py: serializes ARC species/reaction artifacts to disk - idempotency.py: avoids re-uploading completed jobs - *_test.py: unit tests for each module ARC.py reads the optional 'tckdb' key from the input YAML, builds a TCKDBConfig, and attaches it to the ARC object before execute(). When absent, behavior is unchanged. Docs: docs/tckdb-integration.md. 
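For reference, a minimal 'tckdb' input block (keys as defined in
arc/tckdb/config.py; the URL value is illustrative):

    tckdb:
      enabled: true
      base_url: "http://localhost:8000/api/v1"
      api_key_env: "TCKDB_API_KEY"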
--- ARC.py | 7 + arc/tckdb/__init__.py | 10 + arc/tckdb/adapter.py | 457 +++++++++++++++++++++++++++++++ arc/tckdb/adapter_test.py | 339 +++++++++++++++++++++++ arc/tckdb/config.py | 75 +++++ arc/tckdb/config_test.py | 77 ++++++ arc/tckdb/idempotency.py | 93 +++++++ arc/tckdb/idempotency_test.py | 65 +++++ arc/tckdb/payload_writer.py | 145 ++++++++++ arc/tckdb/payload_writer_test.py | 91 ++++++ docs/tckdb-integration.md | 216 +++++++++++++++ 11 files changed, 1575 insertions(+) create mode 100644 arc/tckdb/__init__.py create mode 100644 arc/tckdb/adapter.py create mode 100644 arc/tckdb/adapter_test.py create mode 100644 arc/tckdb/config.py create mode 100644 arc/tckdb/config_test.py create mode 100644 arc/tckdb/idempotency.py create mode 100644 arc/tckdb/idempotency_test.py create mode 100644 arc/tckdb/payload_writer.py create mode 100644 arc/tckdb/payload_writer_test.py create mode 100644 docs/tckdb-integration.md diff --git a/ARC.py b/ARC.py index 0707ec2130..c1455e312b 100644 --- a/ARC.py +++ b/ARC.py @@ -11,6 +11,7 @@ from arc.common import read_yaml_file from arc.main import ARC +from arc.tckdb.config import TCKDBConfig def parse_command_line_arguments(command_line_args=None): @@ -59,7 +60,13 @@ def main(): input_dict['verbose'] = input_dict['verbose'] if 'verbose' in input_dict else verbose if 'project_directory' not in input_dict or not input_dict['project_directory']: input_dict['project_directory'] = project_directory + + tckdb_config = TCKDBConfig.from_dict(input_dict.pop('tckdb', None)) + arc_object = ARC(**input_dict) + arc_object.tckdb_config = tckdb_config + if tckdb_config is not None: + logging.info('TCKDB integration enabled: %s', tckdb_config.base_url) arc_object.execute() diff --git a/arc/tckdb/__init__.py b/arc/tckdb/__init__.py new file mode 100644 index 0000000000..af006428c5 --- /dev/null +++ b/arc/tckdb/__init__.py @@ -0,0 +1,10 @@ +"""ARC-side TCKDB integration: build, write, and optionally upload conformer/calculation payloads. + +The chemistry/provenance mapping lives here in ARC. Transport lives in +``tckdb-client``. Server-side validation/persistence lives in TCKDB. +""" + +from arc.tckdb.adapter import TCKDBAdapter, UploadOutcome +from arc.tckdb.config import TCKDBConfig + +__all__ = ["TCKDBAdapter", "TCKDBConfig", "UploadOutcome"] diff --git a/arc/tckdb/adapter.py b/arc/tckdb/adapter.py new file mode 100644 index 0000000000..9ef552b487 --- /dev/null +++ b/arc/tckdb/adapter.py @@ -0,0 +1,457 @@ +"""Adapter that turns ARC objects into TCKDB conformer-upload payloads. + +The adapter is the only ARC module that knows the shape of a TCKDB +upload. It builds a JSON payload matching ``ConformerUploadRequest``, +writes it to disk, and (optionally) hands it to ``tckdb-client``. + +Three guarantees: + +1. If the adapter is disabled or no config is provided, it is a no-op. +2. The payload is on disk *before* any network call. Replay tooling + only needs ``payload_file + endpoint + idempotency_key``. +3. By default, an upload failure is logged + recorded in the sidecar + but does not raise. ``strict=True`` flips that. 
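+
+Typical driving code, as a minimal sketch (``config`` is a built
+:class:`TCKDBConfig`; ``species`` and ``level`` are assumed ARC objects
+carrying a SMILES, coordinates, and a level-of-theory method)::
+
+    adapter = TCKDBAdapter(config, project_directory=project_dir)
+    outcome = adapter.submit_conformer(species=species, level=level)
+    if outcome is not None and outcome.status == "failed":
+        ...  # non-strict mode: the failure is recorded in the sidecar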
+""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Mapping + +from arc.tckdb.config import TCKDBConfig +from arc.tckdb.idempotency import IdempotencyInputs, build_idempotency_key +from arc.tckdb.payload_writer import ( + PayloadWriter, + SidecarMetadata, + WrittenPayload, +) + + +logger = logging.getLogger("arc") + +CONFORMER_UPLOAD_ENDPOINT = "/uploads/conformers" +PAYLOAD_KIND = "conformer_calculation" + + +@dataclass(frozen=True) +class UploadOutcome: + """Result of one adapter invocation; mirrors the sidecar status.""" + + status: str # pending | uploaded | failed | skipped + payload_path: Path + sidecar_path: Path + idempotency_key: str + error: str | None = None + response: Any = None + + +class TCKDBAdapter: + """Build, write, and optionally upload one conformer/calculation payload. + + The adapter holds the config and a payload writer. ``client_factory`` + is overridable so tests can inject a mocked ``TCKDBClient`` without + touching the network. + """ + + def __init__( + self, + config: TCKDBConfig, + *, + project_directory: str | Path | None = None, + client_factory=None, + ): + self._config = config + # Resolve payload_dir against the project directory if it's relative, + # so payloads land under the active ARC project rather than CWD. + payload_root = Path(config.payload_dir) + if not payload_root.is_absolute() and project_directory is not None: + payload_root = Path(project_directory) / payload_root + self._writer = PayloadWriter(payload_root) + self._client_factory = client_factory + + # ------------------------------------------------------------------ + # Public entry points + # ------------------------------------------------------------------ + + def submit_conformer( + self, + *, + species, + level, + xyz: str | Mapping[str, Any] | None = None, + conformer_index: int = 0, + calculation_type: str = "opt", + calculation_quality: str = "raw", + opt_result: Mapping[str, Any] | None = None, + freq_result: Mapping[str, Any] | None = None, + sp_result: Mapping[str, Any] | None = None, + arc_version: str | None = None, + arc_git_commit: str | None = None, + extra_label: str | None = None, + ) -> UploadOutcome | None: + """Build, write, and (if configured) upload one conformer payload. + + Returns ``None`` if the adapter is disabled (so callers can write + ``adapter.submit_conformer(...)`` without an enabled-check). 
+ """ + if not self._config.enabled: + return None + + payload = self._build_payload( + species=species, + level=level, + xyz=xyz, + calculation_type=calculation_type, + calculation_quality=calculation_quality, + opt_result=opt_result, + freq_result=freq_result, + sp_result=sp_result, + arc_version=arc_version, + arc_git_commit=arc_git_commit, + ) + + species_label = getattr(species, "label", None) or "unlabeled" + conformer_label = extra_label or f"conf{conformer_index}" + idempotency_inputs = IdempotencyInputs.from_payload( + project_label=self._config.project_label, + species_label=species_label, + conformer_label=conformer_label, + payload_kind=PAYLOAD_KIND, + payload=payload, + ) + idempotency_key = build_idempotency_key(idempotency_inputs) + + written = self._writer.write( + label=f"{species_label}.{conformer_label}", + payload=payload, + endpoint=CONFORMER_UPLOAD_ENDPOINT, + idempotency_key=idempotency_key, + payload_kind=PAYLOAD_KIND, + base_url=self._config.base_url, + ) + logger.info( + "TCKDB payload written: %s (key=%s)", + written.payload_path, + idempotency_key, + ) + + if not self._config.upload: + return self._finalize_skipped(written) + + return self._upload(written, payload) + + # ------------------------------------------------------------------ + # Payload construction + # ------------------------------------------------------------------ + + def _build_payload( + self, + *, + species, + level, + xyz, + calculation_type: str, + calculation_quality: str, + opt_result: Mapping[str, Any] | None, + freq_result: Mapping[str, Any] | None, + sp_result: Mapping[str, Any] | None, + arc_version: str | None, + arc_git_commit: str | None, + ) -> dict[str, Any]: + """Construct the JSON payload accepted by ``ConformerUploadRequest``. + + Designed to fail soft on missing optional ARC attributes — many + ARC species objects in mid-flight don't carry every field. + Required-by-schema fields (smiles, charge, multiplicity, xyz_text, + method, software name) come from the species/level objects. + """ + species_entry = self._species_entry_payload(species) + geometry_payload = {"xyz_text": _coerce_xyz_text(xyz, species)} + calculation_payload = self._calculation_payload( + level=level, + calculation_type=calculation_type, + calculation_quality=calculation_quality, + opt_result=opt_result, + freq_result=freq_result, + sp_result=sp_result, + arc_version=arc_version, + arc_git_commit=arc_git_commit, + ) + + payload: dict[str, Any] = { + "species_entry": species_entry, + "geometry": geometry_payload, + "calculation": calculation_payload, + "scientific_origin": "computed", + } + label = getattr(species, "label", None) + if label: + payload["label"] = str(label)[:64] + return payload + + @staticmethod + def _species_entry_payload(species) -> dict[str, Any]: + smiles = _resolve_smiles(species) + if not smiles: + raise ValueError( + "TCKDB upload requires a SMILES on the species; " + f"got species.label={getattr(species, 'label', None)!r} with no resolvable SMILES." 
+ ) + is_ts = bool(getattr(species, "is_ts", False)) + return { + "molecule_kind": "molecule", + "smiles": smiles, + "charge": int(getattr(species, "charge", 0) or 0), + "multiplicity": int(getattr(species, "multiplicity", 1) or 1), + "species_entry_kind": "transition_state" if is_ts else "minimum", + } + + @staticmethod + def _calculation_payload( + *, + level, + calculation_type: str, + calculation_quality: str, + opt_result: Mapping[str, Any] | None, + freq_result: Mapping[str, Any] | None, + sp_result: Mapping[str, Any] | None, + arc_version: str | None, + arc_git_commit: str | None, + ) -> dict[str, Any]: + method = getattr(level, "method", None) if level is not None else None + if not method: + raise ValueError("TCKDB upload requires a level-of-theory method.") + software_name = getattr(level, "software", None) if level is not None else None + if not software_name: + raise ValueError("TCKDB upload requires the ESS name (level.software).") + + level_of_theory: dict[str, Any] = {"method": method} + for src, dst in ( + ("basis", "basis"), + ("auxiliary_basis", "aux_basis"), + ("cabs", "cabs_basis"), + ("dispersion", "dispersion"), + ("solvent", "solvent"), + ("solvation_method", "solvent_model"), + ): + value = getattr(level, src, None) + if value: + level_of_theory[dst] = value + + software_release: dict[str, Any] = {"name": software_name} + version = getattr(level, "software_version", None) + if version is not None: + software_release["version"] = str(version) + + calc: dict[str, Any] = { + "type": calculation_type, + "quality": calculation_quality, + "software_release": software_release, + "level_of_theory": level_of_theory, + } + + if arc_version or arc_git_commit: + wt: dict[str, Any] = {"name": "ARC"} + if arc_version: + wt["version"] = arc_version + if arc_git_commit: + wt["git_commit"] = arc_git_commit + calc["workflow_tool_release"] = wt + + if calculation_type == "opt" and opt_result: + calc["opt_result"] = dict(opt_result) + if calculation_type == "freq" and freq_result: + calc["freq_result"] = dict(freq_result) + if calculation_type == "sp" and sp_result: + calc["sp_result"] = dict(sp_result) + + return calc + + # ------------------------------------------------------------------ + # Upload + sidecar finalization + # ------------------------------------------------------------------ + + def _finalize_skipped(self, written: WrittenPayload) -> UploadOutcome: + sc = written.sidecar + sc.status = "skipped" + self._writer.update_sidecar(written.sidecar_path, sc) + logger.info("TCKDB upload skipped (upload=false): %s", written.payload_path) + return UploadOutcome( + status="skipped", + payload_path=written.payload_path, + sidecar_path=written.sidecar_path, + idempotency_key=sc.idempotency_key, + ) + + def _upload(self, written: WrittenPayload, payload: dict[str, Any]) -> UploadOutcome: + from arc.tckdb.payload_writer import _utcnow_iso # local import keeps deps tidy + + sc = written.sidecar + api_key = self._config.resolve_api_key() + if not api_key: + msg = ( + f"TCKDB API key env var '{self._config.api_key_env}' is not set; " + "skipping network call and recording sidecar as failed." 
+ ) + return self._record_failure(written, msg, raised=ValueError(msg)) + + try: + client = self._make_client(api_key) + except Exception as exc: # pragma: no cover - defensive + return self._record_failure(written, f"client init failed: {exc}", exc) + + try: + response = client.request_json( + "POST", + CONFORMER_UPLOAD_ENDPOINT, + json=payload, + idempotency_key=sc.idempotency_key, + ) + except Exception as exc: + client_close = getattr(client, "close", None) + if callable(client_close): + try: + client_close() + except Exception: # pragma: no cover - close errors swallowed + logger.debug("TCKDB client close errored after upload failure", exc_info=True) + return self._record_failure(written, str(exc), exc) + else: + client_close = getattr(client, "close", None) + if callable(client_close): + try: + client_close() + except Exception: # pragma: no cover - close errors swallowed + logger.debug("TCKDB client close errored after upload success", exc_info=True) + + sc.status = "uploaded" + sc.uploaded_at = _utcnow_iso() + sc.response_status_code = getattr(response, "status_code", None) + sc.response_body = _summarize_response_body(getattr(response, "data", None)) + sc.idempotency_replayed = bool(getattr(response, "idempotency_replayed", False)) + sc.last_error = None + self._writer.update_sidecar(written.sidecar_path, sc) + if sc.idempotency_replayed: + logger.info( + "TCKDB upload replayed (idempotent): %s key=%s", + written.payload_path, + sc.idempotency_key, + ) + else: + logger.info( + "TCKDB upload succeeded: %s key=%s", + written.payload_path, + sc.idempotency_key, + ) + return UploadOutcome( + status="uploaded", + payload_path=written.payload_path, + sidecar_path=written.sidecar_path, + idempotency_key=sc.idempotency_key, + response=sc.response_body, + ) + + def _record_failure( + self, written: WrittenPayload, message: str, raised: BaseException + ) -> UploadOutcome: + sc = written.sidecar + sc.status = "failed" + sc.last_error = message + self._writer.update_sidecar(written.sidecar_path, sc) + logger.warning( + "TCKDB upload failed (strict=%s): %s key=%s err=%s", + self._config.strict, + written.payload_path, + sc.idempotency_key, + message, + ) + if self._config.strict: + raise raised + return UploadOutcome( + status="failed", + payload_path=written.payload_path, + sidecar_path=written.sidecar_path, + idempotency_key=sc.idempotency_key, + error=message, + ) + + def _make_client(self, api_key: str): + if self._client_factory is not None: + return self._client_factory(self._config, api_key) + # Lazy import: keep tckdb_client out of the import path when the + # adapter is unused. 
+ from tckdb_client import TCKDBClient + + return TCKDBClient( + self._config.base_url, + api_key=api_key, + timeout=self._config.timeout_seconds, + ) + + +# ---------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------- + + +def _resolve_smiles(species) -> str | None: + """Best-effort SMILES extraction from an ARC species or duck-typed stand-in.""" + smiles = getattr(species, "smiles", None) + if smiles: + return str(smiles) + mol = getattr(species, "mol", None) + if mol is None: + return None + to_smiles = getattr(mol, "to_smiles", None) or getattr(mol, "smiles", None) + if callable(to_smiles): + try: + return str(to_smiles()) + except Exception: + logger.debug("mol.to_smiles() raised; falling back to None", exc_info=True) + return None + if isinstance(to_smiles, str): + return to_smiles + return None + + +def _coerce_xyz_text(xyz, species) -> str: + """Accept xyz as a string or ARC xyz dict; fall back to species attrs.""" + candidate = xyz + if candidate is None: + candidate = ( + getattr(species, "final_xyz", None) + or getattr(species, "initial_xyz", None) + ) + if candidate is None: + raise ValueError("TCKDB upload requires xyz coordinates.") + if isinstance(candidate, str): + text = candidate.strip() + if not text: + raise ValueError("TCKDB upload requires non-empty xyz coordinates.") + return text + # ARC xyz dict: {'symbols': (...), 'coords': ((x,y,z),...), ...} + symbols = candidate.get("symbols") if isinstance(candidate, Mapping) else None + coords = candidate.get("coords") if isinstance(candidate, Mapping) else None + if not symbols or not coords: + raise ValueError(f"Unrecognized xyz container: {type(candidate).__name__}") + lines = [ + f"{sym} {x:.8f} {y:.8f} {z:.8f}" + for sym, (x, y, z) in zip(symbols, coords) + ] + return "\n".join(lines) + + +def _summarize_response_body(body: Any, *, max_chars: int = 2000) -> Any: + """Truncate huge bodies so the sidecar stays small and grep-friendly.""" + if body is None: + return None + if isinstance(body, (dict, list)): + return body + text = str(body) + if len(text) > max_chars: + return text[:max_chars] + "..." + return text + + +__all__ = ["TCKDBAdapter", "UploadOutcome", "CONFORMER_UPLOAD_ENDPOINT", "PAYLOAD_KIND"] diff --git a/arc/tckdb/adapter_test.py b/arc/tckdb/adapter_test.py new file mode 100644 index 0000000000..4ead0b7c36 --- /dev/null +++ b/arc/tckdb/adapter_test.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +"""Unit tests for arc.tckdb.adapter. + +These tests do not require a live TCKDB server. The TCKDBClient is +replaced by a stub via the adapter's ``client_factory`` parameter. 
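+
+The injection pattern, in short (``cfg`` is any enabled config; the
+factory ignores the resolved API key)::
+
+    stub = _StubClient(response=_StubResponse({"id": 1}))
+    adapter = TCKDBAdapter(cfg, client_factory=lambda cfg, key: stub)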
+""" + +import json +import os +import shutil +import tempfile +import unittest +from types import SimpleNamespace +from unittest import mock + +from arc.tckdb.adapter import ( + CONFORMER_UPLOAD_ENDPOINT, + PAYLOAD_KIND, + TCKDBAdapter, + UploadOutcome, +) +from arc.tckdb.config import TCKDBConfig + + +# --------------------------------------------------------------------------- +# Stubs +# --------------------------------------------------------------------------- + + +class _StubResponse: + def __init__(self, data, status_code=201, replayed=False): + self.data = data + self.status_code = status_code + self.idempotency_replayed = replayed + + +class _StubClient: + """Minimal TCKDBClient lookalike for adapter tests.""" + + def __init__(self, *, response=None, raise_exc=None): + self._response = response + self._raise_exc = raise_exc + self.calls = [] + self.closed = False + + def request_json(self, method, path, *, json=None, idempotency_key=None): + self.calls.append( + dict(method=method, path=path, json=json, idempotency_key=idempotency_key) + ) + if self._raise_exc is not None: + raise self._raise_exc + return self._response + + def close(self): + self.closed = True + + +def _fake_species(label="ethanol", smiles="CCO", charge=0, multiplicity=1, is_ts=False): + return SimpleNamespace( + label=label, + smiles=smiles, + charge=charge, + multiplicity=multiplicity, + is_ts=is_ts, + final_xyz="C 0.0 0.0 0.0\nH 1.0 0.0 0.0", + ) + + +def _fake_level(): + return SimpleNamespace( + method="wb97xd", + basis="def2-tzvp", + auxiliary_basis=None, + cabs=None, + dispersion=None, + solvation_method=None, + solvent=None, + software="gaussian", + software_version="16", + ) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestAdapterDisabled(unittest.TestCase): + """Test 1: disabled config does nothing.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + + def test_disabled_returns_none_and_writes_nothing(self): + cfg = TCKDBConfig(enabled=False, base_url="http://x", payload_dir=self.tmp) + client = _StubClient(response=_StubResponse({"ok": True})) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + self.assertIsNone(outcome) + self.assertEqual(os.listdir(self.tmp), []) + self.assertEqual(client.calls, []) + + +class TestAdapterPayloadAndUpload(unittest.TestCase): + """Tests 2, 3, 9, 10, 11: payload + sidecar + replay metadata + no DB IDs.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + ) + + def _adapter(self, client): + return TCKDBAdapter(self.cfg, client_factory=lambda c, k: client) + + def test_payload_written_before_failed_upload(self): + """Test 2: payload written even when upload fails.""" + client = _StubClient(raise_exc=RuntimeError("network down")) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + + self.assertEqual(outcome.status, "failed") + self.assertTrue(outcome.payload_path.exists()) 
+ self.assertTrue(outcome.sidecar_path.exists()) + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["status"], "failed") + self.assertIn("network down", sc["last_error"]) + # Payload file is intact and parseable + json.loads(outcome.payload_path.read_text()) + + def test_upload_success_updates_sidecar(self): + """Test 3: upload success -> sidecar status = uploaded.""" + client = _StubClient(response=_StubResponse({"conformer_observation_id": 42})) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + + self.assertEqual(outcome.status, "uploaded") + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["status"], "uploaded") + self.assertIsNotNone(sc["uploaded_at"]) + self.assertEqual(sc["response_status_code"], 201) + self.assertEqual(sc["idempotency_replayed"], False) + # Test 11: replay-ready metadata present + self.assertEqual(sc["endpoint"], CONFORMER_UPLOAD_ENDPOINT) + self.assertEqual(sc["payload_kind"], PAYLOAD_KIND) + self.assertEqual(sc["payload_file"], str(outcome.payload_path)) + self.assertTrue(sc["idempotency_key"]) + self.assertEqual(sc["base_url"], self.cfg.base_url) + + def test_upload_records_idempotency_replay(self): + client = _StubClient( + response=_StubResponse({"conformer_observation_id": 42}, replayed=True) + ) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertTrue(sc["idempotency_replayed"]) + + def test_payload_contains_no_db_ids(self): + """Test 9: payload must not include raw TCKDB DB IDs.""" + client = _StubClient(response=_StubResponse({"id": 1})) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + flat = outcome.payload_path.read_text() + for forbidden in ( + '"species_id"', + '"species_entry_id"', + '"calculation_id"', + '"conformer_observation_id"', + '"literature_id"', + '"software_release_id"', + '"workflow_tool_release_id"', + ): + self.assertNotIn(forbidden, flat, msg=f"raw DB id in payload: {forbidden}") + + def test_payload_validates_against_expected_shape(self): + """Test 10: payload has the conformer-upload top-level shape.""" + client = _StubClient(response=_StubResponse({"id": 1})) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + payload = json.loads(outcome.payload_path.read_text()) + self.assertIn("species_entry", payload) + self.assertIn("geometry", payload) + self.assertIn("calculation", payload) + self.assertEqual(payload["species_entry"]["smiles"], "CCO") + self.assertEqual(payload["geometry"]["xyz_text"], "C 0.0 0.0 0.0\nH 1.0 0.0 0.0") + self.assertEqual(payload["calculation"]["type"], "opt") + self.assertEqual(payload["calculation"]["software_release"]["name"], "gaussian") + self.assertEqual(payload["calculation"]["software_release"]["version"], "16") + self.assertEqual(payload["calculation"]["level_of_theory"]["method"], "wb97xd") + self.assertEqual(payload["calculation"]["level_of_theory"]["basis"], "def2-tzvp") + + +class TestAdapterSkipped(unittest.TestCase): + """Test 4: upload=false -> payload written, 
sidecar skipped, no network.""" + + def test_upload_skipped_writes_payload_no_call(self): + tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, tmp, ignore_errors=True) + cfg = TCKDBConfig( + enabled=True, + base_url="http://x", + payload_dir=tmp, + upload=False, + ) + client = _StubClient(response=_StubResponse({"ok": True})) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + self.assertEqual(outcome.status, "skipped") + self.assertTrue(outcome.payload_path.exists()) + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["status"], "skipped") + self.assertEqual(client.calls, []) + + +class TestAdapterStrict(unittest.TestCase): + """Tests 5, 6: strict raises; non-strict swallows.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + + def test_strict_mode_raises_and_records(self): + cfg = TCKDBConfig( + enabled=True, + base_url="http://x", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + strict=True, + ) + client = _StubClient(raise_exc=RuntimeError("503")) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + with self.assertRaises(RuntimeError): + adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + # Sidecar still written, status failed + files = os.listdir(os.path.join(self.tmp, "conformer_calculation")) + sidecar = [f for f in files if f.endswith(".meta.json")][0] + sc = json.loads(open(os.path.join(self.tmp, "conformer_calculation", sidecar)).read()) + self.assertEqual(sc["status"], "failed") + + def test_non_strict_does_not_raise(self): + cfg = TCKDBConfig( + enabled=True, + base_url="http://x", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + strict=False, + ) + client = _StubClient(raise_exc=RuntimeError("503")) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + self.assertEqual(outcome.status, "failed") + + +class TestAdapterIdempotency(unittest.TestCase): + """Test 7: same logical input -> same key; changed input -> different key.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://x", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + ) + + def test_idempotency_key_stable_and_distinct(self): + client_a = _StubClient(response=_StubResponse({"id": 1})) + adapter = TCKDBAdapter(self.cfg, client_factory=lambda c, k: client_a) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + o1 = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + o2 = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + o3 = adapter.submit_conformer( + species=_fake_species(label="methanol", smiles="CO"), level=_fake_level() + ) + self.assertEqual(o1.idempotency_key, o2.idempotency_key) + self.assertNotEqual(o1.idempotency_key, o3.idempotency_key) + # The header sent must match the recorded key. 
+ sent_keys = {call["idempotency_key"] for call in client_a.calls} + self.assertEqual(sent_keys, {o1.idempotency_key, o3.idempotency_key}) + + +class TestAdapterApiKey(unittest.TestCase): + """Test 8: missing API key with upload=true produces failed sidecar (no raise).""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + + def test_missing_api_key_records_failure(self): + cfg = TCKDBConfig( + enabled=True, + base_url="http://x", + payload_dir=self.tmp, + api_key_env="DEFINITELY_NOT_SET_X_X", + ) + client = _StubClient(response=_StubResponse({"ok": True})) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + os.environ.pop("DEFINITELY_NOT_SET_X_X", None) + outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + self.assertEqual(outcome.status, "failed") + self.assertEqual(client.calls, []) # never called the network + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertIn("DEFINITELY_NOT_SET_X_X", sc["last_error"]) + + def test_missing_api_key_strict_raises(self): + cfg = TCKDBConfig( + enabled=True, + base_url="http://x", + payload_dir=self.tmp, + api_key_env="DEFINITELY_NOT_SET_X_X", + strict=True, + ) + client = _StubClient(response=_StubResponse({"ok": True})) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + os.environ.pop("DEFINITELY_NOT_SET_X_X", None) + with self.assertRaises(ValueError): + adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + + +if __name__ == "__main__": + unittest.main() diff --git a/arc/tckdb/config.py b/arc/tckdb/config.py new file mode 100644 index 0000000000..8fd7c5ba0c --- /dev/null +++ b/arc/tckdb/config.py @@ -0,0 +1,75 @@ +"""Configuration for the ARC TCKDB adapter. + +The adapter is opt-in. If no ``tckdb`` block is present in the ARC input +(or ``enabled`` is false), :func:`TCKDBConfig.from_dict` returns ``None`` +and the adapter is a no-op. + +API keys are never read from input files. The config carries only the +*name* of the env var (``api_key_env``); the adapter resolves the key +at upload time. +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from typing import Any, Mapping + + +DEFAULT_PAYLOAD_DIR = "tckdb_payloads" +DEFAULT_TIMEOUT_SECONDS = 30.0 +DEFAULT_API_KEY_ENV = "TCKDB_API_KEY" + + +@dataclass(frozen=True) +class TCKDBConfig: + """ARC-side TCKDB adapter configuration. + + ``enabled`` is the master switch. ``upload`` controls whether the + adapter contacts the network at all — when false, payloads are + written to disk and the sidecar is marked ``skipped``. + + Strict mode raises on upload failure; the default is to log a + warning, record the error in the sidecar, and let ARC continue. + """ + + enabled: bool = False + base_url: str | None = None + api_key_env: str = DEFAULT_API_KEY_ENV + payload_dir: str = DEFAULT_PAYLOAD_DIR + upload: bool = True + strict: bool = False + timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS + project_label: str | None = field( + default=None, + metadata={"help": "Optional ARC project/run label baked into idempotency keys."}, + ) + + @classmethod + def from_dict(cls, raw: Mapping[str, Any] | None) -> "TCKDBConfig | None": + """Build a config from a raw mapping, or return ``None`` when disabled. + + Returning ``None`` for the disabled case lets callers write + ``if cfg is None: return`` rather than checking a flag. 
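+
+        Example (a minimal sketch; the URL value is illustrative)::
+
+            TCKDBConfig.from_dict({"enabled": False})  # -> None
+            cfg = TCKDBConfig.from_dict(
+                {"enabled": True, "base_url": "http://localhost:8000/api/v1"}
+            )  # -> TCKDBConfig with defaults for the remaining fields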
+ """ + if not raw: + return None + if not raw.get("enabled", False): + return None + base_url = raw.get("base_url") + if not isinstance(base_url, str) or not base_url: + raise ValueError("tckdb.base_url is required when tckdb.enabled is true.") + return cls( + enabled=True, + base_url=base_url, + api_key_env=str(raw.get("api_key_env", DEFAULT_API_KEY_ENV)), + payload_dir=str(raw.get("payload_dir", DEFAULT_PAYLOAD_DIR)), + upload=bool(raw.get("upload", True)), + strict=bool(raw.get("strict", False)), + timeout_seconds=float(raw.get("timeout_seconds", DEFAULT_TIMEOUT_SECONDS)), + project_label=raw.get("project_label"), + ) + + def resolve_api_key(self) -> str | None: + """Read the API key from the configured env var. Never logged.""" + return os.environ.get(self.api_key_env) diff --git a/arc/tckdb/config_test.py b/arc/tckdb/config_test.py new file mode 100644 index 0000000000..d33bb69267 --- /dev/null +++ b/arc/tckdb/config_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +"""Unit tests for arc.tckdb.config.""" + +import os +import unittest +from unittest import mock + +from arc.tckdb.config import ( + DEFAULT_API_KEY_ENV, + DEFAULT_PAYLOAD_DIR, + DEFAULT_TIMEOUT_SECONDS, + TCKDBConfig, +) + + +class TestTCKDBConfig(unittest.TestCase): + + def test_from_dict_returns_none_when_missing(self): + self.assertIsNone(TCKDBConfig.from_dict(None)) + self.assertIsNone(TCKDBConfig.from_dict({})) + + def test_from_dict_returns_none_when_disabled(self): + self.assertIsNone( + TCKDBConfig.from_dict({"enabled": False, "base_url": "http://x"}) + ) + + def test_from_dict_requires_base_url_when_enabled(self): + with self.assertRaises(ValueError): + TCKDBConfig.from_dict({"enabled": True}) + + def test_from_dict_uses_defaults(self): + cfg = TCKDBConfig.from_dict( + {"enabled": True, "base_url": "http://localhost:8000/api/v1"} + ) + self.assertIsNotNone(cfg) + self.assertTrue(cfg.enabled) + self.assertEqual(cfg.api_key_env, DEFAULT_API_KEY_ENV) + self.assertEqual(cfg.payload_dir, DEFAULT_PAYLOAD_DIR) + self.assertEqual(cfg.timeout_seconds, DEFAULT_TIMEOUT_SECONDS) + self.assertTrue(cfg.upload) + self.assertFalse(cfg.strict) + + def test_from_dict_overrides(self): + cfg = TCKDBConfig.from_dict( + { + "enabled": True, + "base_url": "http://srv/api/v1", + "api_key_env": "MY_KEY", + "payload_dir": "/tmp/payloads", + "upload": False, + "strict": True, + "timeout_seconds": 5, + "project_label": "proj-A", + } + ) + self.assertEqual(cfg.api_key_env, "MY_KEY") + self.assertEqual(cfg.payload_dir, "/tmp/payloads") + self.assertFalse(cfg.upload) + self.assertTrue(cfg.strict) + self.assertEqual(cfg.timeout_seconds, 5.0) + self.assertEqual(cfg.project_label, "proj-A") + + def test_resolve_api_key_from_env(self): + cfg = TCKDBConfig(enabled=True, base_url="http://x", api_key_env="X_TEST_KEY") + with mock.patch.dict(os.environ, {"X_TEST_KEY": "secret"}, clear=False): + self.assertEqual(cfg.resolve_api_key(), "secret") + + def test_resolve_api_key_missing(self): + cfg = TCKDBConfig(enabled=True, base_url="http://x", api_key_env="DOES_NOT_EXIST_X") + os.environ.pop("DOES_NOT_EXIST_X", None) + self.assertIsNone(cfg.resolve_api_key()) + + +if __name__ == "__main__": + unittest.main() diff --git a/arc/tckdb/idempotency.py b/arc/tckdb/idempotency.py new file mode 100644 index 0000000000..1004ecc0f6 --- /dev/null +++ b/arc/tckdb/idempotency.py @@ -0,0 +1,93 @@ +"""Idempotency-key generation for ARC -> TCKDB uploads. + +Wraps :func:`tckdb_client.make_idempotency_key`. 
The composition rules
+(*which* ARC-level facts go into the key) are intentionally kept here in
+ARC, because they decide what "the same logical upload" means. Stable
+across retries, distinct across logically-different inputs.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass(frozen=True)
+class IdempotencyInputs:
+    """Stable inputs that identify one logical conformer/calculation upload.
+
+    Every field here MUST be deterministic for the same ARC output —
+    no timestamps, no PIDs, no random suffixes. The combination is what
+    the TCKDB server uses to deduplicate replays.
+    """
+
+    project_label: str | None
+    species_label: str
+    conformer_label: str
+    payload_kind: str
+    payload_hash: str
+
+    @classmethod
+    def from_payload(
+        cls,
+        *,
+        project_label: str | None,
+        species_label: str,
+        conformer_label: str,
+        payload_kind: str,
+        payload: Any,
+    ) -> "IdempotencyInputs":
+        """Hash the canonical-JSON payload to produce a stable identity tail.
+
+        Sorted-key JSON is used so dict ordering does not change the
+        hash; this lets a re-built payload with the same content produce
+        the same key on retry.
+        """
+        canonical = json.dumps(payload, sort_keys=True, default=str).encode("utf-8")
+        digest = hashlib.sha256(canonical).hexdigest()[:16]
+        return cls(
+            project_label=project_label,
+            species_label=species_label,
+            conformer_label=conformer_label,
+            payload_kind=payload_kind,
+            payload_hash=digest,
+        )
+
+
+def build_idempotency_key(inputs: IdempotencyInputs) -> str:
+    """Compose a stable idempotency key from :class:`IdempotencyInputs`.
+
+    Shape: ``arc:<project_label>:<species_label>:<conformer_label>:<payload_kind>:<payload_hash>``.
+
+    - ``"arc"`` namespaces the key against other producers (e.g. RMG).
+    - ``project_label`` keeps two runs of the same species under
+      different projects from replaying each other.
+    - The trailing ``payload_hash`` makes content-changes produce a
+      *different* key, so edited geometries do not silently replay the
+      old upload.
+
+    ``make_idempotency_key`` sanitizes illegal characters and validates
+    the result against the server constraint
+    ``^[A-Za-z0-9._:-]{16,200}$``; callers don't need to pre-clean parts.
+    """
+    # Lazy import so arc.tckdb is importable when the adapter is unused
+    # and tckdb-client is not installed.
+ from tckdb_client import make_idempotency_key + + parts: list[str] = ["arc"] + if inputs.project_label: + parts.append(inputs.project_label) + parts.extend( + [ + inputs.species_label, + inputs.conformer_label, + inputs.payload_kind, + inputs.payload_hash, + ] + ) + return make_idempotency_key(*parts) + + +__all__ = ["IdempotencyInputs", "build_idempotency_key"] diff --git a/arc/tckdb/idempotency_test.py b/arc/tckdb/idempotency_test.py new file mode 100644 index 0000000000..135e745ad7 --- /dev/null +++ b/arc/tckdb/idempotency_test.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +"""Unit tests for arc.tckdb.idempotency.""" + +import re +import unittest + +from arc.tckdb.idempotency import IdempotencyInputs, build_idempotency_key + + +_KEY_PATTERN = re.compile(r"^[A-Za-z0-9._:\-]{16,200}$") + + +def _inputs(**overrides): + base = dict( + project_label="projA", + species_label="ethanol", + conformer_label="conf0", + payload_kind="conformer_calculation", + payload={"hello": "world"}, + ) + base.update(overrides) + return IdempotencyInputs.from_payload(**base) + + +class TestIdempotency(unittest.TestCase): + + def test_key_matches_server_pattern(self): + key = build_idempotency_key(_inputs()) + self.assertRegex(key, _KEY_PATTERN) + + def test_key_stable_across_calls(self): + a = build_idempotency_key(_inputs()) + b = build_idempotency_key(_inputs()) + self.assertEqual(a, b) + + def test_key_changes_on_payload_change(self): + a = build_idempotency_key(_inputs(payload={"v": 1})) + b = build_idempotency_key(_inputs(payload={"v": 2})) + self.assertNotEqual(a, b) + + def test_key_changes_on_species_change(self): + a = build_idempotency_key(_inputs(species_label="ethanol")) + b = build_idempotency_key(_inputs(species_label="methanol")) + self.assertNotEqual(a, b) + + def test_key_changes_on_project_change(self): + a = build_idempotency_key(_inputs(project_label="projA")) + b = build_idempotency_key(_inputs(project_label="projB")) + self.assertNotEqual(a, b) + + def test_no_project_label_still_works(self): + key = build_idempotency_key(_inputs(project_label=None)) + self.assertRegex(key, _KEY_PATTERN) + self.assertTrue(key.startswith("arc:")) + + def test_payload_dict_ordering_does_not_change_key(self): + a = build_idempotency_key(_inputs(payload={"a": 1, "b": 2})) + b = build_idempotency_key(_inputs(payload={"b": 2, "a": 1})) + self.assertEqual(a, b) + + +if __name__ == "__main__": + unittest.main() diff --git a/arc/tckdb/payload_writer.py b/arc/tckdb/payload_writer.py new file mode 100644 index 0000000000..e0c5e38c48 --- /dev/null +++ b/arc/tckdb/payload_writer.py @@ -0,0 +1,145 @@ +"""Write conformer/calculation payloads + sidecar metadata to disk. + +Two invariants: + +1. The payload JSON is written *once* and never rewritten — replay + tooling needs the exact bytes that were (or would have been) sent. +2. The sidecar JSON is written eagerly as ``status="pending"`` *before* + any network call, then updated in-place when the upload resolves. + That way a crash mid-upload leaves a clear ``pending`` record on + disk rather than no trace at all. 
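+
+The lifecycle, schematically (a sketch; statuses as recorded in
+:class:`SidecarMetadata`)::
+
+    write()       -> <label>.payload.json + <label>.meta.json  (status: pending)
+    upload ok     -> <label>.meta.json rewritten               (status: uploaded)
+    upload error  -> <label>.meta.json rewritten               (status: failed)
+    upload off    -> <label>.meta.json rewritten               (status: skipped)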
+""" + +from __future__ import annotations + +import json +import os +import re +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +_SAFE_LABEL = re.compile(r"[^A-Za-z0-9._-]+") + + +def _utcnow_iso() -> str: + """Z-suffixed UTC timestamp; the standard for sidecar timestamps.""" + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _safe_label(label: str) -> str: + """Sanitize a label for use as a filename component.""" + cleaned = _SAFE_LABEL.sub("-", label).strip("-.") or "unlabeled" + return cleaned[:120] + + +@dataclass +class SidecarMetadata: + """On-disk record of one upload attempt; updated in place after upload.""" + + payload_file: str + endpoint: str + idempotency_key: str + payload_kind: str + created_at: str = field(default_factory=_utcnow_iso) + uploaded_at: str | None = None + status: str = "pending" + response_status_code: int | None = None + response_body: Any = None + idempotency_replayed: bool | None = None + last_error: str | None = None + base_url: str | None = None + + def to_json(self) -> dict[str, Any]: + return asdict(self) + + +@dataclass(frozen=True) +class WrittenPayload: + """Handle returned by :meth:`PayloadWriter.write` for downstream upload + sidecar updates.""" + + payload_path: Path + sidecar_path: Path + sidecar: SidecarMetadata + + +class PayloadWriter: + """File-system surface for the conformer-calculation upload payload. + + Layout:: + + /conformer_calculation/.payload.json + /conformer_calculation/.meta.json + + The writer is intentionally dumb: no schema awareness, no upload + logic. The adapter composes the payload, hands it here, then drives + the upload separately. + """ + + SUBDIR = "conformer_calculation" + PAYLOAD_SUFFIX = ".payload.json" + SIDECAR_SUFFIX = ".meta.json" + + def __init__(self, root_dir: str | os.PathLike[str]): + self._root = Path(root_dir) + + @property + def root(self) -> Path: + return self._root + + def write( + self, + *, + label: str, + payload: Any, + endpoint: str, + idempotency_key: str, + payload_kind: str = "conformer_calculation", + base_url: str | None = None, + ) -> WrittenPayload: + """Write payload JSON and an initial ``pending`` sidecar atomically. + + Returns a :class:`WrittenPayload` carrying both paths and the + in-memory sidecar dataclass. Callers update the sidecar via + :meth:`update_sidecar` after the upload resolves. 
+ """ + directory = self._root / self.SUBDIR + directory.mkdir(parents=True, exist_ok=True) + safe = _safe_label(label) + payload_path = directory / f"{safe}{self.PAYLOAD_SUFFIX}" + sidecar_path = directory / f"{safe}{self.SIDECAR_SUFFIX}" + + self._write_json_atomic(payload_path, payload) + + sidecar = SidecarMetadata( + payload_file=str(payload_path), + endpoint=endpoint, + idempotency_key=idempotency_key, + payload_kind=payload_kind, + base_url=base_url, + ) + self._write_json_atomic(sidecar_path, sidecar.to_json()) + + return WrittenPayload( + payload_path=payload_path, + sidecar_path=sidecar_path, + sidecar=sidecar, + ) + + def update_sidecar(self, sidecar_path: Path, sidecar: SidecarMetadata) -> None: + """Rewrite the sidecar in place with the latest status.""" + self._write_json_atomic(sidecar_path, sidecar.to_json()) + + @staticmethod + def _write_json_atomic(path: Path, data: Any) -> None: + """Write JSON via tmp+rename so a crash mid-write cannot leave a partial file.""" + tmp = path.with_suffix(path.suffix + ".tmp") + with tmp.open("w", encoding="utf-8") as fh: + json.dump(data, fh, indent=2, sort_keys=True, default=str) + fh.write("\n") + os.replace(tmp, path) + + +__all__ = ["PayloadWriter", "SidecarMetadata", "WrittenPayload"] diff --git a/arc/tckdb/payload_writer_test.py b/arc/tckdb/payload_writer_test.py new file mode 100644 index 0000000000..31997f4073 --- /dev/null +++ b/arc/tckdb/payload_writer_test.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +"""Unit tests for arc.tckdb.payload_writer.""" + +import json +import os +import shutil +import tempfile +import unittest + +from arc.tckdb.payload_writer import PayloadWriter, SidecarMetadata + + +class TestPayloadWriter(unittest.TestCase): + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-writer-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.writer = PayloadWriter(self.tmp) + + def test_write_creates_payload_and_sidecar(self): + result = self.writer.write( + label="ethanol.conf0", + payload={"species_entry": {"smiles": "CCO"}}, + endpoint="/uploads/conformers", + idempotency_key="arc:proj:ethanol:conf0:conformer_calculation:abc1234567890def", + ) + self.assertTrue(result.payload_path.exists()) + self.assertTrue(result.sidecar_path.exists()) + with open(result.payload_path) as fh: + payload = json.load(fh) + self.assertEqual(payload["species_entry"]["smiles"], "CCO") + with open(result.sidecar_path) as fh: + sc = json.load(fh) + self.assertEqual(sc["status"], "pending") + self.assertEqual(sc["endpoint"], "/uploads/conformers") + self.assertIsNone(sc["uploaded_at"]) + self.assertIn("created_at", sc) + + def test_write_sanitizes_label(self): + result = self.writer.write( + label="ethanol/conf 0?!", + payload={"x": 1}, + endpoint="/uploads/conformers", + idempotency_key="arc:proj:ethanol:conf0:conformer_calculation:abc1234567890def", + ) + # Final filename has no slashes/spaces/punctuation + name = result.payload_path.name + self.assertNotIn("/", name) + self.assertNotIn(" ", name) + self.assertNotIn("?", name) + self.assertTrue(name.endswith(".payload.json")) + + def test_update_sidecar_in_place(self): + result = self.writer.write( + label="ethanol", + payload={"x": 1}, + endpoint="/uploads/conformers", + idempotency_key="arc:proj:ethanol:conf0:conformer_calculation:abc1234567890def", + ) + sc = result.sidecar + sc.status = "uploaded" + sc.uploaded_at = "2026-04-26T12:00:00Z" + sc.response_status_code = 200 + sc.response_body = {"id": 7} + 
self.writer.update_sidecar(result.sidecar_path, sc) + with open(result.sidecar_path) as fh: + on_disk = json.load(fh) + self.assertEqual(on_disk["status"], "uploaded") + self.assertEqual(on_disk["response_status_code"], 200) + self.assertEqual(on_disk["response_body"], {"id": 7}) + + def test_payload_unchanged_after_sidecar_update(self): + result = self.writer.write( + label="ethanol", + payload={"x": 42}, + endpoint="/uploads/conformers", + idempotency_key="arc:proj:ethanol:conf0:conformer_calculation:abc1234567890def", + ) + before = result.payload_path.read_bytes() + sc = result.sidecar + sc.status = "failed" + sc.last_error = "boom" + self.writer.update_sidecar(result.sidecar_path, sc) + after = result.payload_path.read_bytes() + self.assertEqual(before, after) + + +if __name__ == "__main__": + unittest.main() diff --git a/docs/tckdb-integration.md b/docs/tckdb-integration.md new file mode 100644 index 0000000000..b539a94472 --- /dev/null +++ b/docs/tckdb-integration.md @@ -0,0 +1,216 @@ +# ARC ↔ TCKDB integration (v0) + +This is the v0 of the ARC-side TCKDB adapter. It builds a +**conformer/calculation upload payload** from ARC objects, writes it to +disk, and (optionally) uploads it via +[`tckdb-client`](https://github.com/tckdb). + +## What v0 does + +1. Builds a JSON payload matching TCKDB's `ConformerUploadRequest`. +2. Writes the payload to disk *before* attempting any network call. +3. Writes a sidecar `*.meta.json` with the endpoint, idempotency key, + timestamps, and upload status. +4. Optionally uploads via `tckdb-client`, sending a stable idempotency + key. +5. Records success/failure in the sidecar. +6. Does **not** fail ARC by default if the upload errors — strict mode + is opt-in. + +## What v0 does not do + +- thermo upload +- kinetics upload +- bundle submission +- hosted contribution flow +- background retry daemon +- direct DB access +- chemistry logic in `tckdb-client` + +These are deferred to later milestones. + +## Configuration + +Add a `tckdb` block to your ARC input. The block is **optional** — if +absent or `enabled: false`, the adapter is a no-op. + +```yaml +tckdb: + enabled: true + base_url: "http://localhost:8000/api/v1" + api_key_env: "TCKDB_API_KEY" + payload_dir: "tckdb_payloads" + upload: true + strict: false + timeout_seconds: 30 + project_label: "my-project" # optional; baked into idempotency key +``` + +| Field | Default | Notes | +| ----------------- | -------------------- | --------------------------------------------------------- | +| `enabled` | `false` | Master switch. | +| `base_url` | _(required)_ | TCKDB API root. | +| `api_key_env` | `TCKDB_API_KEY` | Env var holding the API key. Never store the key in YAML. | +| `payload_dir` | `tckdb_payloads` | Relative paths resolve under the ARC project directory. | +| `upload` | `true` | If `false`, write payload only and mark sidecar `skipped`.| +| `strict` | `false` | If `true`, upload failure raises. | +| `timeout_seconds` | `30` | Per-request timeout. | +| `project_label` | `null` | Optional run/project tag baked into the idempotency key. | + +### API key + +The adapter reads `os.environ[api_key_env]`. If the var is unset and +`upload: true`, the adapter records a failed sidecar (or raises in +strict mode) and never contacts the network. 
```bash
+export TCKDB_API_KEY="tck_replace_me"
+```
+
+## Local TCKDB example
+
+Run TCKDB locally, then point ARC at it:
+
+```bash
+# in TCKDB_v2/
+docker compose -f docker-compose.local.yml up -d
+
+# in your ARC input:
+tckdb:
+  enabled: true
+  base_url: "http://localhost:8000/api/v1"
+  api_key_env: "TCKDB_API_KEY"
+```
+
+## Payload directory layout
+
+```
+<project_directory>/tckdb_payloads/
+  conformer_calculation/
+    <species_label>.<conformer_label>.payload.json
+    <species_label>.<conformer_label>.meta.json
+```
+
+- The `*.payload.json` file is **immutable** after first write — it is
+  the exact JSON that was (or would have been) sent.
+- The `*.meta.json` sidecar is written eagerly as `pending` and updated
+  in place once the upload resolves.
+
+## Sidecar shape
+
+```json
+{
+  "payload_file": ".../ethanol.conf0.payload.json",
+  "endpoint": "/uploads/conformers",
+  "idempotency_key": "arc:my-project:ethanol:conf0:conformer_calculation:abc1234567890def",
+  "payload_kind": "conformer_calculation",
+  "created_at": "2026-04-26T12:00:00Z",
+  "uploaded_at": "2026-04-26T12:00:01Z",
+  "status": "uploaded",
+  "response_status_code": 201,
+  "response_body": {"...": "..."},
+  "idempotency_replayed": false,
+  "last_error": null,
+  "base_url": "http://localhost:8000/api/v1"
+}
+```
+
+`status` ∈ `pending | uploaded | failed | skipped`.
+
+## Idempotency
+
+Keys have the shape
+
+```
+arc:<project-label>:<species-label>:<conformer-label>:<payload-kind>:<payload-hash>
+```
+
+and are generated via `tckdb_client.make_idempotency_key`. The
+`payload-hash` tail is a SHA-256 prefix over the canonical-JSON payload,
+so:
+
+- **same** logical payload → **same** key (server replays the previous
+  response, marking `idempotency_replayed: true`).
+- **changed** geometry/level/results → **different** key (a real new
+  upload).
+
+The key contains no timestamps, no PIDs, no random suffixes.
+
+## Upload failure behavior
+
+| Mode | On upload error |
+| ---------------- | -------------------------- |
+| `strict: false` | Log a warning, set sidecar `status=failed` with `last_error`, return an `UploadOutcome(status="failed", ...)`. ARC continues. |
+| `strict: true` | Same sidecar update, then re-raise the underlying exception. |
+
+## Replay idea
+
+There is no built-in retry daemon yet. A future replay tool only needs:
+
+```text
+sidecar.payload_file +
+sidecar.endpoint +
+sidecar.idempotency_key
+```
+
+plus a configured `base_url` and API key. Because the idempotency key is
+stable and the payload is byte-identical to the first attempt, a replay
+will produce either a fresh write or a server-side replay.
+
+## Programmatic use
+
+```python
+from arc.tckdb import TCKDBAdapter, TCKDBConfig
+
+cfg = TCKDBConfig.from_dict(arc_input.get("tckdb"))
+if cfg is not None:
+    adapter = TCKDBAdapter(cfg, project_directory=run.project_directory)
+    adapter.submit_conformer(
+        species=species,
+        level=level,
+        xyz=species.final_xyz,
+        conformer_index=0,
+        calculation_type="opt",
+        opt_result={"converged": True, "final_energy_hartree": -154.5},
+        arc_version=ARC_VERSION,
+    )
+```
+
+`submit_conformer` returns `None` when the adapter is disabled, or an
+`UploadOutcome` carrying the resulting status, payload path, sidecar
+path, and idempotency key.
+
+## Dependency policy
+
+The adapter is an **optional** integration. The dependency policy is:
+
+- **ARC core does not require `tckdb-client`.** Importing `arc.tckdb` is
+  safe in environments that don't have it installed; both
+  `tckdb_client` references are lazy.
+- **The TCKDB integration requires `tckdb-client`.** This is true even + when `upload: false`, because ARC still generates an idempotency key + via `tckdb_client.make_idempotency_key` and records it in the sidecar + for later replay. In other words: as soon as you set + `tckdb.enabled: true`, the package becomes a hard requirement. +- **Until `tckdb-client` is published**, install it from the local + TCKDB checkout: + + ```bash + conda run -n arc_env pip install -e /path/to/TCKDB_v2/clients/python/tckdb-client + ``` + +- **Once published**, ARC's `environment.yml` should declare it as an + optional integration dependency: + + ```yaml + - pip: + - "tckdb-client>=0.1,<0.2" + ``` + + The `>=0.1,<0.2` range is the v0 client API; bump deliberately on a + major-version client release. + +ARC's unit tests for the adapter inject a stub client via +`TCKDBAdapter(..., client_factory=...)`. The idempotency-key tests still +need `tckdb-client` because key construction is part of the +adapter contract, but no test contacts the network. From 106ef2d621bfda05fd98f8106f2152d189ff29c7 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 27 Apr 2026 12:15:46 +0300 Subject: [PATCH 03/12] ssh: harden submit/qstat parsing for PBS Three defensive parsing fixes from origin/AzureServer: 1. parse_running_jobs_ids: lstrip() PBS qstat lines before splitting. PBS output is column-aligned with leading spaces, so the existing `status_line.split('.')[0]` returned '' on every row, making ARC silently lose track of running jobs. 2. submit_job: guard each cluster_soft branch with `stdout and ...` before indexing stdout[0]. When qsub writes to stderr and stdout is empty, the elif cascade IndexError'd and masked the real submission error. PBS branch was the most exposed (no string guard at all). 3. submit_job: add a "Memory specification can not be satisfied" stderr handler alongside the existing "Requested node configuration is not available" hint. --- arc/job/ssh.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arc/job/ssh.py b/arc/job/ssh.py index 9ea31da745..b659d53239 100644 --- a/arc/job/ssh.py +++ b/arc/job/ssh.py @@ -279,7 +279,7 @@ def check_running_jobs_ids(self) -> list: cluster_soft = servers[self.server]['cluster_soft'].lower() for i, status_line in enumerate(stdout): if i > i_dict[cluster_soft]: - job_id = status_line.split(split_by_dict[cluster_soft])[0] + job_id = status_line.lstrip().split(split_by_dict[cluster_soft])[0] job_id = job_id.split('.')[0] if '.' in job_id else job_id running_job_ids.append(job_id) return running_job_ids @@ -311,19 +311,22 @@ def submit_job(self, remote_path: str, if 'Requested node configuration is not available' in line: logger.warning('User may be requesting more resources than are available. Please check server ' 'settings, such as cpus and memory, in ARC/arc/settings/settings.py') + if 'Memory specification can not be satisfied' in line: + logger.warning('User may be requesting more memory than is available. 
Please check server ' + 'settings, such as cpus and memory, in ARC/arc/settings/settings.py.') if cluster_soft.lower() == 'slurm' and 'AssocMaxSubmitJobLimit' in line: logger.warning(f'Max number of submitted jobs was reached, sleeping...') time.sleep(5 * 60) self.submit_job(remote_path=remote_path, recursion=True) if recursion: return None, None - elif cluster_soft.lower() in ['oge', 'sge'] and 'submitted' in stdout[0].lower(): + elif cluster_soft.lower() in ['oge', 'sge'] and stdout and 'submitted' in stdout[0].lower(): job_id = stdout[0].split()[2] - elif cluster_soft.lower() == 'slurm' and 'submitted' in stdout[0].lower(): + elif cluster_soft.lower() == 'slurm' and stdout and 'submitted' in stdout[0].lower(): job_id = stdout[0].split()[3] - elif cluster_soft.lower() == 'pbs': + elif cluster_soft.lower() == 'pbs' and stdout: job_id = stdout[0].split('.')[0] - elif cluster_soft.lower() == 'htcondor' and 'submitting' in stdout[0].lower(): + elif cluster_soft.lower() == 'htcondor' and stdout and 'submitting' in stdout[0].lower(): # Submitting job(s). # 1 job(s) submitted to cluster 443069. if len(stdout) and len(stdout[1].split()) and len(stdout[1].split()[-1].split('.')): From 40cfbf03c149f3cc4ccb160b1fde53c77397e246 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 27 Apr 2026 12:17:01 +0300 Subject: [PATCH 04/12] adapter+ssh+scheduler: clean up remote work dir after successful job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PBS jobs left their full remote work directory on the cluster after every run, eventually filling user quota and causing future qsub submissions to fail with cryptic errors. Three coordinated changes: - ssh.py: new SSHClient.remove_dir(remote_path) — wraps `rm -r` and raises ServerError on stderr. - adapter.py: new JobAdapter.remove_remote_files() — calls remove_dir on the job's remote_path, no-op for local servers or unset paths. - scheduler.py: invoke job.remove_remote_files() in end_job's success path, after rotors_dict update and before save_restart_dict. Wrapped in try/except so cleanup failure logs a warning rather than aborting the run — losing the cluster cleanup is preferable to losing the result. Same shape as origin/AzureServer. --- arc/job/adapter.py | 10 ++++++++++ arc/job/ssh.py | 13 +++++++++++++ arc/scheduler.py | 4 ++++ 3 files changed, 27 insertions(+) diff --git a/arc/job/adapter.py b/arc/job/adapter.py index 040c9920fe..ea0deb3704 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -409,6 +409,16 @@ def download_files(self): self.set_initial_and_final_times() self.final_time = self.final_time or datetime.datetime.now() + def remove_remote_files(self): + """ + Remove the job's remote work directory after a successful run, to keep cluster quota in check. + No-op for local servers or when no remote_path is set. + """ + if self.server is None or self.server == 'local' or not self.remote_path: + return + with SSHClient(self.server) as ssh: + ssh.remove_dir(remote_path=self.remote_path) + def set_initial_and_final_times(self, ssh: SSHClient | None = None): """ Set the end time of the job. diff --git a/arc/job/ssh.py b/arc/job/ssh.py index b659d53239..b548b5eed2 100644 --- a/arc/job/ssh.py +++ b/arc/job/ssh.py @@ -492,6 +492,19 @@ def change_mode(self, command = f'chmod{recursive} {mode} {file_name}' self._send_command_to_server(command, remote_path) + def remove_dir(self, remote_path: str) -> None: + """ + Remove a directory on the server. 
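+
+ Note: this runs a plain ``rm -r`` (not ``rm -rf``), so a missing or
+ protected path writes to stderr and the call raises ``ServerError``
+ instead of failing silently.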
+ + Args: + remote_path (str): The path to the directory to remove on the remote server. + """ + command = f'rm -r "{remote_path}"' + _, stderr = self._send_command_to_server(command) + if stderr: + raise ServerError( + f'Cannot remove dir for the given path ({remote_path}).\nGot: {stderr}') + def _check_file_exists(self, remote_file_path: str, ) -> bool: diff --git a/arc/scheduler.py b/arc/scheduler.py index b1b4ceae7a..3d2d490001 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -1143,6 +1143,10 @@ def end_job(self, job: JobAdapter, for rotors_dict in self.species_dict[label].rotors_dict.values(): if rotors_dict['pivots'] in [job.pivots, job.pivots[0]]: rotors_dict['scan_path'] = job.local_path_to_output_file + try: + job.remove_remote_files() + except Exception as e: + logger.warning(f'Could not remove remote files for job {job.job_name}: {e}') self.save_restart_dict() return True From e1a2141db7e8aa6506e929f9c3d66b69b34b7470 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 27 Apr 2026 12:17:21 +0300 Subject: [PATCH 05/12] scheduler: catch JobError alongside IOError on status check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit determine_job_status() can raise JobError when the remote output file is missing or unreadable over SFTP — previously this propagated and aborted the entire scheduler run instead of triggering the existing re-run logic. Treat it the same as IOError so transient remote-side failures lead to a job re-submit, not a crash. --- arc/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arc/scheduler.py b/arc/scheduler.py index 3d2d490001..84b660d7f3 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -29,6 +29,7 @@ torsions_to_scans, ) from arc.exceptions import (InputError, + JobError, SchedulerError, SpeciesError, TrshError, @@ -1061,7 +1062,7 @@ def end_job(self, job: JobAdapter, if job.job_status[0] != 'done' or job.job_status[1]['status'] != 'done': try: job.determine_job_status() # Also downloads the output file. - except IOError: + except (IOError, JobError): if job.job_type not in ['orbitals']: logger.warning(f'Tried to determine status of job {job.job_name}, ' f'but it seems like the job never ran. Re-running job.') From 2dea85ad8fbda3be1e06504836f829f674bab054 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 27 Apr 2026 12:25:06 +0300 Subject: [PATCH 06/12] adapter: only fetch job.log on HTCondor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _get_additional_job_info unconditionally tried to download remote_path/job.log for every cluster_soft. job.log is HTCondor's native event log; PBS, Slurm, OGE/SGE inline the ESS command directly in submit.sh and never produce a job.log. Result on PBS: every job logged a misleading "check that submit script has -o/-e flags" warning even when the run succeeded. Gate the third path directly on cluster_soft == 'htcondor' so the intent is explicit. Inspired by origin/AzureServer, which used a structural proxy (`'job.sh' in files_to_upload`) — same behavior, but the cluster_soft check reads as the actual condition rather than a filename coincidence. Skipping the null-byte read handler from AzureServer; trigger is unknown and the change adds two-mode read complexity for a problem that hasn't reproduced. 
--- arc/job/adapter.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arc/job/adapter.py b/arc/job/adapter.py index ea0deb3704..e8207394b2 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -774,20 +774,20 @@ def _get_additional_job_info(self): content = '' cluster_soft = servers[self.server]['cluster_soft'].lower() if cluster_soft in ['oge', 'sge', 'slurm', 'pbs', 'htcondor']: + # job.log is HTCondor's native event log; other clusters don't produce one. + include_job_log = cluster_soft == 'htcondor' local_file_path_1 = os.path.join(self.local_path, 'out.txt') local_file_path_2 = os.path.join(self.local_path, 'err.txt') - local_file_path_3 = os.path.join(self.local_path, 'job.log') + local_file_path_3 = os.path.join(self.local_path, 'job.log') if include_job_log else None if self.server != 'local' and self.remote_path is not None and not self.testing: - remote_file_path_1 = os.path.join(self.remote_path, 'out.txt') - remote_file_path_2 = os.path.join(self.remote_path, 'err.txt') - remote_file_path_3 = os.path.join(self.remote_path, 'job.log') + remote_paths = [os.path.join(self.remote_path, 'out.txt'), + os.path.join(self.remote_path, 'err.txt')] + local_paths = [local_file_path_1, local_file_path_2] + if include_job_log: + remote_paths.append(os.path.join(self.remote_path, 'job.log')) + local_paths.append(local_file_path_3) with SSHClient(self.server) as ssh: - for local_file_path, remote_file_path in zip([local_file_path_1, - local_file_path_2, - local_file_path_3], - [remote_file_path_1, - remote_file_path_2, - remote_file_path_3]): + for local_file_path, remote_file_path in zip(local_paths, remote_paths): try: ssh.download_file(remote_file_path=remote_file_path, local_file_path=local_file_path) @@ -797,7 +797,7 @@ def _get_additional_job_info(self): f'flags with stdout and stderr of out.txt and err.txt, respectively ' f'(e.g., "#SBATCH -o out.txt"). Error message:') logger.warning(e) - for local_file_path in [local_file_path_1, local_file_path_2, local_file_path_3]: + for local_file_path in filter(None, [local_file_path_1, local_file_path_2, local_file_path_3]): if os.path.isfile(local_file_path): with open(local_file_path, 'r') as f: lines = f.readlines() From 8f1911df4c2b59e68b839916fb03a3918b1868e7 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 27 Apr 2026 12:46:40 +0300 Subject: [PATCH 07/12] pipe: refuse pipe mode when engine resolves to a remote server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PipeRun.submit_to_scheduler invokes qsub/sbatch on the orchestrator machine via local_submit_job, and the worker (`python -m arc.scripts.pipe_worker`) reads pipe_root from the local filesystem. When the resolved server for the task's engine is remote (e.g., a PBS cluster reached over SSH), submission fails its `shutil.which` check at pipe_run.py:270, returns 'errored' silently, and the pipe run stays registered in active_pipes — the scheduler then loops indefinitely because the planner already removed those tasks from _pending_pipe_sp and reported them as piped. Add an upfront check in should_use_pipe: if the engine's first server in ess_settings is anything other than 'local', refuse pipe. The existing fallback at scheduler.py:546-554 then routes those tasks through run_sp_job, which uses SSHClient correctly. This is a band-aid until full remote-pipe support lands (tracked on the pipe-ssh-support branch). 
It preserves pipe's efficiency for local/HTCondor users while preventing the silent deadlock for everyone else. --- arc/job/pipe/pipe_coordinator.py | 15 +++++++++++++++ arc/job/pipe/pipe_coordinator_test.py | 10 +++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index 0e7351fde5..a5870df1ab 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -67,6 +67,21 @@ def should_use_pipe(self, tasks: list[TaskSpec]) -> bool: min_tasks = pipe_settings.get('min_tasks', 10) if len(tasks) < min_tasks: return False + # PipeRun.submit_to_scheduler invokes qsub/sbatch on the orchestrator + # machine and the worker (`python -m arc.scripts.pipe_worker`) reads + # pipe_root from the local filesystem. If this engine's resolved + # server is remote, that submission silently errors and the run + # deadlocks. Refuse pipe so the planner falls back to per-job queue + # submissions over SSH (scheduler.py:546-554). Remote pipe support + # tracked separately on the pipe-ssh-support branch. + ess_settings = getattr(self.sched, 'ess_settings', None) or {} + servers_dict = settings['servers'] + server_list = ess_settings.get(tasks[0].engine, []) + if isinstance(server_list, str): + server_list = [server_list] + first_server = next((s for s in server_list if s in servers_dict), None) + if first_server is not None and first_server != 'local': + return False ref = tasks[0] return all(t.engine == ref.engine and t.task_family == ref.task_family diff --git a/arc/job/pipe/pipe_coordinator_test.py b/arc/job/pipe/pipe_coordinator_test.py index 087878a5a8..a5d2602b3a 100644 --- a/arc/job/pipe/pipe_coordinator_test.py +++ b/arc/job/pipe/pipe_coordinator_test.py @@ -63,11 +63,12 @@ def _make_spec(task_id, task_family='conf_opt', engine='mockter', level=None, ) -def _make_mock_sched(project_directory): +def _make_mock_sched(project_directory, ess_settings=None): """Create a mock Scheduler with the attributes PipeCoordinator needs.""" sched = MagicMock() sched.project_directory = project_directory sched.server_job_ids = list() + sched.ess_settings = ess_settings if ess_settings is not None else {'mockter': ['local']} spc = ARCSpecies(label='H2O', smiles='O') spc.conformers = [None] * 5 spc.conformer_energies = [None] * 5 @@ -127,6 +128,13 @@ def test_false_when_disabled(self): tasks = [_make_spec(f't_{i}') for i in range(15)] self.assertFalse(self.coord.should_use_pipe(tasks)) + @patch('arc.job.pipe.pipe_coordinator.settings', + {'servers': {'zeus': {'cluster_soft': 'PBS', 'address': 'z.example.edu', 'un': 'u'}}}) + def test_false_when_engine_resolves_to_remote_server(self): + coord = PipeCoordinator(_make_mock_sched(self.tmpdir, ess_settings={'mockter': ['zeus']})) + tasks = [_make_spec(f't_{i}') for i in range(15)] + self.assertFalse(coord.should_use_pipe(tasks)) + class TestSubmitPipeRun(unittest.TestCase): """Tests for PipeCoordinator.submit_pipe_run().""" From 61e71e4a2a04c40e4102825c09635841f2bcb8a3 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Wed, 29 Apr 2026 12:15:30 +0300 Subject: [PATCH 08/12] mixture of updates still in progress --- ARC.py | 259 ++- ARC_test.py | 361 +++++ arc/job/adapter.py | 142 +- arc/job/adapter_test.py | 369 +++++ arc/job/ssh.py | 2 +- arc/job/ssh_pool.py | 156 ++ arc/output.py | 79 +- arc/output_test.py | 129 ++ arc/settings/settings.py | 2 + arc/statmech/arkane.py | 92 +- arc/statmech/arkane_test.py | 126 ++ arc/tckdb/adapter.py | 1436 +++++++++++++++-- 
arc/tckdb/adapter_test.py | 1179 +++++++++++++- arc/tckdb/config.py | 100 +- arc/tckdb/config_test.py | 133 ++ arc/tckdb/idempotency.py | 63 +- arc/tckdb/idempotency_test.py | 69 +- arc/tckdb/payload_writer.py | 123 +- arc/tckdb/payload_writer_test.py | 86 +- .../calcs/Species/spc1/opt_a472/input.gjf | 12 + .../calcs/Species/spc1/opt_a472/submit.sh | 56 + .../spc1_and_2_others/conf_opt_a472/input.gjf | 12 + .../spc1_and_2_others/conf_opt_a472/submit.sh | 56 + .../calcs/Species/spc1/opt_101/err.txt | 17 + .../methanol_and_5_others/scan_a472/input.gjf | 20 + docs/tckdb-integration.md | 73 + 26 files changed, 4866 insertions(+), 286 deletions(-) create mode 100644 ARC_test.py create mode 100644 arc/job/ssh_pool.py create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/input.gjf create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/submit.sh create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/input.gjf create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/submit.sh create mode 100644 arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/opt_101/err.txt create mode 100644 arc/testing/test_JobAdapter_scan/calcs/Species/methanol_and_5_others/scan_a472/input.gjf diff --git a/ARC.py b/ARC.py index c1455e312b..5b4b21901b 100644 --- a/ARC.py +++ b/ARC.py @@ -11,7 +11,7 @@ from arc.common import read_yaml_file from arc.main import ARC -from arc.tckdb.config import TCKDBConfig +from arc.tckdb.config import TCKDBConfig, UPLOAD_MODE_COMPUTED_SPECIES def parse_command_line_arguments(command_line_args=None): @@ -66,8 +66,261 @@ def main(): arc_object = ARC(**input_dict) arc_object.tckdb_config = tckdb_config if tckdb_config is not None: - logging.info('TCKDB integration enabled: %s', tckdb_config.base_url) - arc_object.execute() + print(f'TCKDB integration enabled: {tckdb_config.base_url}') + + # Persistent SSH pool lives for the duration of the run; close it + # explicitly on every exit path (success, error, ctrl-C) so we don't + # leave paramiko Transports orphaned. Lazily instantiated on first + # remote-queue job, so this is a no-op for fully-local runs. + try: + arc_object.execute() + + if tckdb_config is not None: + from arc.tckdb.adapter import TCKDBAdapter + adapter = TCKDBAdapter(tckdb_config, project_directory=arc_object.project_directory) + _run_tckdb_upload_sweep(arc_object, adapter, tckdb_config) + finally: + from arc.job.ssh_pool import reset_default_pool + reset_default_pool() + + +def _run_tckdb_upload_sweep(arc_object, adapter, tckdb_config): + """End-of-run sweep: build/write/upload one TCKDB payload per converged species. + + Reads ``/output/output.yml`` (the consolidated run summary + from ``arc/output.py``) and dispatches per ``tckdb_config.upload_mode``: + + - ``"conformer"`` (default): one ``/uploads/conformers`` POST per + species, followed by per-artifact POSTs to + ``/calculations/{id}/artifacts`` for each configured kind. + - ``"computed_species"``: one ``/uploads/computed-species`` bundle + POST per species, with artifacts inlined under each calc; no + separate artifact sweep. + + Both paths share the same per-species iteration, error handling, + and summary print shape. TS records are deferred regardless of mode. + """ + output_path = os.path.join(arc_object.project_directory, 'output', 'output.yml') + if not os.path.exists(output_path): + # Most common cause: the run was interrupted before + # write_output_yml ran. 
Skip cleanly rather than scrape live + # objects — the replay path expects output.yml as the contract. + print(f'TCKDB upload skipped: {output_path} not found (run did not complete?)') + return + + output_doc = read_yaml_file(path=output_path) + species_records = list(output_doc.get('species') or []) + ts_records = list(output_doc.get('transition_states') or []) + # Both modes cover minima only; TS records are deferred to a future + # TS-specific adapter method targeting /uploads/transition-states + # (different schema, no SMILES requirement). + n_ts_deferred = sum(1 for r in ts_records if r.get('converged')) + + is_bundle_mode = tckdb_config.upload_mode == UPLOAD_MODE_COMPUTED_SPECIES + + counts = {'uploaded': 0, 'skipped': 0, 'failed': 0} + artifact_counts = {'uploaded': 0, 'skipped': 0, 'failed': 0} + failures = [] + artifact_failures = [] + n_attempted = 0 + for record in species_records: + label = record.get('label') or '' + if not record.get('converged'): + continue + n_attempted += 1 + try: + if is_bundle_mode: + # Single bundle carries species_entry + conformer + + # opt/freq/sp + (optional) thermo + inlined artifacts. + outcome = adapter.submit_computed_species_from_output( + output_doc=output_doc, species_record=record, + ) + else: + outcome = adapter.submit_from_output( + output_doc=output_doc, species_record=record, + ) + except Exception as exc: + counts['failed'] += 1 + failures.append((label, f'{type(exc).__name__}: {exc}')) + continue + if outcome is None: + continue + counts[outcome.status] = counts.get(outcome.status, 0) + 1 + if outcome.status == 'failed': + failures.append((label, outcome.error or 'unknown error')) + elif ( + outcome.status == 'uploaded' + and not is_bundle_mode + and tckdb_config.artifacts.upload + ): + # Artifact sweep is conformer-mode only — the bundle path + # carries artifacts inline under each calc. + _sweep_artifacts_for_species( + adapter=adapter, + arc_object=arc_object, + output_doc=output_doc, + species_record=record, + outcome=outcome, + counts=artifact_counts, + failures=artifact_failures, + kinds=_implementable_kinds_from_config(tckdb_config), + ) + + mode_label = 'computed-species bundle' if is_bundle_mode else 'conformer/calculation' + print(f'TCKDB v0 ({mode_label}, {n_attempted} converged species):') + print(f' uploaded: {counts["uploaded"]} skipped: {counts["skipped"]} failed: {counts["failed"]}') + if not is_bundle_mode and tckdb_config.artifacts.upload: + # Bundle mode rolls artifacts into the same upload, so a + # standalone artifact summary line would be misleading. + print( + f' artifacts: uploaded {artifact_counts["uploaded"]} ' + f'skipped {artifact_counts["skipped"]} failed {artifact_counts["failed"]}' + ) + if n_ts_deferred: + print(f' ({n_ts_deferred} converged TS deferred — TS-specific adapter not yet implemented)') + for label, err in failures: + print(f' failed: {label} — {err}') + for label, kind, err in artifact_failures: + print(f' failed artifact: {label} ({kind}) — {err}') + + +_CALC_TYPE_TO_LOG_KEY = { + 'opt': 'opt_log', + 'freq': 'freq_log', + 'sp': 'sp_log', +} + +# Companion mapping for input-deck paths, emitted by ``arc/output.py`` +# alongside the log paths. Per-job, with per-job software → per-job +# filename, and only set when the deck file is on disk. +_CALC_TYPE_TO_INPUT_KEY = { + 'opt': 'opt_input', + 'freq': 'freq_input', + 'sp': 'sp_input', +} + + +def _implementable_kinds_from_config(tckdb_config): + """Intersect user-configured kinds with ARC's IMPLEMENTED_ARTIFACT_KINDS. 
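+
+ For example, assuming ``IMPLEMENTED_ARTIFACT_KINDS == ('input',
+ 'output_log')``, a config of ``('output_log', 'trajectory')`` (where
+ ``trajectory`` is a hypothetical, not-yet-implemented kind) yields
+ ``('output_log',)``.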
+ + The config-parse step warns about valid-but-not-implemented kinds; + this filter is the runtime side of the same gate, so the sweep + silently skips them rather than calling the adapter (which would + skip with a defensive log message anyway). + """ + from arc.tckdb.config import IMPLEMENTED_ARTIFACT_KINDS + return tuple(k for k in tckdb_config.artifacts.kinds if k in IMPLEMENTED_ARTIFACT_KINDS) + + +def _resolve_artifact_path(*, kind, calc_type, species_record, output_doc): + """Resolve the local file path to upload for a (kind, calc_type) pair. + + Returns ``None`` if there's nothing to upload for this combination + (e.g. unsupported calc type, file not on disk, engine unknown). + + For ``output_log``, the path is keyed off the species_record's + log fields (``opt_log`` / ``freq_log`` / ``sp_log``). + + For ``input``, the input deck (``input.gjf``, ``ZMAT``, ``input.in``, + etc.) is always written as a sibling of the output log, so we + derive its name from ``arc.imports.settings['input_filenames']`` + keyed on the engine in ``output_doc['opt_level']['software']``. + """ + log_key = _CALC_TYPE_TO_LOG_KEY.get(str(calc_type).lower()) + if log_key is None: + return None + log_path = species_record.get(log_key) + if not log_path: + return None + if kind == 'output_log': + return log_path + if kind == 'input': + # Prefer the path emitted directly by ``arc/output.py``: it's + # per-job (so a Gaussian opt + Molpro sp run picks the right + # deck per calc), and existence on disk has already been + # verified at output-write time. + input_field = _CALC_TYPE_TO_INPUT_KEY.get(str(calc_type).lower()) + if input_field: + recorded = species_record.get(input_field) + if recorded: + return recorded + # Back-compat: older output.yml files predating the + # ``_input`` schema extension. Derive from the opt-level + # software via settings['input_filenames']. Same logic as before + # — kept so old runs can still upload input decks via the + # primitive endpoint. + from arc.imports import settings as _arc_settings + opt_level = output_doc.get('opt_level') or {} + engine = (opt_level.get('software') or '').lower() if isinstance(opt_level, dict) else '' + input_filenames = _arc_settings.get('input_filenames', {}) + input_name = input_filenames.get(engine) + if not input_name: + return None + return os.path.join(os.path.dirname(log_path), input_name) + return None + + +def _sweep_artifacts_for_species( + *, + adapter, + arc_object, + output_doc, + species_record, + outcome, + counts, + failures, + kinds, +): + """For one converged species' conformer upload, push artifacts of each kind to each calc. + + Iterates the calc refs returned by the conformer upload (primary + + additional) and, for each, iterates the configured kinds. Resolves + the right local file path per (kind, calc_type) and dispatches to + ``adapter.submit_artifacts_for_calculation``. Updates ``counts`` and + ``failures`` in place. + """ + label = species_record.get('label') or '' + refs = [] + if outcome.primary_calculation: + refs.append(outcome.primary_calculation) + refs.extend(outcome.additional_calculations or []) + if not refs: + # Older server response without calc refs — skip artifact upload + # for this species rather than guess at IDs. 
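+ # (A usable ref carries 'calculation_id' and 'type', the two fields
+ # the loop below reads; refs missing either are skipped, not guessed.)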
+ return + for ref in refs: + calc_id = ref.get('calculation_id') + calc_type = ref.get('type') + if calc_id is None or calc_type is None: + continue + for kind in kinds: + file_path = _resolve_artifact_path( + kind=kind, + calc_type=calc_type, + species_record=species_record, + output_doc=output_doc, + ) + if file_path is None: + counts['skipped'] = counts.get('skipped', 0) + 1 + continue + try: + art_outcome = adapter.submit_artifacts_for_calculation( + output_doc=output_doc, + species_record=species_record, + calculation_id=int(calc_id), + calculation_type=str(calc_type), + file_path=file_path, + kind=kind, + ) + except Exception as exc: + counts['failed'] = counts.get('failed', 0) + 1 + failures.append((label, kind, f'{type(exc).__name__}: {exc}')) + continue + if art_outcome is None: + continue + counts[art_outcome.status] = counts.get(art_outcome.status, 0) + 1 + if art_outcome.status == 'failed': + failures.append((label, art_outcome.kind, art_outcome.error or 'unknown error')) if __name__ == '__main__': diff --git a/ARC_test.py b/ARC_test.py new file mode 100644 index 0000000000..deee98070b --- /dev/null +++ b/ARC_test.py @@ -0,0 +1,361 @@ +"""Tests for the ARC.py end-of-run TCKDB upload sweep dispatcher. + +These tests focus on the wiring between ``tckdb.upload_mode`` and the +adapter method that gets called per species. They use a stub adapter so +no network or live ARC objects are required. +""" + +import os +import shutil +import tempfile +import unittest +from pathlib import Path +from types import SimpleNamespace +from unittest import mock + +import yaml + +# ARC.py is the top-level entry script; import its sweep helper directly. +import importlib.util +_ARC_PY = Path(__file__).parent / "ARC.py" +_spec = importlib.util.spec_from_file_location("arc_entry", _ARC_PY) +_arc_entry = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_arc_entry) + +from arc.tckdb.adapter import UploadOutcome +from arc.tckdb.config import TCKDBConfig + + +# -------------------------------------------------------------------------- +# Test doubles +# -------------------------------------------------------------------------- + + +class _StubAdapter: + """Records which adapter method was called per species, no network.""" + + def __init__(self, *, conformer_outcome=None, bundle_outcome=None, + conformer_raises=None, bundle_raises=None): + self.conformer_calls = [] + self.bundle_calls = [] + self.artifact_calls = [] + self._conformer_outcome = conformer_outcome + self._bundle_outcome = bundle_outcome + self._conformer_raises = conformer_raises + self._bundle_raises = bundle_raises + + def submit_from_output(self, *, output_doc, species_record): + self.conformer_calls.append(species_record.get("label")) + if self._conformer_raises is not None: + raise self._conformer_raises + return self._conformer_outcome + + def submit_computed_species_from_output(self, *, output_doc, species_record): + self.bundle_calls.append(species_record.get("label")) + if self._bundle_raises is not None: + raise self._bundle_raises + return self._bundle_outcome + + def submit_artifacts_for_calculation(self, **kwargs): + self.artifact_calls.append(kwargs) + return None + + +def _outcome(status, *, label="ethanol", error=None, + primary=None, additional=None): + """Build a stand-in UploadOutcome with the fields the sweep reads.""" + return UploadOutcome( + status=status, + payload_path=Path(f"/tmp/{label}.payload.json"), + sidecar_path=Path(f"/tmp/{label}.meta.json"), + 
idempotency_key=f"arc:test:{label}:k:abc1234567890def", + error=error, + primary_calculation=primary, + additional_calculations=additional or [], + ) + + +# -------------------------------------------------------------------------- +# Fixtures +# -------------------------------------------------------------------------- + + +def _write_output_yml(project_dir: str, *, species_labels=("CCO",), with_ts=False): + """Write a minimal ``output.yml`` matching what the sweep reads.""" + out_dir = os.path.join(project_dir, "output") + os.makedirs(out_dir, exist_ok=True) + doc = { + "schema_version": "1.0", + "project": "test_project", + "arc_version": "0.0.0", + "opt_level": {"method": "wb97xd", "basis": "def2-tzvp", "software": "gaussian"}, + "species": [ + { + "label": label, + "smiles": "CCO", + "charge": 0, + "multiplicity": 1, + "is_ts": False, + "converged": True, + "xyz": "C 0.0 0.0 0.0\nH 1.0 0.0 0.0", + "opt_n_steps": 12, + "opt_final_energy_hartree": -154.0, + "ess_versions": {"opt": "Gaussian 16, Revision A.03"}, + } + for label in species_labels + ], + "transition_states": [ + {"label": "TS0", "is_ts": True, "converged": True} + ] if with_ts else [], + } + with open(os.path.join(out_dir, "output.yml"), "w") as f: + yaml.safe_dump(doc, f) + return doc + + +# -------------------------------------------------------------------------- +# Dispatch behavior +# -------------------------------------------------------------------------- + + +class TestRunTckdbUploadSweepDispatch(unittest.TestCase): + """Wiring tests: which adapter method gets called per upload_mode.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-sweep-test-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + _write_output_yml(self.tmp) + self.arc_object = SimpleNamespace(project_directory=self.tmp) + + def _cfg(self, **overrides): + defaults = dict( + enabled=True, + base_url="http://localhost:8000/api/v1", + api_key_env="X_TCKDB_API_KEY", + ) + defaults.update(overrides) + return TCKDBConfig(**defaults) + + # ---------------- 1: missing upload_mode → conformer (default) + def test_default_mode_uses_legacy_conformer_path(self): + cfg = self._cfg() # upload_mode defaults to "conformer" + adapter = _StubAdapter(conformer_outcome=_outcome("uploaded")) + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + self.assertEqual(adapter.conformer_calls, ["CCO"]) + self.assertEqual(adapter.bundle_calls, []) + + # ---------------- 2: explicit conformer + def test_explicit_conformer_mode_uses_legacy_path(self): + cfg = self._cfg(upload_mode="conformer") + adapter = _StubAdapter(conformer_outcome=_outcome("uploaded")) + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + self.assertEqual(adapter.conformer_calls, ["CCO"]) + self.assertEqual(adapter.bundle_calls, []) + + # ---------------- 3: computed_species → bundle path + def test_computed_species_mode_dispatches_bundle(self): + cfg = self._cfg(upload_mode="computed_species") + adapter = _StubAdapter(bundle_outcome=_outcome("uploaded")) + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + self.assertEqual(adapter.bundle_calls, ["CCO"]) + self.assertEqual(adapter.conformer_calls, []) + + # ---------------- 4: bundle path never calls legacy + def test_computed_species_does_not_call_legacy_submit(self): + # Multiple species so we'd notice any leak across iterations. 
+ _write_output_yml(self.tmp, species_labels=("CCO", "CO", "CC")) + cfg = self._cfg(upload_mode="computed_species") + adapter = _StubAdapter(bundle_outcome=_outcome("uploaded")) + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + self.assertEqual(adapter.bundle_calls, ["CCO", "CO", "CC"]) + self.assertEqual(adapter.conformer_calls, []) + # And no per-artifact sweep call: bundles inline artifacts. + self.assertEqual(adapter.artifact_calls, []) + + # ---------------- 5: failure in bundle mode is recorded; sweep continues + def test_computed_species_failure_continues_to_next_species(self): + _write_output_yml(self.tmp, species_labels=("CCO", "CO")) + cfg = self._cfg(upload_mode="computed_species") + # First species: outcome with status=failed (non-strict path). + # Second species: outcome with status=uploaded. + # We achieve "different per call" by mutating the stub's outcome + # mid-sweep, since the stub returns the same outcome each call by + # default. Use a side-effect via a wrapper instead. + outcomes = iter([ + _outcome("failed", label="CCO", error="HTTP 503"), + _outcome("uploaded", label="CO"), + ]) + adapter = _StubAdapter() + adapter.submit_computed_species_from_output = ( + lambda *, output_doc, species_record: ( + adapter.bundle_calls.append(species_record.get("label")) + or next(outcomes) + ) + ) + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + # Both species processed; first failed, second uploaded. + self.assertEqual(adapter.bundle_calls, ["CCO", "CO"]) + + # ---------------- 5b: an unhandled exception in bundle mode is caught + def test_computed_species_exception_is_caught_and_logged(self): + _write_output_yml(self.tmp, species_labels=("CCO", "CO")) + cfg = self._cfg(upload_mode="computed_species") + # Simulate an unhandled exception on the FIRST species; second + # should still be attempted (matches conformer-mode behavior). + call_log = [] + def fake_submit(*, output_doc, species_record): + label = species_record.get("label") + call_log.append(label) + if label == "CCO": + raise RuntimeError("boom") + return _outcome("uploaded", label=label) + adapter = _StubAdapter() + adapter.submit_computed_species_from_output = fake_submit + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + self.assertEqual(call_log, ["CCO", "CO"]) + + # ---------------- 6: sidecar written before live upload failure (bundle) + def test_bundle_mode_sidecar_written_before_upload_failure(self): + # This is fundamentally an adapter-level guarantee, but we verify + # the wiring preserves it: a "failed" outcome carrying real + # payload_path and sidecar_path values means the sweep still + # passes those upward to the user. + cfg = self._cfg(upload_mode="computed_species") + sentinel_payload = Path("/tmp/sentinel.payload.json") + sentinel_sidecar = Path("/tmp/sentinel.meta.json") + outcome = UploadOutcome( + status="failed", + payload_path=sentinel_payload, + sidecar_path=sentinel_sidecar, + idempotency_key="arc:t:CCO:c:abc1234567890def", + error="HTTP 503", + ) + adapter = _StubAdapter(bundle_outcome=outcome) + # Capture stdout to confirm the failure summary is printed + # (don't assert on exact text — assert on key tokens). 
+ with mock.patch("builtins.print") as mock_print: + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + printed = "\n".join(str(c.args[0]) for c in mock_print.call_args_list) + self.assertIn("computed-species bundle", printed) + self.assertIn("failed: 1", printed) + self.assertIn("HTTP 503", printed) + + +# -------------------------------------------------------------------------- +# Summary-print mode awareness +# -------------------------------------------------------------------------- + + +class TestSweepSummaryByMode(unittest.TestCase): + """The summary line names the mode; bundle mode omits the artifact line.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-sweep-summary-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + _write_output_yml(self.tmp) + self.arc_object = SimpleNamespace(project_directory=self.tmp) + + def _run_with_mode(self, *, upload_mode, artifacts_upload=False): + from arc.tckdb.config import TCKDBArtifactConfig + cfg = TCKDBConfig( + enabled=True, base_url="http://x", api_key_env="X", + upload_mode=upload_mode, + artifacts=TCKDBArtifactConfig(upload=artifacts_upload), + ) + adapter = _StubAdapter( + conformer_outcome=_outcome("uploaded"), + bundle_outcome=_outcome("uploaded"), + ) + with mock.patch("builtins.print") as mock_print: + _arc_entry._run_tckdb_upload_sweep(self.arc_object, adapter, cfg) + return "\n".join(str(c.args[0]) for c in mock_print.call_args_list) + + def test_conformer_mode_summary_says_conformer(self): + out = self._run_with_mode(upload_mode="conformer") + self.assertIn("conformer/calculation", out) + self.assertNotIn("computed-species bundle", out) + + def test_bundle_mode_summary_says_bundle(self): + out = self._run_with_mode(upload_mode="computed_species") + self.assertIn("computed-species bundle", out) + self.assertNotIn("conformer/calculation", out) + + def test_bundle_mode_omits_artifact_line_even_when_enabled(self): + # Inline artifacts mean the standalone artifact tally would mislead. 
+ out = self._run_with_mode(upload_mode="computed_species", artifacts_upload=True) + self.assertNotIn("artifacts: uploaded", out) + + def test_conformer_mode_emits_artifact_line_when_enabled(self): + out = self._run_with_mode(upload_mode="conformer", artifacts_upload=True) + self.assertIn("artifacts:", out) + + +# -------------------------------------------------------------------------- +# _resolve_artifact_path: prefer recorded _input over derivation +# -------------------------------------------------------------------------- + + +class TestResolveArtifactPath(unittest.TestCase): + """The legacy artifact sweep prefers ``output.yml``'s ``_input`` + field, falling back to settings-based derivation only when absent.""" + + def test_input_kind_prefers_recorded_field(self): + """When ``opt_input`` is on the record, it wins over the derived path.""" + species_record = { + "opt_log": "calcs/CH4/opt/input.log", + "opt_input": "calcs/CH4/opt/explicit_input.gjf", # NEW field + } + output_doc = {"opt_level": {"software": "gaussian"}} + path = _arc_entry._resolve_artifact_path( + kind="input", calc_type="opt", + species_record=species_record, output_doc=output_doc, + ) + self.assertEqual(path, "calcs/CH4/opt/explicit_input.gjf") + + def test_input_kind_falls_back_to_settings_when_field_absent(self): + """Older output.yml without ``_input`` still resolves via settings.""" + species_record = {"opt_log": "/abs/calcs/CH4/opt/input.log"} + output_doc = {"opt_level": {"software": "gaussian"}} + path = _arc_entry._resolve_artifact_path( + kind="input", calc_type="opt", + species_record=species_record, output_doc=output_doc, + ) + # Derived sibling: input.gjf next to the log. + self.assertEqual(path, "/abs/calcs/CH4/opt/input.gjf") + + def test_input_kind_falls_back_when_recorded_field_is_none(self): + """Explicit ``None`` in the record (deck wasn't kept) → fallback.""" + species_record = { + "opt_log": "/abs/calcs/CH4/opt/input.log", + "opt_input": None, + } + output_doc = {"opt_level": {"software": "gaussian"}} + path = _arc_entry._resolve_artifact_path( + kind="input", calc_type="opt", + species_record=species_record, output_doc=output_doc, + ) + self.assertEqual(path, "/abs/calcs/CH4/opt/input.gjf") + + def test_input_kind_per_job_picks_correct_recorded_field(self): + """Different calcs hit different ``_input`` fields, not all opt's.""" + species_record = { + "opt_log": "/abs/opt.log", "opt_input": "/abs/opt_deck.gjf", + "freq_log": "/abs/freq.log", "freq_input": "/abs/freq_deck.gjf", + "sp_log": "/abs/sp.log", "sp_input": "/abs/sp_deck.in", # cross-software run + } + output_doc = {"opt_level": {"software": "gaussian"}} + for calc, expected in ( + ("opt", "/abs/opt_deck.gjf"), + ("freq", "/abs/freq_deck.gjf"), + ("sp", "/abs/sp_deck.in"), # NOT input.gjf — sp uses its own software + ): + path = _arc_entry._resolve_artifact_path( + kind="input", calc_type=calc, + species_record=species_record, output_doc=output_doc, + ) + self.assertEqual(path, expected, + msg=f"{calc}: expected {expected}, got {path}") + + +if __name__ == "__main__": + unittest.main() diff --git a/arc/job/adapter.py b/arc/job/adapter.py index e8207394b2..1198fd1fef 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -218,9 +218,82 @@ def execute(self): with an HDF5 file that contains specific directions. The output is returned within the HDF5 file. The new ARC instance, representing a single worker, will run all of its jobs incore. 
+ + Connection sharing: for remote-queue jobs we lease one + :class:`SSHClient` from the process-global pool + (:mod:`arc.job.ssh_pool`) and reuse it for both file upload and + qsub/sbatch submission within this call. Across an entire ARC + run, every remote job for a given server reuses the *same* + pooled client — 100 TS guess opts share one paramiko Transport + instead of opening 200. Pipe mode currently can't bundle these + (``should_use_pipe`` refuses non-``local`` servers, see + ``arc/job/pipe/pipe_coordinator.py:77``); the pool is the + leverage available short of full remote-pipe support. """ - self.upload_files() execution_type = JobExecutionTypeEnum(self.execution_type) + use_shared_ssh = ( + execution_type == JobExecutionTypeEnum.queue + and self.server is not None + and self.server != 'local' + and not self.testing + ) + if use_shared_ssh: + from arc.job.ssh_pool import get_default_pool + with get_default_pool().borrow(self.server) as ssh: + self._shared_ssh = ssh + try: + self._dispatch_execution(execution_type) + finally: + # Pool retains the SSHClient; clearing the attr + # just prevents a later code path on this adapter + # from grabbing a stale reference if the pool + # subsequently reaps and reopens the connection. + self._shared_ssh = None + else: + self._dispatch_execution(execution_type) + if not self.restarted: + self._write_initiated_job_to_csv_file() + + def _open_or_borrow_ssh(self): + """Yield an :class:`SSHClient` for ``self.server``, in priority order: + + 1. ``self._shared_ssh`` if set — the per-call client opened by + :meth:`execute`. Available within the upload+submit window. + 2. The process-global pool (:mod:`arc.job.ssh_pool`) — keeps + one client alive across jobs for the run's lifetime, so the + hot status-poll loop reuses connections. + 3. A fresh ``SSHClient`` opened just for this call — only hit + when the pool can't construct one (testing, exotic env). + + Returns a context manager that does NOT close the underlying + client on exit; the pool retains ownership in case (2), and + case (3) opens-and-closes inline. + """ + from contextlib import contextmanager + shared = getattr(self, '_shared_ssh', None) + if shared is not None: + @contextmanager + def _shared_cm(): + yield shared + return _shared_cm() + try: + from arc.job.ssh_pool import get_default_pool + return get_default_pool().borrow(self.server) + except Exception: + # Pool refused (e.g., factory failed). Fall back to a + # one-shot client so we degrade gracefully — the caller + # gets correctness at the cost of one connection. + logger.debug("ssh pool unavailable; opening one-shot client", exc_info=True) + @contextmanager + def _fresh_cm(): + with SSHClient(self.server) as fresh: + yield fresh + return _fresh_cm() + + def _dispatch_execution(self, execution_type: 'JobExecutionTypeEnum') -> None: + """Inner body of :meth:`execute`, factored out so the SSH-share + wrapper around it stays small and readable.""" + self.upload_files() if execution_type == JobExecutionTypeEnum.incore: self.initial_time = datetime.datetime.now() self.job_status[0] = 'running' @@ -235,19 +308,25 @@ def execute(self): raise ValueError('Pipe execution is handled at the Scheduler level. ' 'JobAdapters inside a pipe must be executed by the worker ' "with execution_type='incore'.") - if not self.restarted: - self._write_initiated_job_to_csv_file() - def legacy_queue_execution(self): + def legacy_queue_execution(self, ssh: 'SSHClient | None' = None): """ Execute a job to the server's queue. 
The server could be either "local" or remote. + + ``ssh`` is an explicitly-passed shared connection. When ``None`` + we route through :meth:`_open_or_borrow_ssh` which prefers + ``self._shared_ssh`` (set by :meth:`execute`), then the + process-global pool, then opens fresh. """ self._log_job_execution() # Submit to queue, differentiate between local (same machine using its queue) and remote servers. if self.server != 'local': - with SSHClient(self.server) as ssh: + if ssh is not None: self.job_status[0], self.job_id = ssh.submit_job(remote_path=self.remote_path) + else: + with self._open_or_borrow_ssh() as borrowed: + self.job_status[0], self.job_id = borrowed.submit_job(remote_path=self.remote_path) else: # submit to the local queue self.job_status[0], self.job_id = submit_job(path=self.local_path) @@ -363,26 +442,24 @@ def set_file_paths(self): self.set_additional_file_paths() - def upload_files(self): + def upload_files(self, ssh: 'SSHClient | None' = None): """ Upload the relevant files for the job. + + ``ssh`` is an explicitly-passed shared connection. When ``None`` + we route through :meth:`_open_or_borrow_ssh` which prefers + ``self._shared_ssh`` (set by :meth:`execute`), then the + process-global pool, then opens fresh. """ if not self.testing: if self.execution_type != 'incore' and self.server != 'local': # If the job execution type is incore, then no need to upload any files. # Also, even if the job is submitted to the que, no need to upload files if the server is local. - with SSHClient(self.server) as ssh: - for up_file in self.files_to_upload: - logger.debug(f"Uploading {up_file['file_name']} source {up_file['source']} to {self.server}") - if up_file['source'] == 'path': - ssh.upload_file(remote_file_path=up_file['remote'], local_file_path=up_file['local']) - elif up_file['source'] == 'input_files': - ssh.upload_file(remote_file_path=up_file['remote'], file_string=up_file['local']) - else: - raise ValueError(f"Unclear file source for {up_file['file_name']}. Should either be 'path' or " - f"'input_files', got: {up_file['source']}") - if up_file['make_x']: - ssh.change_mode(mode='+x', file_name=up_file['file_name'], remote_path=self.remote_path) + if ssh is not None: + self._upload_with_ssh(ssh) + else: + with self._open_or_borrow_ssh() as borrowed: + self._upload_with_ssh(borrowed) else: # running locally, just copy the check file, if exists, to the job folder for up_file in self.files_to_upload: @@ -393,6 +470,25 @@ def upload_files(self): pass self.initial_time = datetime.datetime.now() + def _upload_with_ssh(self, ssh) -> None: + """SFTP-put every entry in ``self.files_to_upload`` over an open client. + + Factored out of :meth:`upload_files` so the with-shared vs. + with-new code paths share one body — adding a future per-file + knob (compression, retry, throttle) lands in one place. + """ + for up_file in self.files_to_upload: + logger.debug(f"Uploading {up_file['file_name']} source {up_file['source']} to {self.server}") + if up_file['source'] == 'path': + ssh.upload_file(remote_file_path=up_file['remote'], local_file_path=up_file['local']) + elif up_file['source'] == 'input_files': + ssh.upload_file(remote_file_path=up_file['remote'], file_string=up_file['local']) + else: + raise ValueError(f"Unclear file source for {up_file['file_name']}. 
Should either be 'path' or " + f"'input_files', got: {up_file['source']}") + if up_file['make_x']: + ssh.change_mode(mode='+x', file_name=up_file['file_name'], remote_path=self.remote_path) + def download_files(self): """ Download the relevant files. @@ -401,7 +497,7 @@ def download_files(self): if self.execution_type != 'incore' and self.server != 'local': # If the job execution type is incore, then no need to download any files. # Also, even if the job is submitted to the que, no need to download files if the server is local. - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: for dl_file in self.files_to_download: ssh.download_file(remote_file_path=dl_file['remote'], local_file_path=dl_file['local']) self.set_initial_and_final_times(ssh=ssh) @@ -416,7 +512,7 @@ def remove_remote_files(self): """ if self.server is None or self.server == 'local' or not self.remote_path: return - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: ssh.remove_dir(remote_path=self.remote_path) def set_initial_and_final_times(self, ssh: SSHClient | None = None): @@ -711,7 +807,7 @@ def delete(self): logger.debug(f'Deleting job {self.job_name} for {self.species_label}') if self.server != 'local': logger.debug(f'deleting job on {self.server}...') - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: ssh.delete_job(self.job_id) else: logger.debug('deleting job locally...') @@ -786,7 +882,7 @@ def _get_additional_job_info(self): if include_job_log: remote_paths.append(os.path.join(self.remote_path, 'job.log')) local_paths.append(local_file_path_3) - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: for local_file_path, remote_file_path in zip(local_paths, remote_paths): try: ssh.download_file(remote_file_path=remote_file_path, @@ -813,7 +909,7 @@ def _check_job_server_status(self) -> str: Possible statuses: ``initializing``, ``running``, ``errored on node xx``, ``done``. """ if self.server != 'local' and not self.testing: - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: return ssh.check_job_status(self.job_id) else: return check_job_status(self.job_id) diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index dd1a520620..4cfef9afc7 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ -410,5 +410,374 @@ def test_multiple_rotations(self): self.assertEqual(len(archives), 2) +# --------------------------------------------------------------------------- +# SSH connection sharing & pooling (Options 1 + 2). +# +# Option 1 (per-job share): one SSHClient covers both upload and submit +# inside a single execute() call — collapses 2N connections to N. +# Option 2 (process-lifetime pool): the SSHClient for a given server is +# kept alive across jobs — collapses N to a small constant. +# --------------------------------------------------------------------------- + + +class _SSHClientStub: + """In-memory SSHClient lookalike for the pool to hand out. + + Records every upload/submit so tests can assert which calls landed + on which (shared) client. The pool calls ``connect()`` after + instantiation; we no-op that since there's no real socket. + """ + + def __init__(self, server): + self.server = server + self.uploaded = [] + self.submits = [] + self.downloaded = [] + self._closed = False + # Mimic SSHClient's ``_ssh`` attribute so ssh_pool._is_alive() + # finds an active fake-Transport. 
+ self._ssh = _FakeParamikoSSH() + + def connect(self): + pass # the real one opens TCP+auth; we no-op for tests + + def close(self): + self._closed = True + self._ssh = None + + def __enter__(self): + return self + + def __exit__(self, *a): + return False + + def upload_file(self, *, remote_file_path, local_file_path=None, file_string=None): + self.uploaded.append(remote_file_path) + + def submit_job(self, remote_path, recursion=False): + self.submits.append(remote_path) + return 'initializing', 12345 + + def change_mode(self, *, mode, file_name, remote_path): + pass + + # Methods that the post-submit lifecycle paths exercise. + def check_job_status(self, job_id): + return 'running' + + def download_file(self, *, remote_file_path, local_file_path): + self.downloaded.append(remote_file_path) + + def remove_dir(self, *, remote_path): + pass + + def delete_job(self, job_id): + pass + + +class _FakeParamikoSSH: + """Stand-in for paramiko.SSHClient — _is_alive checks Transport.is_active().""" + def get_transport(self): + return _FakeTransport() + + +class _FakeTransport: + def is_active(self): + return True + + +class _StubFactoryPool: + """A pool whose factory builds _SSHClientStub instead of real SSHClient. + + Wraps the production ``SSHConnectionPool`` so reuse + lifecycle + semantics are exactly the production behavior — only the + underlying object is faked. + """ + + def __init__(self): + from arc.job.ssh_pool import SSHConnectionPool + self.created = [] # log of every server name we built a client for + def factory(server): + client = _SSHClientStub(server) + self.created.append(server) + return client + self._inner = SSHConnectionPool(factory=factory) + + def borrow(self, server): + return self._inner.borrow(server) + + def close_all(self): + self._inner.close_all() + + @property + def opens(self): + return self._inner.opens + + @property + def borrows(self): + return self._inner.borrows + + +class _MinimalAdapter(JobAdapter): + """Concrete JobAdapter with just enough state to exercise execute(). + + Skips the heavyweight construction the GaussianAdapter does — we + only need ``server``, ``execution_type``, ``files_to_upload``, + ``remote_path``, and ``testing=False`` for the SSH-share path. + """ + + job_adapter = 'mockter' + + def __init__(self, *, server, execution_type='queue'): + # Bypass JobAdapter.__init__ entirely — all of its real work + # (file paths, settings, csv setup) is unrelated to the SSH + # share contract we're testing here. + self.server = server + self.execution_type = execution_type + self.testing = False + self.restarted = True # skip _write_initiated_job_to_csv_file + self.files_to_upload = [ + {'file_name': 'input.gjf', 'source': 'path', + 'local': '/local/input.gjf', 'remote': '/remote/input.gjf', 'make_x': False}, + {'file_name': 'submit.sh', 'source': 'path', + 'local': '/local/submit.sh', 'remote': '/remote/submit.sh', 'make_x': True}, + ] + self.remote_path = '/remote' + self.local_path = '/local' + self.job_status = ['initializing', {'status': 'initializing'}] + self.job_id = 0 + self.initial_time = None + self.final_time = None + self.job_name = 'job_test' + self.species_label = 'spc_test' + + # JobAdapter requires these abstracts; trivial bodies are fine. 
+ def execute_incore(self): pass + def execute_queue(self): self.legacy_queue_execution() + def write_input_file(self): pass + def set_files(self): pass + def set_additional_file_paths(self): pass + def set_input_file_memory(self): pass + def upload_during_execution(self): pass + def _log_job_execution(self): pass + + +class TestSSHConnectionSharing(unittest.TestCase): + """``execute()`` shares one SSHClient per remote-queue job, and the + pool reuses it across jobs.""" + + def setUp(self): + # Inject a pool whose factory builds stubs, so the test never + # tries to open a real SSH connection to a server that isn't + # in this user's settings (e.g., 'server2'). + import arc.job.ssh_pool as _pool + self._stub_pool = _StubFactoryPool() + _pool.set_default_pool(self._stub_pool) + # Also stub the legacy-direct path: bare + # ``legacy_queue_execution()`` (called outside execute()) uses + # the SSHClient class in ``arc.job.adapter`` directly, so patch + # that name with a context-manager wrapper around our stub. + self._direct_patch = patch( + 'arc.job.adapter.SSHClient', + lambda server: _SSHClientStub(server), + ) + self._direct_patch.start() + + def tearDown(self): + import arc.job.ssh_pool as _pool + _pool.set_default_pool(None) + self._direct_patch.stop() + + def test_remote_queue_opens_one_ssh_per_job(self): + """Upload + submit share a single SSHClient inside one execute().""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + # One SSHClient created (the pool's first borrow), one borrow. + self.assertEqual(self._stub_pool.opens, 1) + self.assertEqual(self._stub_pool.borrows, 1) + + def test_remote_queue_clears_shared_ssh_after_dispatch(self): + """``self._shared_ssh`` is None after execute() returns.""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + self.assertIsNone(getattr(adapter, '_shared_ssh', None)) + + def test_local_server_opens_no_ssh(self): + """local-server queue jobs use the host's queue, no SSH at all.""" + adapter = _MinimalAdapter(server='local', execution_type='queue') + with patch('arc.job.adapter.submit_job', return_value=('initializing', 99)): + adapter.execute() + self.assertEqual(self._stub_pool.opens, 0) + self.assertEqual(self._stub_pool.borrows, 0) + + def test_incore_opens_no_ssh(self): + """incore execution runs in-process — never touches SSH.""" + adapter = _MinimalAdapter(server='server2', execution_type='incore') + adapter.execute() + self.assertEqual(self._stub_pool.opens, 0) + + def test_legacy_queue_execution_routes_through_pool_when_called_directly(self): + """Even when called bare (outside execute()), legacy_queue_execution + now reuses the pool — that's Option 2's payoff for adapter + ``execute_queue`` overrides that call ``self.legacy_queue_execution()`` + from inside their own custom flow. + """ + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.legacy_queue_execution() # bare — no execute() wrapper + self.assertEqual(self._stub_pool.opens, 1) + self.assertEqual(self._stub_pool.borrows, 1) + + def test_shared_ssh_carries_uploads_and_submit(self): + """The pooled SSHClient sees both upload calls AND the submit call.""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + # Inspect the stub the pool kept. 
+ self.assertEqual(self._stub_pool.opens, 1) + client = self._stub_pool._inner._clients['server2'] + self.assertEqual(len(client.uploaded), 2) + self.assertEqual(len(client.submits), 1) + + +class TestSSHConnectionPoolReuse(unittest.TestCase): + """The process-lifetime pool reuses one SSHClient across many jobs.""" + + def setUp(self): + import arc.job.ssh_pool as _pool + self._stub_pool = _StubFactoryPool() + _pool.set_default_pool(self._stub_pool) + + def tearDown(self): + import arc.job.ssh_pool as _pool + _pool.set_default_pool(None) + + def test_one_open_for_many_jobs_same_server(self): + """100 jobs against one server → 1 SSHClient, 100 borrows.""" + for _ in range(100): + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + self.assertEqual(self._stub_pool.opens, 1, "should reuse the same client") + self.assertEqual(self._stub_pool.borrows, 100) + + def test_separate_clients_per_distinct_server(self): + """Different servers → different clients, each opened once.""" + for _ in range(5): + _MinimalAdapter(server='server2', execution_type='queue').execute() + for _ in range(3): + _MinimalAdapter(server='server3', execution_type='queue').execute() + self.assertEqual(self._stub_pool.opens, 2) + self.assertEqual(self._stub_pool.borrows, 8) + self.assertEqual(sorted(self._stub_pool._inner._clients.keys()), + ['server2', 'server3']) + + def test_dead_client_is_reaped_and_reopened(self): + """If the underlying Transport reports inactive, pool reopens.""" + # First borrow → opens stub #1. + _MinimalAdapter(server='server2', execution_type='queue').execute() + client1 = self._stub_pool._inner._clients['server2'] + # Simulate a dead Transport (remote rebooted, etc.). + client1._ssh = None + # Next borrow should detect the dead client and open a fresh one. + _MinimalAdapter(server='server2', execution_type='queue').execute() + client2 = self._stub_pool._inner._clients['server2'] + self.assertIs(client1._closed, True, "stale client should be closed before reopen") + self.assertIsNot(client1, client2) + self.assertEqual(self._stub_pool.opens, 2) + + def test_close_all_closes_every_pooled_client(self): + for srv in ('server2', 'server3'): + _MinimalAdapter(server=srv, execution_type='queue').execute() + clients = list(self._stub_pool._inner._clients.values()) + self._stub_pool.close_all() + self.assertEqual(self._stub_pool._inner._clients, {}) + for c in clients: + self.assertTrue(c._closed) + + def test_close_all_is_idempotent(self): + _MinimalAdapter(server='server2', execution_type='queue').execute() + self._stub_pool.close_all() + # Second call must not raise or mutate state. + self._stub_pool.close_all() + self.assertEqual(self._stub_pool._inner._clients, {}) + + def test_status_poll_reuses_pooled_client(self): + """The hot path: hundreds of status checks open exactly one client. + + ARC polls a job's queue status every poll cycle for the entire + duration of the job. Pre-pool, each call opened a fresh + SSHClient. After Option 2, all polls reuse the pool's client + for that server — the dominant SSH-cost reducer in a real run. + """ + adapter = _MinimalAdapter(server='server2', execution_type='queue') + # Simulate 200 poll cycles (~1.5 hour run at 30s polling). 
+ for _ in range(200): + adapter._check_job_server_status() + self.assertEqual(self._stub_pool.opens, 1, "pool should reuse one client") + self.assertEqual(self._stub_pool.borrows, 200) + + def test_download_files_reuses_pooled_client(self): + """download_files (called once per finished job) uses the pool too.""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.files_to_download = [ + {'remote': '/r/output.log', 'local': '/l/output.log'}, + ] + # set_initial_and_final_times reads file mtimes — stub it. + adapter.set_initial_and_final_times = lambda ssh=None: None + adapter.download_files() + client = self._stub_pool._inner._clients['server2'] + self.assertIn('/r/output.log', client.downloaded) + self.assertEqual(self._stub_pool.opens, 1) + + def test_full_lifecycle_one_open_per_server(self): + """Submit + many polls + download + cleanup all share one pooled client. + + End-to-end view of one job's life: this collapses what was + previously ~(2 + N_polls + 1 + 1) ≈ N+4 individual SSH + connections into a single reused client. + """ + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.files_to_download = [{'remote': '/r/o.log', 'local': '/l/o.log'}] + adapter.set_initial_and_final_times = lambda ssh=None: None + + adapter.execute() # upload + submit (1 borrow) + for _ in range(50): # 50 status polls + adapter._check_job_server_status() + adapter.download_files() # 1 download borrow + adapter.remove_remote_files() # 1 cleanup borrow + adapter.delete() # 1 delete borrow + + # All phases share the same pooled client. + self.assertEqual(self._stub_pool.opens, 1) + # 1 execute + 50 polls + 1 download + 1 cleanup + 1 delete = 54 borrows. + self.assertEqual(self._stub_pool.borrows, 54) + + +class TestSSHPoolDefaultLifecycle(unittest.TestCase): + """The module-level default pool is lazy and resettable.""" + + def setUp(self): + import arc.job.ssh_pool as _pool + _pool.reset_default_pool() + self._pool_module = _pool + + def tearDown(self): + self._pool_module.reset_default_pool() + + def test_get_default_pool_is_idempotent(self): + p1 = self._pool_module.get_default_pool() + p2 = self._pool_module.get_default_pool() + self.assertIs(p1, p2) + + def test_reset_default_pool_drops_the_instance(self): + p1 = self._pool_module.get_default_pool() + self._pool_module.reset_default_pool() + p2 = self._pool_module.get_default_pool() + self.assertIsNot(p1, p2) + + def test_set_default_pool_replaces_instance(self): + replacement = _StubFactoryPool() + self._pool_module.set_default_pool(replacement) + self.assertIs(self._pool_module.get_default_pool(), replacement) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/ssh.py b/arc/job/ssh.py index b548b5eed2..c21cf9f3f0 100644 --- a/arc/job/ssh.py +++ b/arc/job/ssh.py @@ -398,7 +398,7 @@ def close(self) -> None: @check_connections def get_last_modified_time(self, remote_file_path_1: str, - remote_file_path_2: str | None, + remote_file_path_2: str | None = None, ) -> datetime.datetime | None: """ Returns the last modified time of ``remote_file_path_1`` if the file exists, diff --git a/arc/job/ssh_pool.py b/arc/job/ssh_pool.py new file mode 100644 index 0000000000..1f86de6764 --- /dev/null +++ b/arc/job/ssh_pool.py @@ -0,0 +1,156 @@ +"""Persistent per-server SSHClient pool for the lifetime of an ARC run. + +Without this, each remote-queue job opens its own TCP+auth handshake +for upload, then another for qsub. 
Option 1 (in :mod:`arc.job.adapter`)
+collapsed those two into one (per-job sharing). This module is Option
+2: extend the share across ALL jobs run during this Python process,
+so 100 TS-guess opt jobs end up sharing one paramiko Transport instead
+of opening 100 of them. This is the closest equivalent to OpenSSH's
+``ControlMaster``, applied at the library level for paramiko.
+
+Concurrency: ARC's scheduler is single-threaded (verified — no
+``Thread`` / ``asyncio`` / ``concurrent.futures`` imports across
+``scheduler.py`` / ``main.py`` / ``adapter.py``), so the pool does no
+locking. A future async/parallel scheduler would need per-server
+locks; flagged in :meth:`SSHConnectionPool.borrow`.
+
+Lifecycle: the default process-global pool is opened lazily on first
+borrow and closed via :func:`reset_default_pool`. ARC.py's ``main()``
+calls that on exit so pooled connections close cleanly even on
+ctrl-C / crash; tests call it in ``tearDown`` to start fresh.
+"""
+
+from contextlib import contextmanager
+from typing import Callable
+
+from arc.common import get_logger
+from arc.job.ssh import SSHClient
+
+logger = get_logger()
+
+
+SSHClientFactory = Callable[[str], SSHClient]
+
+
+def _default_factory(server: str) -> SSHClient:
+    """Open and connect a real SSHClient. Override for tests."""
+    client = SSHClient(server)
+    client.connect()
+    return client
+
+
+class SSHConnectionPool:
+    """Process-lifetime cache of SSHClient instances keyed by server name.
+
+    One client per server, opened lazily on first borrow, kept alive
+    until :meth:`close_all` is called (or the process exits). Health
+    is re-checked on every operation by the existing
+    ``check_connections`` decorator on SSHClient methods, so a stale
+    Transport is silently re-established mid-run.
+    """
+
+    def __init__(self, factory: SSHClientFactory = _default_factory):
+        self._factory = factory
+        self._clients: dict[str, SSHClient] = {}
+        # Counters expose pool behavior to tests/observability without
+        # forcing them to peek at internals or hook the factory.
+        self.opens = 0
+        self.borrows = 0
+
+    @contextmanager
+    def borrow(self, server: str):
+        """Lease the pool's SSHClient for ``server``.
+
+        Returns a context manager yielding an :class:`SSHClient`.
+        Exiting the context does NOT close the client — the pool
+        retains ownership. The borrowed client is transient by
+        contract; do not stash it past the ``with`` block.
+
+        Concurrent borrows of the same server are not safe today.
+        ARC's scheduler is single-threaded, so this hasn't bitten;
+        a parallel scheduler would need a per-server lock around the
+        yield (or a small stack of free clients instead of a single
+        cached one).
+        """
+        self.borrows += 1
+        client = self._clients.get(server)
+        if client is None or not _is_alive(client):
+            if client is not None:
+                _close_quietly(client, f"reaping dead {server} SSHClient before reopen")
+            client = self._factory(server)
+            self._clients[server] = client
+            self.opens += 1
+            logger.debug("ssh_pool: opened SSHClient for %s (total opens=%d)", server, self.opens)
+        else:
+            logger.debug("ssh_pool: reusing SSHClient for %s", server)
+        yield client
+        # No close on exit — pool keeps the connection.
+
+    def close_all(self) -> None:
+        """Close every pooled client. Safe to call multiple times."""
+        for server, client in list(self._clients.items()):
+            _close_quietly(client, f"closing pooled {server} SSHClient")
+        self._clients.clear()
+
+
+def _is_alive(client: SSHClient) -> bool:
+    """Cheap liveness check: does the paramiko Transport report active?
+ + Doesn't roundtrip to the server — the SSHClient method's own + ``check_connections`` decorator does that on the next call. This is + just enough to skip the obvious "connection got reset between + jobs" case so we don't hand out a known-dead handle. + """ + underlying = getattr(client, "_ssh", None) + if underlying is None: + return False + transport_getter = getattr(underlying, "get_transport", None) + if transport_getter is None: + return False + transport = transport_getter() + return bool(transport and transport.is_active()) + + +def _close_quietly(client: SSHClient, context: str) -> None: + try: + client.close() + except Exception: + # Pool teardown should never propagate a close error; ARC's + # main path is past the work that needed the connection. + logger.debug("ssh_pool: close errored %s", context, exc_info=True) + + +# Process-global default pool. Lazily instantiated. Reset between ARC +# runs (and between tests) via reset_default_pool(). +_default_pool: SSHConnectionPool | None = None + + +def get_default_pool() -> SSHConnectionPool: + """Return the process-global pool, creating it on first call.""" + global _default_pool + if _default_pool is None: + _default_pool = SSHConnectionPool() + return _default_pool + + +def set_default_pool(pool: SSHConnectionPool | None) -> None: + """Replace the process-global pool. Mainly for tests that want to + inject a stub-factory pool without monkeypatching the module.""" + global _default_pool + _default_pool = pool + + +def reset_default_pool() -> None: + """Close and discard the default pool. Idempotent.""" + global _default_pool + if _default_pool is not None: + _default_pool.close_all() + _default_pool = None + + +__all__ = [ + "SSHClientFactory", + "SSHConnectionPool", + "get_default_pool", + "reset_default_pool", + "set_default_pool", +] diff --git a/arc/output.py b/arc/output.py index 69b92b595b..c3df70d2f3 100644 --- a/arc/output.py +++ b/arc/output.py @@ -106,12 +106,24 @@ def write_output_yml( doc['atom_energy_corrections'] = aec doc['bond_additivity_corrections'] = bac + # ---- per-job software (used for input-deck filename lookup) ---------------- + # freq/sp fall back to opt's software because the runtime falls back + # to opt_level when freq_level/sp_level aren't explicitly set, and + # the same level → same software → same deck filename. + opt_software = getattr(opt_level, 'software', None) + software_by_job = { + 'opt': opt_software, + 'freq': getattr(freq_level, 'software', None) or opt_software, + 'sp': getattr(sp_level, 'software', None) or opt_software, + } + # ---- species and TSs -------------------------------------------------------- point_groups = _compute_point_groups(species_dict, project_directory) doc['species'] = [] doc['transition_states'] = [] for spc in species_dict.values(): - d = _spc_to_dict(spc, output_dict, project_directory, point_groups, irc_requested=irc_requested) + d = _spc_to_dict(spc, output_dict, project_directory, point_groups, + irc_requested=irc_requested, software_by_job=software_by_job) if spc.is_ts: doc['transition_states'].append(d) else: @@ -250,6 +262,44 @@ def _parse_opt_log(geo_path: str | None, project_directory: str) -> tuple: return None, None +def _input_filename_for(software: str | None) -> str | None: + """Return the ESS-specific input deck filename, or None. + + Pulls from ``settings['input_filenames']`` so the mapping stays + in one place. 
Software not in the map (e.g., ``gcn``, ``torchani``, + ``mockter`` — generally not "real" ESS jobs) returns None and the + caller emits no input-deck path for that job. + """ + if not software: + return None + name = str(software).lower() + return (settings.get('input_filenames') or {}).get(name) + + +def _derive_input_path( + log_path: str | None, + software: str | None, + project_directory: str, +) -> str | None: + """Return the input deck path (project-relative) for a given job log. + + The input deck is a sibling of the log file, named per + ``input_filenames[software]``. Existence is checked on disk: if the + file isn't there (e.g., archived runs that kept the log but discarded + the deck), this returns None rather than emitting a ghost path. + """ + if not log_path: + return None + fname = _input_filename_for(software) + if not fname: + return None + abs_log = log_path if os.path.isabs(log_path) else os.path.join(project_directory, log_path) + candidate = os.path.join(os.path.dirname(abs_log), fname) + if not os.path.isfile(candidate): + return None + return _make_rel_path(candidate, project_directory) + + def _get_ess_versions(paths: dict, project_directory: str) -> dict[str, str] | None: """ Parse ESS version strings from each available log file (sp, opt, freq, neb). @@ -429,8 +479,17 @@ def _compute_point_groups(species_dict: dict, project_directory: str) -> dict[st def _spc_to_dict(spc, output_dict: dict, project_directory: str, - point_groups: dict | None = None, irc_requested: bool = True) -> dict: - """Build the per-species/TS section for output.yml.""" + point_groups: dict | None = None, irc_requested: bool = True, + software_by_job: dict[str, str | None] | None = None) -> dict: + """Build the per-species/TS section for output.yml. + + ``software_by_job`` is an optional ``{'opt': name, 'freq': name, + 'sp': name}`` map that lets this function emit per-job input-deck + paths (``opt_input``, ``freq_input``, ``sp_input``) alongside the + log paths. When omitted (the back-compat path), the input fields + come out as ``None`` and downstream consumers proceed with logs only. + """ + software_by_job = software_by_job or {} label = spc.label entry = output_dict.get(label, {}) converged = entry.get('convergence') is True @@ -515,6 +574,20 @@ def _spc_to_dict(spc, output_dict: dict, project_directory: str, d['freq_log'] = _make_rel_path(paths.get('freq') or None, project_directory) d['sp_log'] = _make_rel_path(paths.get('sp') or None, project_directory) + # ── ESS input deck paths ──────────────────────────────────────────────── + # Same directory as the corresponding log, with the per-software + # filename from settings['input_filenames']. None when the file isn't + # on disk (the consumer treats that as "no deck available"). 
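+    # E.g. (hypothetical run): a geo log at 'calcs/CH4/opt/input.log' from
+    # a 'gaussian' opt yields 'calcs/CH4/opt/input.gjf', provided that deck
+    # file is actually on disk.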
+ d['opt_input'] = _derive_input_path( + paths.get('geo') or None, software_by_job.get('opt'), project_directory, + ) + d['freq_input'] = _derive_input_path( + paths.get('freq') or None, software_by_job.get('freq'), project_directory, + ) + d['sp_input'] = _derive_input_path( + paths.get('sp') or None, software_by_job.get('sp'), project_directory, + ) + # ── ESS software version (from SP log, or fall back to geo/freq log) ── d['ess_versions'] = _get_ess_versions(paths, project_directory) if converged else None diff --git a/arc/output_test.py b/arc/output_test.py index bda8df47fd..ae8b112a50 100644 --- a/arc/output_test.py +++ b/arc/output_test.py @@ -873,6 +873,135 @@ def test_no_point_groups(self): result = _spc_to_dict(spc, output_dict, '/abs') self.assertIsNone(result['statmech']['point_group']) + # ------------------------------------------------------------------ + # Input-deck path emission (`_input` keys). + # ------------------------------------------------------------------ + + def test_input_paths_default_none_when_no_software_info(self): + """Back-compat: callers that don't pass ``software_by_job`` get None.""" + spc = self._make_spc_mock() + output_dict = {'CH4': { + 'convergence': True, + 'paths': {'geo': '/abs/opt.log', 'freq': '/abs/freq.log', 'sp': '/abs/sp.log'}, + 'job_types': {'opt': True}, + }} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertIsNone(result['opt_input']) + self.assertIsNone(result['freq_input']) + self.assertIsNone(result['sp_input']) + + def test_input_path_emitted_when_file_exists(self): + """Gaussian's input.gjf next to opt.log → opt_input populated, project-relative.""" + proj = tempfile.mkdtemp(prefix='arc-output-test-') + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + opt_dir = os.path.join(proj, 'calcs', 'CH4', 'opt') + os.makedirs(opt_dir, exist_ok=True) + opt_log = os.path.join(opt_dir, 'input.log') + opt_inp = os.path.join(opt_dir, 'input.gjf') + for p in (opt_log, opt_inp): + with open(p, 'w') as f: + f.write('x') + spc = self._make_spc_mock() + output_dict = {'CH4': { + 'convergence': True, + 'paths': {'geo': opt_log}, + 'job_types': {'opt': True}, + }} + result = _spc_to_dict( + spc, output_dict, proj, + software_by_job={'opt': 'gaussian', 'freq': None, 'sp': None}, + ) + self.assertEqual(result['opt_input'], 'calcs/CH4/opt/input.gjf') + + def test_input_path_none_when_input_file_missing(self): + """Software is known, log is on disk, but input deck isn't → None (no ghost path).""" + proj = tempfile.mkdtemp(prefix='arc-output-test-') + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + opt_dir = os.path.join(proj, 'calcs', 'CH4', 'opt') + os.makedirs(opt_dir, exist_ok=True) + opt_log = os.path.join(opt_dir, 'input.log') + with open(opt_log, 'w') as f: + f.write('x') + # no input.gjf written + spc = self._make_spc_mock() + output_dict = {'CH4': { + 'convergence': True, + 'paths': {'geo': opt_log}, + 'job_types': {'opt': True}, + }} + result = _spc_to_dict( + spc, output_dict, proj, + software_by_job={'opt': 'gaussian'}, + ) + self.assertIsNone(result['opt_input']) + + def test_input_path_uses_software_specific_filename(self): + """orca → input.in, cfour → ZMAT — driven by settings['input_filenames'].""" + proj = tempfile.mkdtemp(prefix='arc-output-test-') + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + # opt: orca run, deck is input.in + opt_dir = os.path.join(proj, 'calcs', 'CH4', 'opt') + os.makedirs(opt_dir, exist_ok=True) + opt_log = os.path.join(opt_dir, 'input.log') + opt_inp = 
os.path.join(opt_dir, 'input.in') + for p in (opt_log, opt_inp): + open(p, 'w').close() + # sp: cfour run, deck is ZMAT + sp_dir = os.path.join(proj, 'calcs', 'CH4', 'sp') + os.makedirs(sp_dir, exist_ok=True) + sp_log = os.path.join(sp_dir, 'output.out') + sp_inp = os.path.join(sp_dir, 'ZMAT') + for p in (sp_log, sp_inp): + open(p, 'w').close() + spc = self._make_spc_mock() + output_dict = {'CH4': { + 'convergence': True, + 'paths': {'geo': opt_log, 'sp': sp_log}, + 'job_types': {'opt': True}, + }} + result = _spc_to_dict( + spc, output_dict, proj, + software_by_job={'opt': 'orca', 'sp': 'cfour'}, + ) + self.assertEqual(result['opt_input'], 'calcs/CH4/opt/input.in') + self.assertEqual(result['sp_input'], 'calcs/CH4/sp/ZMAT') + + def test_input_path_none_when_log_missing(self): + """No log path → no input path, regardless of software.""" + spc = self._make_spc_mock() + output_dict = {'CH4': { + 'convergence': True, + 'paths': {}, # no geo/freq/sp + 'job_types': {'opt': True}, + }} + result = _spc_to_dict( + spc, output_dict, '/abs', + software_by_job={'opt': 'gaussian', 'freq': 'gaussian', 'sp': 'gaussian'}, + ) + self.assertIsNone(result['opt_input']) + self.assertIsNone(result['freq_input']) + self.assertIsNone(result['sp_input']) + + def test_input_path_none_when_software_unknown(self): + """Software not in settings['input_filenames'] (e.g., gcn) → None.""" + proj = tempfile.mkdtemp(prefix='arc-output-test-') + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + opt_dir = os.path.join(proj, 'calcs', 'CH4', 'opt') + os.makedirs(opt_dir, exist_ok=True) + opt_log = os.path.join(opt_dir, 'output.yml') + open(opt_log, 'w').close() + spc = self._make_spc_mock() + output_dict = {'CH4': { + 'convergence': True, + 'paths': {'geo': opt_log}, + 'job_types': {'opt': True}, + }} + result = _spc_to_dict( + spc, output_dict, proj, + software_by_job={'opt': 'gcn'}, # gcn has no entry in input_filenames + ) + self.assertIsNone(result['opt_input']) + class TestComputePointGroups(unittest.TestCase): """Tests for _compute_point_groups.""" diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 9e17b62d91..1b066f07bc 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -453,10 +453,12 @@ def add_rmg_db_candidates(prefix: str) -> None: rmg_candidates.extend([ os.path.join(home, 'Code', 'RMG-Py'), + os.path.join(home, 'code', 'RMG-Py'), os.path.join(home, 'runner', 'work', 'ARC', 'ARC', 'RMG-Py') ]) rmg_db_candidates.extend([ os.path.join(home, 'Code', 'RMG-database'), + os.path.join(home, 'code', 'RMG-database'), os.path.join(home, 'runner', 'work', 'ARC', 'ARC', 'RMG-database') ]) diff --git a/arc/statmech/arkane.py b/arc/statmech/arkane.py index 96de727e9a..bea8005850 100644 --- a/arc/statmech/arkane.py +++ b/arc/statmech/arkane.py @@ -215,7 +215,11 @@ def compute_thermo(self, delete_existing_subdir=True) self.generate_arkane_input(statmech_dir=statmech_dir, skip_rotors=skip_rotors, e0_only=e0_only) self.generate_species_files(statmech_dir, skip_rotors, check_compute_thermo=not e0_only) - run_arkane(statmech_dir) + if not run_arkane(statmech_dir): + # No output.py was produced — parsing would either error or + # silently miss data. Skip cleanly; matches the kinetics + # caller's gate. 
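+            # (run_arkane also returns False when its pre-flight checks
+            # fail, e.g. a missing input.py; both cases land here.)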
+ return self.parse_arkane_thermo_output(statmech_dir) def compute_high_p_rate_coefficient(self, @@ -565,35 +569,79 @@ def run_arkane(statmech_dir: str) -> bool: shell=True, no_fail=True, executable='/bin/bash') - if std_err: - ignorable_phrases = [ - "Open Babel Warning", - "Accepted unusual valence", - "==============================", - "pjrt_executable.cc", - ] - - real_errors = [] - for line in std_err: - line = line.strip() - if not line: - continue - if not any(phrase in line for phrase in ignorable_phrases): - real_errors.append(line) - - if real_errors: - logger.info(f'Arkane run failed with errors:\n{std_err}') - return False + # The authoritative success signal is whether Arkane wrote + # ``output.py``. Stderr content alone is unreliable — upstream tools + # (OpenBabel, Arkane's ``git rev-parse`` provenance stamp, JAX/XLA's + # TPU probe) emit lines that look like errors but don't represent + # failure, and using stderr-non-empty as a gate caused us to discard + # complete Arkane runs (see the kinetics caller, which bails on a + # False return). The classification below is now advisory: if stderr + # has lines that don't match a known-cosmetic pattern, we log them + # at WARNING so they're visible without making them load-bearing. + real_errors = _classify_arkane_stderr(std_err) output_file = os.path.join(statmech_dir, 'output.py') - if not os.path.isfile(output_file): - logger.error(f'Arkane run finished but {output_file} was not created. Check stdout/stderr.') + output_present = os.path.isfile(output_file) + + if real_errors and output_present: + logger.warning( + "Arkane stderr contained non-cosmetic lines but output.py " + "was produced; proceeding. Lines:\n%s", + "\n".join(real_errors), + ) + elif real_errors and not output_present: + # Genuine failure: stderr has real errors AND no output. The + # combination is the most diagnostic signal we can give; log + # both so the user sees cause + effect. + logger.error("Arkane run failed; stderr:\n%s", "\n".join(real_errors)) + + if not output_present: + logger.error( + f'Arkane run finished but {output_file} was not created. ' + 'Check stdout/stderr.' + ) return False logger.debug(f'Arkane run completed:\n{std_out}') return True +# Cosmetic stderr lines from upstream tools that Arkane shells out to. +# Tracking these explicitly (rather than accepting all stderr) keeps the +# WARNING log signal-rich: if a NEW source of stderr noise appears, it +# shows up loudly until we either fix it or add it here. +_ARKANE_STDERR_IGNORABLE_PHRASES: tuple[str, ...] = ( + "Open Babel Warning", + "Accepted unusual valence", + "==============================", + "pjrt_executable.cc", + # Arkane runs `git rev-parse` to stamp its output with the + # RMG-database commit; when run from a non-git CWD (the + # common case under conda-installed databases) git emits + # this and Arkane carries on regardless. + "fatal: not a git repository", +) + + +def _classify_arkane_stderr(std_err: list[str] | None) -> list[str]: + """Return the subset of ``std_err`` lines that aren't known cosmetic noise. + + Used by :func:`run_arkane` for advisory logging only — the boolean + success signal is whether Arkane produced ``output.py``, not whether + this list is empty. 
+ """ + if not std_err: + return [] + real: list[str] = [] + for line in std_err: + stripped = line.strip() + if not stripped: + continue + if not any(phrase in stripped for phrase in _ARKANE_STDERR_IGNORABLE_PHRASES): + real.append(stripped) + return real + + def clean_output_directory(species_path: str, # todo is_ts: bool = False, ) -> None: diff --git a/arc/statmech/arkane_test.py b/arc/statmech/arkane_test.py index 9ff489de10..4798470309 100644 --- a/arc/statmech/arkane_test.py +++ b/arc/statmech/arkane_test.py @@ -681,5 +681,131 @@ def test_check_bacs_different_aec_and_bac_keys(self): self.assertTrue(result) +class TestRunArkaneOutputPySignal(unittest.TestCase): + """``run_arkane``'s pass/fail signal is whether ``output.py`` was + produced — NOT whether stderr is empty. The stderr classification + is now advisory (logged) but doesn't gate the return value. + + Pre-fix bug: complete Arkane runs were discarded because cosmetic + stderr noise (git rev-parse failure, OpenBabel warnings) tripped + a false-failure gate. Both the thermo and kinetics callers now + use the same authoritative output.py-existence signal. + """ + + def setUp(self): + from arc.statmech.arkane import run_arkane + self._run_arkane = run_arkane + self.tmp = tempfile.mkdtemp(prefix='arkane-stderr-test-') + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + # Pre-flight check in run_arkane requires input.py before the + # subprocess fires. + with open(os.path.join(self.tmp, 'input.py'), 'w') as f: + f.write('# fake arkane input\n') + + def _create_output_py(self): + with open(os.path.join(self.tmp, 'output.py'), 'w') as f: + f.write('# fake arkane output\n') + + def _run_with_stderr(self, stderr_lines): + with patch('arc.statmech.arkane.execute_command', + return_value=(['ok'], stderr_lines)): + return self._run_arkane(self.tmp) + + # ---- output.py present: success regardless of stderr ---- + + def test_empty_stderr_with_output_returns_true(self): + self._create_output_py() + self.assertTrue(self._run_with_stderr([])) + + def test_cosmetic_stderr_with_output_returns_true(self): + """OpenBabel + git warnings are classified as cosmetic and don't gate.""" + self._create_output_py() + self.assertTrue(self._run_with_stderr([ + 'fatal: not a git repository (or any of the parent directories): .git', + '*** Open Babel Warning in InChI code', + ' #1 :Accepted unusual valence(s): C(3)', + ])) + + def test_real_error_with_output_returns_true_and_warns(self): + """If output.py was produced, even a Python traceback in stderr + doesn't fail the run — but it IS logged at WARNING so the + operator sees it. This is the load-bearing change vs. the old + behavior that would have returned False here. + """ + self._create_output_py() + with self.assertLogs('arc', level='WARNING') as logs: + result = self._run_with_stderr([ + 'Traceback (most recent call last):', + 'KeyError: "level_of_theory"', + ]) + self.assertTrue(result, "output.py exists → success regardless of stderr") + self.assertTrue( + any('non-cosmetic lines' in m for m in logs.output), + f"expected the advisory warning to fire; got {logs.output}", + ) + + # ---- output.py missing: failure ---- + + def test_missing_output_py_returns_false(self): + """No output.py = Arkane never wrote its result. 
That's the + only condition that should mean 'failure'.""" + self.assertFalse(self._run_with_stderr([])) + + def test_missing_output_py_with_real_error_logs_both_diagnostics(self): + """When output is missing AND stderr has real errors, log both + — the operator gets cause (stderr) and effect (no output).""" + with self.assertLogs('arc', level='ERROR') as logs: + result = self._run_with_stderr([ + 'Traceback (most recent call last):', + 'ImportError: rmgpy not installed', + ]) + self.assertFalse(result) + joined = '\n'.join(logs.output) + self.assertIn('Arkane run failed', joined) + self.assertIn('ImportError', joined) + self.assertIn('was not created', joined) + + # ---- pre-flight checks still gate ---- + + def test_missing_input_py_returns_false_pre_flight(self): + """If the pre-flight finds no input.py, we never even run the + subprocess — and return False without checking stderr.""" + os.unlink(os.path.join(self.tmp, 'input.py')) + self.assertFalse(self._run_with_stderr([])) + + def test_missing_statmech_dir_returns_false_pre_flight(self): + from arc.statmech.arkane import run_arkane + self.assertFalse(run_arkane('/nonexistent/dir')) + + +class TestClassifyArkaneStderr(unittest.TestCase): + """Direct tests of the stderr-noise filter, independent of run_arkane.""" + + def setUp(self): + from arc.statmech.arkane import _classify_arkane_stderr + self._classify = _classify_arkane_stderr + + def test_empty_input_returns_empty(self): + self.assertEqual(self._classify(None), []) + self.assertEqual(self._classify([]), []) + + def test_only_cosmetic_lines_return_empty(self): + self.assertEqual(self._classify([ + '==============================', + '*** Open Babel Warning in InChI code', + ' #1 :Accepted unusual valence(s): C(3)', + 'fatal: not a git repository (or any of the parent directories): .git', + '', # blank lines also dropped + ]), []) + + def test_real_lines_returned_stripped(self): + result = self._classify([ + 'fatal: not a git repository', + ' KeyError: "level_of_theory" ', + '*** Open Babel Warning in InChI code', + ]) + self.assertEqual(result, ['KeyError: "level_of_theory"']) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/tckdb/adapter.py b/arc/tckdb/adapter.py index 9ef552b487..2777dea3fa 100644 --- a/arc/tckdb/adapter.py +++ b/arc/tckdb/adapter.py @@ -1,9 +1,18 @@ -"""Adapter that turns ARC objects into TCKDB conformer-upload payloads. +"""Adapter that turns an ARC ``output.yml`` record into a TCKDB conformer upload. The adapter is the only ARC module that knows the shape of a TCKDB upload. It builds a JSON payload matching ``ConformerUploadRequest``, writes it to disk, and (optionally) hands it to ``tckdb-client``. +Source of truth for upload data is ``/output/output.yml`` — +``arc/output.py`` was designed for this consumer (see its module +docstring). The adapter therefore takes two dicts: the full output +document (for top-level levels-of-theory, ARC version, etc.) and one +species record from ``output_doc['species']`` or +``output_doc['transition_states']``. This keeps the adapter decoupled +from ARC's live object model and makes a separate replay path (read +output.yml later, post payloads, no ARC needed) trivial. + Three guarantees: 1. If the adapter is disabled or no config is provided, it is a no-op. @@ -13,19 +22,31 @@ but does not raise. ``strict=True`` flips that. 
""" -from __future__ import annotations - +import base64 +import hashlib import logging -from dataclasses import dataclass +import os +from collections.abc import Mapping +from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Mapping +from typing import Any + +from tckdb_client import TCKDBClient -from arc.tckdb.config import TCKDBConfig -from arc.tckdb.idempotency import IdempotencyInputs, build_idempotency_key +from arc.tckdb.config import IMPLEMENTED_ARTIFACT_KINDS, TCKDBConfig +from arc.tckdb.idempotency import ( + ArtifactIdempotencyInputs, + IdempotencyInputs, + build_artifact_idempotency_key, + build_idempotency_key, +) from arc.tckdb.payload_writer import ( + ArtifactSidecarMetadata, PayloadWriter, SidecarMetadata, + WrittenArtifact, WrittenPayload, + _utcnow_iso, ) @@ -33,11 +54,61 @@ CONFORMER_UPLOAD_ENDPOINT = "/uploads/conformers" PAYLOAD_KIND = "conformer_calculation" +ARTIFACTS_ENDPOINT_TEMPLATE = "/calculations/{calculation_id}/artifacts" + +# Computed-species bundle endpoint. One self-contained payload that +# carries species_entry + conformers + calcs + artifacts + thermo, with +# all cross-references expressed as local string keys (no DB ids). +COMPUTED_SPECIES_ENDPOINT = "/uploads/computed-species" +COMPUTED_SPECIES_KIND = "computed_species" + +# Local calculation-key namespace within a computed-species bundle. +# These keys are referenced from `depends_on.parent_calculation_key` and +# `thermo.source_calculations[].calculation_key`. They have no relation +# to TCKDB-assigned calculation_ids — the bundle endpoint mints those +# server-side and returns them in the response. +_CALC_KEY_OPT = "opt" +_CALC_KEY_FREQ = "freq" +_CALC_KEY_SP = "sp" + +# Per-calc record-field map for output_log artifacts. Same convention as +# `_LOG_FIELD_BY_CALC_KEY` in the existing artifact path: the species +# record carries `_log` paths that come straight from +# `arc/output.py::_spc_to_dict`. +_LOG_FIELD_BY_CALC_KEY = { + _CALC_KEY_OPT: "opt_log", + _CALC_KEY_FREQ: "freq_log", + _CALC_KEY_SP: "sp_log", +} + +# Same shape, for input-deck paths. `arc/output.py` emits these per-job +# (with per-job software → per-job filename), only when the deck file +# actually exists on disk; null when the deck wasn't kept (archived runs). +_INPUT_FIELD_BY_CALC_KEY = { + _CALC_KEY_OPT: "opt_input", + _CALC_KEY_FREQ: "freq_input", + _CALC_KEY_SP: "sp_input", +} + +# (artifact_kind, record-field-map) pairs that the inline-artifact +# helper iterates per calc. Keeping the mapping data-driven so adding +# checkpoints (or any future kind) is one tuple, not a code branch. +_INLINE_ARTIFACT_SOURCES: tuple[tuple[str, dict[str, str]], ...] = ( + ("output_log", _LOG_FIELD_BY_CALC_KEY), + ("input", _INPUT_FIELD_BY_CALC_KEY), +) @dataclass(frozen=True) class UploadOutcome: - """Result of one adapter invocation; mirrors the sidecar status.""" + """Result of one adapter invocation; mirrors the sidecar status. + + ``primary_calculation`` and ``additional_calculations`` are populated + on successful conformer uploads (status == ``uploaded``) and carry + the server's :class:`CalculationUploadRef` shape — i.e. dicts with + ``calculation_id``, ``type``, and (for additional calcs) ``request_index``. + They are ``None`` / empty otherwise. 
+ """ status: str # pending | uploaded | failed | skipped payload_path: Path @@ -45,6 +116,22 @@ class UploadOutcome: idempotency_key: str error: str | None = None response: Any = None + primary_calculation: dict[str, Any] | None = None + additional_calculations: list[dict[str, Any]] = field(default_factory=list) + + +@dataclass(frozen=True) +class ArtifactUploadOutcome: + """Result of one artifact upload attempt.""" + + status: str # uploaded | failed | skipped + sidecar_path: Path | None + idempotency_key: str | None + calculation_id: int + kind: str + error: str | None = None + response: Any = None + skip_reason: str | None = None class TCKDBAdapter: @@ -63,6 +150,9 @@ def __init__( client_factory=None, ): self._config = config + self._project_directory = ( + Path(project_directory) if project_directory is not None else None + ) # Resolve payload_dir against the project directory if it's relative, # so payloads land under the active ARC project rather than CWD. payload_root = Path(config.payload_dir) @@ -75,47 +165,36 @@ def __init__( # Public entry points # ------------------------------------------------------------------ - def submit_conformer( + def submit_from_output( self, *, - species, - level, - xyz: str | Mapping[str, Any] | None = None, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], conformer_index: int = 0, - calculation_type: str = "opt", - calculation_quality: str = "raw", - opt_result: Mapping[str, Any] | None = None, - freq_result: Mapping[str, Any] | None = None, - sp_result: Mapping[str, Any] | None = None, - arc_version: str | None = None, - arc_git_commit: str | None = None, extra_label: str | None = None, ) -> UploadOutcome | None: """Build, write, and (if configured) upload one conformer payload. + ``output_doc`` is the full parsed ``output.yml``. ``species_record`` + is one entry from ``output_doc['species']`` or + ``output_doc['transition_states']``. + Returns ``None`` if the adapter is disabled (so callers can write - ``adapter.submit_conformer(...)`` without an enabled-check). + ``adapter.submit_from_output(...)`` without an enabled-check). """ if not self._config.enabled: return None payload = self._build_payload( - species=species, - level=level, - xyz=xyz, - calculation_type=calculation_type, - calculation_quality=calculation_quality, - opt_result=opt_result, - freq_result=freq_result, - sp_result=sp_result, - arc_version=arc_version, - arc_git_commit=arc_git_commit, + output_doc=output_doc, + species_record=species_record, ) - species_label = getattr(species, "label", None) or "unlabeled" + species_label = species_record.get("label") or "unlabeled" conformer_label = extra_label or f"conf{conformer_index}" + project_label = self._config.project_label or output_doc.get("project") idempotency_inputs = IdempotencyInputs.from_payload( - project_label=self._config.project_label, + project_label=project_label, species_label=species_label, conformer_label=conformer_label, payload_kind=PAYLOAD_KIND, @@ -142,130 +221,635 @@ def submit_conformer( return self._upload(written, payload) + def submit_artifacts_for_calculation( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + calculation_id: int, + calculation_type: str, + file_path: str | Path, + kind: str = "output_log", + ) -> ArtifactUploadOutcome | None: + """Upload one local file as an artifact attached to a TCKDB calculation. + + ``output_doc`` and ``species_record`` mirror :meth:`submit_from_output`. 
+ ``file_path`` may be absolute or relative — relative paths are + resolved against ``project_directory`` (passed at construction). + ``kind`` selects the TCKDB ArtifactKind; defaults to + ``"output_log"`` for back-compat with v1 callers. + + Returns ``None`` only if the adapter itself is disabled. Otherwise + returns an :class:`ArtifactUploadOutcome` whose ``status`` is one + of ``uploaded`` / ``failed`` / ``skipped``. Skip reasons: + artifact upload disabled, kind not in config.kinds, kind not yet + implemented in ARC, file path missing, or file exceeds + ``max_size_mb``. + + ``calculation_type`` is recorded in the sidecar but does not feed + the URL — the endpoint takes the calc id directly. + """ + if not self._config.enabled: + return None + species_label = species_record.get("label") or "unlabeled" + artifact_cfg = self._config.artifacts + + if not artifact_cfg.upload: + return _skip(calculation_id, kind, "artifacts.upload is False") + if kind not in artifact_cfg.kinds: + return _skip(calculation_id, kind, f"kind {kind!r} not in config.kinds") + # Defensive check: config-parse warns about not-yet-implemented + # kinds but doesn't reject them. If a user lists e.g. 'checkpoint' + # and somehow a caller routes it here, skip cleanly rather than + # uploading bytes intended for a different code path. + if kind not in IMPLEMENTED_ARTIFACT_KINDS: + return _skip( + calculation_id, kind, + f"kind {kind!r} is server-accepted but ARC has no upload path yet", + ) + + resolved = self._resolve_local_path(file_path) + if resolved is None or not resolved.is_file(): + return _skip( + calculation_id, kind, f"file missing: {file_path!r}" + ) + + size_bytes = resolved.stat().st_size + max_bytes = artifact_cfg.max_size_mb * 1024 * 1024 + if size_bytes > max_bytes: + return _skip( + calculation_id, + kind, + f"file {resolved.name} is {size_bytes} bytes " + f"(>{artifact_cfg.max_size_mb} MB cap)", + ) + + with resolved.open("rb") as fh: + content = fh.read() + sha256 = hashlib.sha256(content).hexdigest() + content_b64 = base64.b64encode(content).decode("ascii") + + project_label = self._config.project_label or output_doc.get("project") + idempotency_inputs = ArtifactIdempotencyInputs( + project_label=project_label, + species_label=species_label, + calculation_id=calculation_id, + artifact_kind=kind, + artifact_sha256=sha256, + ) + idempotency_key = build_artifact_idempotency_key(idempotency_inputs) + endpoint = ARTIFACTS_ENDPOINT_TEMPLATE.format(calculation_id=calculation_id) + + written_artifact = self._writer.write_artifact_sidecar( + species_label=species_label, + calculation_id=calculation_id, + kind=kind, + filename=resolved.name, + sha256=sha256, + bytes_=size_bytes, + endpoint=endpoint, + idempotency_key=idempotency_key, + source_path=str(resolved), + base_url=self._config.base_url, + ) + + request_body = { + "artifacts": [ + { + "kind": kind, + "filename": resolved.name, + "content_base64": content_b64, + "sha256": sha256, + "bytes": size_bytes, + } + ] + } + return self._upload_artifact( + written=written_artifact, + request_body=request_body, + endpoint=endpoint, + ) + # ------------------------------------------------------------------ - # Payload construction + # Computed-species bundle path (POST /uploads/computed-species) # ------------------------------------------------------------------ - def _build_payload( + def submit_computed_species_from_output( self, *, - species, - level, - xyz, - calculation_type: str, - calculation_quality: str, - opt_result: Mapping[str, Any] | None, - 
freq_result: Mapping[str, Any] | None, - sp_result: Mapping[str, Any] | None, - arc_version: str | None, - arc_git_commit: str | None, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + conformer_index: int = 0, + extra_label: str | None = None, + ) -> UploadOutcome | None: + """Build, write, and (if configured) upload one computed-species bundle. + + Bundles species_entry + conformer geometry + opt/freq/sp + thermo + + (optionally) inline artifacts into a single + ``ComputedSpeciesUploadRequest`` and POSTs to + ``/uploads/computed-species``. Returns ``None`` if the adapter is + disabled, mirroring :meth:`submit_from_output`. + + Build failures (e.g. missing opt level, missing xyz) raise; the + caller in scheduler/processor is responsible for wrapping the + per-species call in a try/except so one bad species doesn't take + down the rest of the run — same shape as the conformer path. + """ + if not self._config.enabled: + return None + + species_label = species_record.get("label") or "unlabeled" + conformer_label = extra_label or f"conf{conformer_index}" + project_label = self._config.project_label or output_doc.get("project") + + payload = self._build_computed_species_payload( + output_doc=output_doc, + species_record=species_record, + conformer_key=conformer_label, + ) + + idempotency_inputs = IdempotencyInputs.from_payload( + project_label=project_label, + species_label=species_label, + conformer_label=conformer_label, + payload_kind=COMPUTED_SPECIES_KIND, + payload=payload, + ) + idempotency_key = build_idempotency_key(idempotency_inputs) + + written = self._writer.write( + label=f"{species_label}.{conformer_label}", + payload=payload, + endpoint=COMPUTED_SPECIES_ENDPOINT, + idempotency_key=idempotency_key, + payload_kind=COMPUTED_SPECIES_KIND, + base_url=self._config.base_url, + subdir=PayloadWriter.COMPUTED_SPECIES_SUBDIR, + ) + logger.info( + "TCKDB computed-species payload written: %s (key=%s)", + written.payload_path, + idempotency_key, + ) + + if not self._config.upload: + return self._finalize_skipped(written) + + return self._upload(written, payload, endpoint=COMPUTED_SPECIES_ENDPOINT) + + def _build_computed_species_payload( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + conformer_key: str, ) -> dict[str, Any]: - """Construct the JSON payload accepted by ``ConformerUploadRequest``. + """Compose one ComputedSpeciesUploadRequest dict. - Designed to fail soft on missing optional ARC attributes — many - ARC species objects in mid-flight don't carry every field. - Required-by-schema fields (smiles, charge, multiplicity, xyz_text, - method, software name) come from the species/level objects. + Reuses the existing per-calc and species-entry shapers; the only + bundle-specific surface is the conformer wrapper, dependency + edges (declared by local calc keys), inline artifacts, and the + optional thermo block. 
""" - species_entry = self._species_entry_payload(species) - geometry_payload = {"xyz_text": _coerce_xyz_text(xyz, species)} - calculation_payload = self._calculation_payload( - level=level, - calculation_type=calculation_type, - calculation_quality=calculation_quality, - opt_result=opt_result, - freq_result=freq_result, - sp_result=sp_result, - arc_version=arc_version, - arc_git_commit=arc_git_commit, + included_keys, conformer_block = self._build_conformer_block( + output_doc=output_doc, + species_record=species_record, + conformer_key=conformer_key, ) - payload: dict[str, Any] = { - "species_entry": species_entry, - "geometry": geometry_payload, - "calculation": calculation_payload, - "scientific_origin": "computed", + bundle: dict[str, Any] = { + "species_entry": self._species_entry_payload(species_record), + "conformers": [conformer_block], + } + + thermo_block = _build_thermo_block( + species_record.get("thermo"), + included_calc_keys=included_keys, + ) + if thermo_block is not None: + bundle["thermo"] = thermo_block + + # Workflow-tool release at bundle level (in addition to per-calc): + # mirrors what the conformer adapter records and lets the server + # tag the species_entry with the producer. + arc_version = output_doc.get("arc_version") + arc_git_commit = output_doc.get("arc_git_commit") + if arc_version or arc_git_commit: + wt: dict[str, Any] = {"name": "ARC"} + if arc_version: + wt["version"] = str(arc_version) + if arc_git_commit: + wt["git_commit"] = str(arc_git_commit) + bundle["workflow_tool_release"] = wt + + return bundle + + def _build_conformer_block( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + conformer_key: str, + ) -> tuple[list[str], dict[str, Any]]: + """Build one ConformerInBundle and return (included_calc_keys, block). + + The keys list is what's actually present in the bundle's calc + namespace, used to drive thermo's source_calculations links. 
+ """ + primary_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=_CALC_KEY_OPT, + calc_type="opt", + level_kind="opt", + ess_job_key="opt", + result_field="opt_result", + result_payload=_opt_result_payload(species_record), + depends_on=None, + tckdb_origin=None, + ) + + included: list[str] = [_CALC_KEY_OPT] + additional: list[dict[str, Any]] = [] + + freq_result = _freq_result_payload(species_record) + if freq_result is not None: + try: + additional.append(self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=_CALC_KEY_FREQ, + calc_type="freq", + level_kind="freq", + ess_job_key="freq", + result_field="freq_result", + result_payload=freq_result, + depends_on=[{"parent_calculation_key": _CALC_KEY_OPT, "role": "freq_on"}], + tckdb_origin=None, + )) + included.append(_CALC_KEY_FREQ) + except ValueError as exc: + logger.warning( + "TCKDB computed-species: freq calculation skipped for label=%s: %s", + species_record.get("label"), exc, + ) + + sp_result = _sp_result_payload(species_record) + if sp_result is not None: + try: + additional.append(self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=_CALC_KEY_SP, + calc_type="sp", + level_kind="sp", + ess_job_key="sp", + result_field="sp_result", + result_payload=sp_result, + depends_on=[{"parent_calculation_key": _CALC_KEY_OPT, "role": "single_point_on"}], + tckdb_origin=( + _reused_origin("opt") if _sp_is_reused_from_opt(output_doc) else None + ), + )) + included.append(_CALC_KEY_SP) + except ValueError as exc: + logger.warning( + "TCKDB computed-species: sp calculation skipped for label=%s: %s", + species_record.get("label"), exc, + ) + + block: dict[str, Any] = { + "key": conformer_key, + "geometry": {"xyz_text": _require_xyz_text(species_record)}, + "primary_calculation": primary_calc, + "additional_calculations": additional, } - label = getattr(species, "label", None) + label = species_record.get("label") if label: - payload["label"] = str(label)[:64] - return payload + block["label"] = str(label)[:64] + return included, block + + def _build_calc_in_bundle( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + calc_key: str, + calc_type: str, + level_kind: str, + ess_job_key: str, + result_field: str, + result_payload: Mapping[str, Any] | None, + depends_on: list[Mapping[str, Any]] | None, + tckdb_origin: Mapping[str, Any] | None, + ) -> dict[str, Any]: + """Build one CalculationInBundle dict. + + Reuses :meth:`_calculation_payload` for the level/software/result + plumbing, then layers on the bundle-specific fields: ``key``, + ``depends_on``, and inline ``artifacts``. + """ + level = _resolve_level(output_doc, level_kind) + calc = self._calculation_payload( + output_doc, species_record, + calc_type=calc_type, + level=level, + ess_job_key=ess_job_key, + result_field=result_field, + result_payload=result_payload, + tckdb_origin=tckdb_origin, + ) + calc["key"] = calc_key + if depends_on: + calc["depends_on"] = [dict(d) for d in depends_on] + artifacts = self._inline_artifacts_for_calc(species_record, calc_key=calc_key) + # Schema defaults `artifacts: []`. Emit explicitly only when we have + # bytes to send (or when artifact upload is enabled and we want to + # signal "no log available" with an empty list); omit otherwise. 
+ if artifacts: + calc["artifacts"] = artifacts + return calc + + def _inline_artifacts_for_calc( + self, + species_record: Mapping[str, Any], + *, + calc_key: str, + ) -> list[dict[str, Any]]: + """Return the inline artifact list for one calc within a bundle. + + Iterates ``_INLINE_ARTIFACT_SOURCES`` (currently ``output_log`` + and ``input``) and emits one ArtifactIn dict per kind whose + record path resolves to a real file on disk. Each kind is + independently gated on ``config.artifacts.kinds``, so a user + can opt into logs but not decks (or vice versa). + + Skip rules per (calc_key, kind): + - artifacts globally disabled → entire list = [] + - kind not in config.artifacts.kinds → that kind only + - record-field path missing / null → that kind only + - resolved file not on disk → that kind only + - file > artifacts.max_size_mb → that kind only (warn) + """ + artifact_cfg = self._config.artifacts + if not artifact_cfg.upload: + return [] + artifacts: list[dict[str, Any]] = [] + for kind, field_map in _INLINE_ARTIFACT_SOURCES: + if kind not in artifact_cfg.kinds: + continue + artifact = self._read_inline_artifact( + species_record, + calc_key=calc_key, + kind=kind, + record_field=field_map.get(calc_key), + ) + if artifact is not None: + artifacts.append(artifact) + return artifacts + + def _read_inline_artifact( + self, + species_record: Mapping[str, Any], + *, + calc_key: str, + kind: str, + record_field: str | None, + ) -> dict[str, Any] | None: + """Resolve, read, hash, and base64-encode one artifact for the bundle. + + Returns ``None`` (with a debug or warning log) on any of: + unknown calc_key, missing/null record path, file not on disk, or + file exceeding ``max_size_mb``. Otherwise returns the + ``ArtifactIn``-shaped dict ready to drop into ``calc.artifacts``. 
+ """ + if record_field is None: + return None + path_value = species_record.get(record_field) + if not path_value: + return None + resolved = self._resolve_local_path(path_value) + if resolved is None or not resolved.is_file(): + logger.debug( + "TCKDB computed-species: %s %s artifact missing on disk for %s (path=%s)", + calc_key, kind, species_record.get("label"), path_value, + ) + return None + size_bytes = resolved.stat().st_size + max_bytes = self._config.artifacts.max_size_mb * 1024 * 1024 + if size_bytes > max_bytes: + logger.warning( + "TCKDB computed-species: %s %s %s skipped (%s bytes > %s MB cap)", + calc_key, kind, resolved.name, size_bytes, + self._config.artifacts.max_size_mb, + ) + return None + with resolved.open("rb") as fh: + content = fh.read() + return { + "kind": kind, + "filename": resolved.name, + "content_base64": base64.b64encode(content).decode("ascii"), + "sha256": hashlib.sha256(content).hexdigest(), + "bytes": size_bytes, + } + + def _resolve_local_path(self, file_path: str | Path) -> Path | None: + """Resolve a local file path against project_directory if it's relative.""" + if file_path is None: + return None + path = Path(file_path) + if path.is_absolute(): + return path + if self._project_directory is not None: + return Path(self._project_directory) / path + return path.resolve() + + # ------------------------------------------------------------------ + # Payload construction + # ------------------------------------------------------------------ @staticmethod - def _species_entry_payload(species) -> dict[str, Any]: - smiles = _resolve_smiles(species) + def _species_entry_payload(record: Mapping[str, Any]) -> dict[str, Any]: + smiles = record.get("smiles") if not smiles: raise ValueError( - "TCKDB upload requires a SMILES on the species; " - f"got species.label={getattr(species, 'label', None)!r} with no resolvable SMILES." + f"output.yml record for label={record.get('label')!r} has no smiles; " + "TCKDB upload requires a SMILES on the species_entry." ) - is_ts = bool(getattr(species, "is_ts", False)) + is_ts = bool(record.get("is_ts")) return { "molecule_kind": "molecule", - "smiles": smiles, - "charge": int(getattr(species, "charge", 0) or 0), - "multiplicity": int(getattr(species, "multiplicity", 1) or 1), + "smiles": str(smiles), + "charge": int(record.get("charge", 0) or 0), + "multiplicity": int(record.get("multiplicity", 1) or 1), "species_entry_kind": "transition_state" if is_ts else "minimum", } + def _build_payload( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + ) -> dict[str, Any]: + species_entry = self._species_entry_payload(species_record) + geometry_payload = {"xyz_text": _require_xyz_text(species_record)} + primary, additional = self._build_calculations(output_doc, species_record) + + payload: dict[str, Any] = { + "species_entry": species_entry, + "geometry": geometry_payload, + "calculation": primary, + "scientific_origin": "computed", + } + if additional: + payload["additional_calculations"] = additional + label = species_record.get("label") + if label: + payload["label"] = str(label)[:64] + return payload + + @classmethod + def _build_calculations( + cls, + output_doc: Mapping[str, Any], + record: Mapping[str, Any], + ) -> tuple[dict[str, Any], list[dict[str, Any]]]: + """Return (primary opt calculation, [freq, sp] additional calculations). + + Additional calculations are skipped (with a warning) when their + result fields are absent or malformed, or when no level of theory + is available. 
Skipping an optional calc never fails the upload. + """ + primary = cls._calculation_payload( + output_doc, + record, + calc_type="opt", + level=_resolve_level(output_doc, "opt"), + ess_job_key="opt", + result_field="opt_result", + result_payload=_opt_result_payload(record), + ) + + additional: list[dict[str, Any]] = [] + freq_result = _freq_result_payload(record) + if freq_result is not None: + freq_level = _resolve_level(output_doc, "freq") + try: + additional.append( + cls._calculation_payload( + output_doc, + record, + calc_type="freq", + level=freq_level, + ess_job_key="freq", + result_field="freq_result", + result_payload=freq_result, + ) + ) + except ValueError as exc: + logger.warning( + "TCKDB freq additional calculation skipped for label=%s: %s", + record.get("label"), exc, + ) + + sp_result = _sp_result_payload(record) + if sp_result is not None: + sp_level = _resolve_level(output_doc, "sp") + sp_origin = _reused_origin("opt") if _sp_is_reused_from_opt(output_doc) else None + try: + additional.append( + cls._calculation_payload( + output_doc, + record, + calc_type="sp", + level=sp_level, + ess_job_key="sp", + result_field="sp_result", + result_payload=sp_result, + tckdb_origin=sp_origin, + ) + ) + except ValueError as exc: + logger.warning( + "TCKDB sp additional calculation skipped for label=%s: %s", + record.get("label"), exc, + ) + + return primary, additional + @staticmethod def _calculation_payload( + output_doc: Mapping[str, Any], + record: Mapping[str, Any], *, - level, - calculation_type: str, - calculation_quality: str, - opt_result: Mapping[str, Any] | None, - freq_result: Mapping[str, Any] | None, - sp_result: Mapping[str, Any] | None, - arc_version: str | None, - arc_git_commit: str | None, + calc_type: str, + level: Mapping[str, Any] | None, + ess_job_key: str, + result_field: str | None = None, + result_payload: Mapping[str, Any] | None = None, + tckdb_origin: Mapping[str, Any] | None = None, ) -> dict[str, Any]: - method = getattr(level, "method", None) if level is not None else None + if not isinstance(level, Mapping): + raise ValueError( + f"no level of theory available for {calc_type} calculation; " + "cannot build TCKDB calculation payload." + ) + method = level.get("method") if not method: - raise ValueError("TCKDB upload requires a level-of-theory method.") - software_name = getattr(level, "software", None) if level is not None else None + raise ValueError( + f"level of theory for {calc_type} is missing method; " + "cannot build TCKDB calculation payload." + ) + software_name = level.get("software") if not software_name: - raise ValueError("TCKDB upload requires the ESS name (level.software).") - - level_of_theory: dict[str, Any] = {"method": method} - for src, dst in ( - ("basis", "basis"), - ("auxiliary_basis", "aux_basis"), - ("cabs", "cabs_basis"), - ("dispersion", "dispersion"), - ("solvent", "solvent"), - ("solvation_method", "solvent_model"), - ): - value = getattr(level, src, None) - if value: - level_of_theory[dst] = value + raise ValueError( + f"level of theory for {calc_type} is missing software; " + "cannot identify the ESS for TCKDB." 
+ ) + + level_of_theory: dict[str, Any] = {"method": str(method)} + basis = level.get("basis") + if basis: + level_of_theory["basis"] = str(basis) - software_release: dict[str, Any] = {"name": software_name} - version = getattr(level, "software_version", None) - if version is not None: - software_release["version"] = str(version) + software_release: dict[str, Any] = {"name": str(software_name)} + ess_versions = record.get("ess_versions") + if isinstance(ess_versions, Mapping): + # ess_versions is keyed by job type ('opt', 'freq', 'sp', 'neb'), + # not by software name. Fall back to opt's version if the + # job-specific entry is missing (often the case for combined + # opt+freq runs or shared sp/freq logs). + ess_version = ess_versions.get(ess_job_key) or ess_versions.get("opt") + if ess_version: + software_release["version"] = str(ess_version) calc: dict[str, Any] = { - "type": calculation_type, - "quality": calculation_quality, + "type": calc_type, + "quality": "raw", "software_release": software_release, "level_of_theory": level_of_theory, } + arc_version = output_doc.get("arc_version") + arc_git_commit = output_doc.get("arc_git_commit") if arc_version or arc_git_commit: wt: dict[str, Any] = {"name": "ARC"} if arc_version: - wt["version"] = arc_version + wt["version"] = str(arc_version) if arc_git_commit: - wt["git_commit"] = arc_git_commit + wt["git_commit"] = str(arc_git_commit) calc["workflow_tool_release"] = wt - if calculation_type == "opt" and opt_result: - calc["opt_result"] = dict(opt_result) - if calculation_type == "freq" and freq_result: - calc["freq_result"] = dict(freq_result) - if calculation_type == "sp" and sp_result: - calc["sp_result"] = dict(sp_result) + if result_field and result_payload: + calc[result_field] = dict(result_payload) + + # tckdb_origin is qualifier metadata (e.g. "this SP row is reused + # from opt, not an independently executed ESS job"). It rides + # under parameters_json so server-side schema doesn't need to + # grow a column for an ARC-side concern. 
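+        # Illustrative resulting shape (keys mirror _reused_origin below;
+        # values are examples, not real data):
+        #   calc["parameters_json"] == {"tckdb_origin": {
+        #       "origin_kind": "reused_result",
+        #       "reused_from": {"calculation_type": "opt"},
+        #       "independent_ess_job": False, ...}}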
+ if tckdb_origin is not None: + calc["parameters_json"] = {"tckdb_origin": dict(tckdb_origin)} return calc @@ -285,9 +869,13 @@ def _finalize_skipped(self, written: WrittenPayload) -> UploadOutcome: idempotency_key=sc.idempotency_key, ) - def _upload(self, written: WrittenPayload, payload: dict[str, Any]) -> UploadOutcome: - from arc.tckdb.payload_writer import _utcnow_iso # local import keeps deps tidy - + def _upload( + self, + written: WrittenPayload, + payload: dict[str, Any], + *, + endpoint: str = CONFORMER_UPLOAD_ENDPOINT, + ) -> UploadOutcome: sc = written.sidecar api_key = self._config.resolve_api_key() if not api_key: @@ -305,30 +893,21 @@ def _upload(self, written: WrittenPayload, payload: dict[str, Any]) -> UploadOut try: response = client.request_json( "POST", - CONFORMER_UPLOAD_ENDPOINT, + endpoint, json=payload, idempotency_key=sc.idempotency_key, ) except Exception as exc: - client_close = getattr(client, "close", None) - if callable(client_close): - try: - client_close() - except Exception: # pragma: no cover - close errors swallowed - logger.debug("TCKDB client close errored after upload failure", exc_info=True) + _close_quietly(client, "after upload failure") return self._record_failure(written, str(exc), exc) else: - client_close = getattr(client, "close", None) - if callable(client_close): - try: - client_close() - except Exception: # pragma: no cover - close errors swallowed - logger.debug("TCKDB client close errored after upload success", exc_info=True) + _close_quietly(client, "after upload success") sc.status = "uploaded" sc.uploaded_at = _utcnow_iso() sc.response_status_code = getattr(response, "status_code", None) - sc.response_body = _summarize_response_body(getattr(response, "data", None)) + response_data = getattr(response, "data", None) + sc.response_body = _summarize_response_body(response_data) sc.idempotency_replayed = bool(getattr(response, "idempotency_replayed", False)) sc.last_error = None self._writer.update_sidecar(written.sidecar_path, sc) @@ -344,12 +923,15 @@ def _upload(self, written: WrittenPayload, payload: dict[str, Any]) -> UploadOut written.payload_path, sc.idempotency_key, ) + primary, additional = _extract_calc_refs(response_data) return UploadOutcome( status="uploaded", payload_path=written.payload_path, sidecar_path=written.sidecar_path, idempotency_key=sc.idempotency_key, response=sc.response_body, + primary_calculation=primary, + additional_calculations=additional, ) def _record_failure( @@ -379,67 +961,482 @@ def _record_failure( def _make_client(self, api_key: str): if self._client_factory is not None: return self._client_factory(self._config, api_key) - # Lazy import: keep tckdb_client out of the import path when the - # adapter is unused. - from tckdb_client import TCKDBClient - return TCKDBClient( self._config.base_url, api_key=api_key, timeout=self._config.timeout_seconds, ) + def _upload_artifact( + self, + *, + written: WrittenArtifact, + request_body: dict[str, Any], + endpoint: str, + ) -> ArtifactUploadOutcome: + sc = written.sidecar + api_key = self._config.resolve_api_key() + if not api_key: + msg = ( + f"TCKDB API key env var '{self._config.api_key_env}' is not set; " + "skipping artifact network call." 
+ ) + return self._record_artifact_failure(written, msg, ValueError(msg)) + + try: + client = self._make_client(api_key) + except Exception as exc: # pragma: no cover - defensive + return self._record_artifact_failure( + written, f"client init failed: {exc}", exc + ) + + try: + response = client.request_json( + "POST", + endpoint, + json=request_body, + idempotency_key=sc.idempotency_key, + ) + except Exception as exc: + _close_quietly(client, "after artifact upload failure") + return self._record_artifact_failure(written, str(exc), exc) + else: + _close_quietly(client, "after artifact upload success") + + sc.status = "uploaded" + sc.uploaded_at = _utcnow_iso() + sc.response_status_code = getattr(response, "status_code", None) + sc.response_body = _summarize_response_body(getattr(response, "data", None)) + sc.idempotency_replayed = bool(getattr(response, "idempotency_replayed", False)) + sc.last_error = None + self._writer.update_artifact_sidecar(written.sidecar_path, sc) + if sc.idempotency_replayed: + logger.info( + "TCKDB artifact upload replayed (idempotent): calc=%s kind=%s key=%s", + sc.calculation_id, sc.kind, sc.idempotency_key, + ) + else: + logger.info( + "TCKDB artifact upload succeeded: calc=%s kind=%s key=%s", + sc.calculation_id, sc.kind, sc.idempotency_key, + ) + return ArtifactUploadOutcome( + status="uploaded", + sidecar_path=written.sidecar_path, + idempotency_key=sc.idempotency_key, + calculation_id=sc.calculation_id, + kind=sc.kind, + response=sc.response_body, + ) + + def _record_artifact_failure( + self, + written: WrittenArtifact, + message: str, + raised: BaseException, + ) -> ArtifactUploadOutcome: + sc = written.sidecar + sc.status = "failed" + sc.last_error = message + self._writer.update_artifact_sidecar(written.sidecar_path, sc) + logger.warning( + "TCKDB artifact upload failed (strict=%s): calc=%s kind=%s err=%s", + self._config.strict, sc.calculation_id, sc.kind, message, + ) + if self._config.strict: + raise raised + return ArtifactUploadOutcome( + status="failed", + sidecar_path=written.sidecar_path, + idempotency_key=sc.idempotency_key, + calculation_id=sc.calculation_id, + kind=sc.kind, + error=message, + ) + # ---------------------------------------------------------------------- # Helpers # ---------------------------------------------------------------------- -def _resolve_smiles(species) -> str | None: - """Best-effort SMILES extraction from an ARC species or duck-typed stand-in.""" - smiles = getattr(species, "smiles", None) - if smiles: - return str(smiles) - mol = getattr(species, "mol", None) - if mol is None: +def _sp_is_reused_from_opt(output_doc: Mapping[str, Any]) -> bool: + """Whether ARC's SP energy is reused from the opt calculation. + + The signal is structural equality between ``sp_level`` and + ``opt_level``. Two flavors both count: + + - ``sp_level`` absent / null in output.yml — the common ARC case + where the user only declares ``opt_level=`` and ARC reuses the + opt energy as the SP energy. + - ``sp_level`` explicitly set to a dict structurally equal to + ``opt_level`` — same outcome, no separate ESS job. + + A ``sp_level`` that differs from ``opt_level`` (different method, + basis, software, etc.) means a real SP job was executed, so we + return False and the calculation gets no reused-result marker. 
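+
+    Illustrative ``output.yml`` fragments (values hypothetical)::
+
+        opt_level: {method: wb97xd, software: gaussian}
+        sp_level: null                                      # -> True (reused)
+        sp_level: {method: wb97xd, software: gaussian}      # -> True (reused)
+        sp_level: {method: ccsd(t)-f12a, software: molpro}  # -> False (real SP job)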
+ """ + opt_level = output_doc.get("opt_level") + if not isinstance(opt_level, Mapping): + return False + sp_level = output_doc.get("sp_level") + if sp_level is None: + return True + if not isinstance(sp_level, Mapping): + return False + return dict(sp_level) == dict(opt_level) + + +def _reused_origin(reused_from_calc_type: str) -> dict[str, Any]: + """Build the ``tckdb_origin`` payload for a reused-result calculation. + + Lives under ``parameters_json.tckdb_origin`` on the calculation row. + The DAG edge between calculations carries the relational link + (e.g. ``opt -> sp`` with role ``single_point_on``); this dict + carries the qualifier — *this* row's energy is reused, not freshly + computed — so downstream consumers can tell aggregate-from-opt SP + rows apart from independently executed SP jobs. + """ + return { + "origin_kind": "reused_result", + "reused_from": {"calculation_type": reused_from_calc_type}, + "reason": ( + f"sp_level equals {reused_from_calc_type}_level; " + f"{reused_from_calc_type} electronic energy reused as SP energy" + ), + "independent_ess_job": False, + "producer": "ARC", + } + + +def _resolve_level( + output_doc: Mapping[str, Any], job_kind: str +) -> Mapping[str, Any] | None: + """Return the level-of-theory dict for ``job_kind`` ('opt'/'freq'/'sp'). + + The opt level is always required by the primary calculation. For + freq/sp, ARC users very often run all three at the same level and + only declare ``opt_level=`` — ``output.yml`` then writes ``freq_level: + null`` / ``sp_level: null``. To avoid silently dropping the freq/sp + additional calculations in the common case, fall back to ``opt_level`` + when the job-specific level is absent. A present-but-distinct + ``freq_level`` / ``sp_level`` is treated as authoritative. + """ + if job_kind == "opt": + level = output_doc.get("opt_level") + return level if isinstance(level, Mapping) else None + job_level = output_doc.get(f"{job_kind}_level") + if isinstance(job_level, Mapping): + return job_level + opt_level = output_doc.get("opt_level") + return opt_level if isinstance(opt_level, Mapping) else None + + +def _opt_result_payload(record: Mapping[str, Any]) -> dict[str, Any] | None: + out: dict[str, Any] = {} + if record.get("opt_n_steps") is not None: + out["n_steps"] = record["opt_n_steps"] + if record.get("opt_final_energy_hartree") is not None: + out["final_energy_hartree"] = record["opt_final_energy_hartree"] + return out or None + + +_FREQ_FIELD_SPECS = ( + # (record_key, payload_key, coerce) + ("freq_n_imag", "n_imag", int), + ("imag_freq_cm1", "imag_freq_cm1", float), + ("zpe_hartree", "zpe_hartree", float), +) + + +def _freq_result_payload(record: Mapping[str, Any]) -> dict[str, Any] | None: + """Build a FreqResultPayload-shaped dict from an output.yml record. + + Returns ``None`` when no freq fields are populated. Returns ``None`` + and logs a warning when any present field cannot be coerced to the + expected numeric type — the task spec mandates skipping the whole + additional calculation rather than uploading a partial freq row. 
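+
+    Illustrative mapping (hypothetical values)::
+
+        {"freq_n_imag": 1, "imag_freq_cm1": -512.3, "zpe_hartree": 0.04}
+        -> {"n_imag": 1, "imag_freq_cm1": -512.3, "zpe_hartree": 0.04}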
+ """ + if all(record.get(rkey) is None for rkey, _, _ in _FREQ_FIELD_SPECS): return None - to_smiles = getattr(mol, "to_smiles", None) or getattr(mol, "smiles", None) - if callable(to_smiles): + out: dict[str, Any] = {} + for record_key, payload_key, coerce in _FREQ_FIELD_SPECS: + value = record.get(record_key) + if value is None: + continue try: - return str(to_smiles()) - except Exception: - logger.debug("mol.to_smiles() raised; falling back to None", exc_info=True) + out[payload_key] = coerce(value) + except (TypeError, ValueError) as exc: + logger.warning( + "TCKDB freq additional calculation skipped for label=%s: " + "malformed %s=%r (%s)", + record.get("label"), record_key, value, exc, + ) return None - if isinstance(to_smiles, str): - return to_smiles - return None - - -def _coerce_xyz_text(xyz, species) -> str: - """Accept xyz as a string or ARC xyz dict; fall back to species attrs.""" - candidate = xyz - if candidate is None: - candidate = ( - getattr(species, "final_xyz", None) - or getattr(species, "initial_xyz", None) - ) - if candidate is None: - raise ValueError("TCKDB upload requires xyz coordinates.") - if isinstance(candidate, str): - text = candidate.strip() - if not text: - raise ValueError("TCKDB upload requires non-empty xyz coordinates.") + return out or None + + +def _sp_result_payload(record: Mapping[str, Any]) -> dict[str, Any] | None: + # ``sp_energy_hartree`` is ARC's record key; ``electronic_energy_hartree`` + # is the TCKDB-side field name (some records may carry it directly). + record_key = "sp_energy_hartree" if record.get("sp_energy_hartree") is not None \ + else "electronic_energy_hartree" + energy = record.get(record_key) + if energy is None: + return None + try: + return {"electronic_energy_hartree": float(energy)} + except (TypeError, ValueError) as exc: + logger.warning( + "TCKDB sp additional calculation skipped for label=%s: " + "malformed %s=%r (%s)", + record.get("label"), record_key, energy, exc, + ) + return None + + +def _build_thermo_block( + thermo_record: Any, + *, + included_calc_keys: list[str], +) -> dict[str, Any] | None: + """Build a ThermoInBundle dict from ``output.yml`` thermo data. + + Returns ``None`` when no usable thermo content can be assembled. The + server-side ``ThermoInBundle.validate_has_scientific_content`` + rejects empty thermo blocks, so emitting one with nothing in it + would just produce a 422; better to omit at the producer. + + Mapping (from ``arc/output.py::_thermo_to_dict``): + h298_kj_mol → h298_kj_mol + s298_j_mol_k → s298_j_mol_k + tmin_k → tmin_k + tmax_k → tmax_k + nasa_low.coeffs → nasa.a1..a7 + nasa_high.coeffs → nasa.b1..b7 + nasa_low.tmin_k → nasa.t_low + nasa_low.tmax_k → nasa.t_mid (cross-checked vs nasa_high.tmin_k) + nasa_high.tmax_k → nasa.t_high + cp_data → points (per-point validation; bad points dropped) + + ``source_calculations`` are populated from ``included_calc_keys``: + each of ``opt`` / ``freq`` / ``sp`` that actually made it into the + bundle gets a link with the matching role, since + ``ThermoCalculationRole`` accepts those literals directly. 
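+
+    Illustrative minimal return value (numbers hypothetical; assumes the
+    freq and sp calcs made it into the bundle)::
+
+        {"h298_kj_mol": -235.1, "s298_j_mol_k": 282.6,
+         "source_calculations": [{"calculation_key": "freq", "role": "freq"},
+                                 {"calculation_key": "sp", "role": "sp"}]}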
+ """ + if not isinstance(thermo_record, Mapping): + return None + + block: dict[str, Any] = {} + + h298 = thermo_record.get("h298_kj_mol") + if h298 is not None: + try: + block["h298_kj_mol"] = float(h298) + except (TypeError, ValueError) as exc: + logger.warning("TCKDB thermo: malformed h298_kj_mol=%r (%s)", h298, exc) + + s298 = thermo_record.get("s298_j_mol_k") + if s298 is not None: + try: + block["s298_j_mol_k"] = float(s298) + except (TypeError, ValueError) as exc: + logger.warning("TCKDB thermo: malformed s298_j_mol_k=%r (%s)", s298, exc) + + for tk in ("tmin_k", "tmax_k"): + v = thermo_record.get(tk) + if v is not None: + try: + block[tk] = float(v) + except (TypeError, ValueError) as exc: + logger.warning("TCKDB thermo: malformed %s=%r (%s)", tk, v, exc) + + nasa = _build_nasa_block(thermo_record.get("nasa_low"), thermo_record.get("nasa_high")) + if nasa is not None: + block["nasa"] = nasa + + points = _build_thermo_points(thermo_record.get("cp_data")) + if points: + block["points"] = points + + # ThermoCalculationRole accepts opt/freq/sp/composite/imported. We + # link freq and sp when present (they're the calcs that physically + # produced the thermo); we also link opt if it's the only calc in + # the bundle (unusual, but covers thermo-from-opt-only edge cases). + sources: list[dict[str, str]] = [] + for key in (_CALC_KEY_FREQ, _CALC_KEY_SP): + if key in included_calc_keys: + sources.append({"calculation_key": key, "role": key}) + if not sources and _CALC_KEY_OPT in included_calc_keys: + sources.append({"calculation_key": _CALC_KEY_OPT, "role": _CALC_KEY_OPT}) + if sources: + block["source_calculations"] = sources + + has_scalar = "h298_kj_mol" in block or "s298_j_mol_k" in block + has_nasa = "nasa" in block + has_points = "points" in block + if not (has_scalar or has_nasa or has_points): + # Server would 422 us; nothing usable here. + return None + return block + + +def _build_nasa_block( + nasa_low: Any, nasa_high: Any +) -> dict[str, Any] | None: + """Map ARC's two NASA blocks to ``ThermoNASACreate``. + + Returns ``None`` if either block is missing or fails any of the + structural checks. Per spec, malformed NASA must skip the NASA block + only — scalar thermo and Cp points are kept by the caller. + """ + if not isinstance(nasa_low, Mapping) or not isinstance(nasa_high, Mapping): + return None + low_coeffs = nasa_low.get("coeffs") + high_coeffs = nasa_high.get("coeffs") + if not isinstance(low_coeffs, list) or len(low_coeffs) != 7: + logger.warning( + "TCKDB thermo: NASA block skipped — nasa_low.coeffs must be a list of 7 floats." + ) + return None + if not isinstance(high_coeffs, list) or len(high_coeffs) != 7: + logger.warning( + "TCKDB thermo: NASA block skipped — nasa_high.coeffs must be a list of 7 floats." + ) + return None + t_low = nasa_low.get("tmin_k") + t_mid_low = nasa_low.get("tmax_k") + t_mid_high = nasa_high.get("tmin_k") + t_high = nasa_high.get("tmax_k") + if None in (t_low, t_mid_low, t_mid_high, t_high): + logger.warning( + "TCKDB thermo: NASA block skipped — temperature bounds incomplete." 
+        )
+        return None
+    try:
+        t_low_f, t_mid_low_f, t_mid_high_f, t_high_f = (
+            float(t_low), float(t_mid_low), float(t_mid_high), float(t_high)
+        )
+    except (TypeError, ValueError) as exc:
+        logger.warning("TCKDB thermo: NASA block skipped — non-numeric bounds (%s).", exc)
+        return None
+    if t_mid_low_f != t_mid_high_f:
+        logger.warning(
+            "TCKDB thermo: NASA block skipped — nasa_low.tmax_k=%s != nasa_high.tmin_k=%s.",
+            t_mid_low_f, t_mid_high_f,
+        )
+        return None
+    try:
+        low_floats = [float(c) for c in low_coeffs]
+        high_floats = [float(c) for c in high_coeffs]
+    except (TypeError, ValueError) as exc:
+        logger.warning("TCKDB thermo: NASA block skipped — non-numeric coefficient (%s).", exc)
+        return None
+    block: dict[str, Any] = {
+        "t_low": t_low_f,
+        "t_mid": t_mid_low_f,
+        "t_high": t_high_f,
+    }
+    for i, c in enumerate(low_floats, start=1):
+        block[f"a{i}"] = c
+    for i, c in enumerate(high_floats, start=1):
+        block[f"b{i}"] = c
+    return block
+
+
+def _build_thermo_points(cp_data: Any) -> list[dict[str, Any]]:
+    """Map ARC's ``cp_data`` list to ``ThermoPointCreate`` dicts.
+
+    Each entry must carry ``temperature_k``; other fields are optional.
+    Malformed individual points (missing/non-numeric temperature, or a
+    Cp value that won't coerce) are dropped with a warning so a single
+    bad row doesn't take out the whole thermo upload.
+    """
+    if not isinstance(cp_data, list):
+        return []
+    seen_temps: set[float] = set()
+    points: list[dict[str, Any]] = []
+    for i, raw in enumerate(cp_data):
+        if not isinstance(raw, Mapping):
+            logger.warning("TCKDB thermo: cp_data[%d] skipped — not a mapping.", i)
+            continue
+        t = raw.get("temperature_k")
+        if t is None:
+            logger.warning("TCKDB thermo: cp_data[%d] skipped — missing temperature_k.", i)
+            continue
+        try:
+            t_f = float(t)
+        except (TypeError, ValueError) as exc:
+            logger.warning(
+                "TCKDB thermo: cp_data[%d] skipped — non-numeric temperature_k=%r (%s).",
+                i, t, exc,
+            )
+            continue
+        if t_f <= 0:
+            logger.warning(
+                "TCKDB thermo: cp_data[%d] skipped — temperature_k must be > 0 (got %s).",
+                i, t_f,
+            )
+            continue
+        if t_f in seen_temps:
+            # Server enforces uniqueness by temperature_k; skip duplicates here.
+            logger.warning(
+                "TCKDB thermo: cp_data[%d] skipped — duplicate temperature_k=%s.",
+                i, t_f,
+            )
+            continue
+        seen_temps.add(t_f)
+        point: dict[str, Any] = {"temperature_k": t_f}
+        for src_key, dst_key in (
+            ("cp_j_mol_k", "cp_j_mol_k"),
+            ("h_kj_mol", "h_kj_mol"),
+            ("s_j_mol_k", "s_j_mol_k"),
+            ("g_kj_mol", "g_kj_mol"),
+        ):
+            v = raw.get(src_key)
+            if v is None:
+                continue
+            try:
+                point[dst_key] = float(v)
+            except (TypeError, ValueError) as exc:
+                logger.warning(
+                    "TCKDB thermo: cp_data[%d].%s dropped — non-numeric %r (%s).",
+                    i, src_key, v, exc,
+                )
+        points.append(point)
+    return points
+
+
+def _require_xyz_text(record: Mapping[str, Any]) -> str:
+    """Pull the xyz string out of an output.yml species record.
+
+    Converts ARC's atom-only xyz (``"C 0.0 0.0 0.0\\nH 1.0 0.0 0.0"``,
+    emitted by ``xyz_to_str``) into the standard XYZ format that TCKDB
+    expects (``"<n_atoms>\\n<comment>\\n<atom lines>"``). If the input
+    already has an integer atom-count header, it is passed through
+    untouched. This is the format-translation boundary between ARC's
+    internal convention and the TCKDB schema.
+    """
+    xyz = record.get("xyz")
+    if not xyz:
+        raise ValueError(
+            f"output.yml record for label={record.get('label')!r} has no xyz; "
+            "cannot build geometry payload."
+ ) + text = str(xyz).strip() + if not text: + raise ValueError( + f"output.yml record for label={record.get('label')!r} has empty xyz." + ) + + lines = text.splitlines() + try: + int(lines[0].strip()) return text - # ARC xyz dict: {'symbols': (...), 'coords': ((x,y,z),...), ...} - symbols = candidate.get("symbols") if isinstance(candidate, Mapping) else None - coords = candidate.get("coords") if isinstance(candidate, Mapping) else None - if not symbols or not coords: - raise ValueError(f"Unrecognized xyz container: {type(candidate).__name__}") - lines = [ - f"{sym} {x:.8f} {y:.8f} {z:.8f}" - for sym, (x, y, z) in zip(symbols, coords) - ] - return "\n".join(lines) + except (ValueError, IndexError): + pass + label = record.get("label") or "" + return f"{len(lines)}\n{label}\n{text}" def _summarize_response_body(body: Any, *, max_chars: int = 2000) -> Any: @@ -454,4 +1451,87 @@ def _summarize_response_body(body: Any, *, max_chars: int = 2000) -> Any: return text -__all__ = ["TCKDBAdapter", "UploadOutcome", "CONFORMER_UPLOAD_ENDPOINT", "PAYLOAD_KIND"] +def _extract_calc_refs( + response_data: Any, +) -> tuple[dict[str, Any] | None, list[dict[str, Any]]]: + """Pull primary/additional CalculationUploadRef dicts from a server response. + + Two response shapes are recognized: + + 1. **Conformer upload** (``/uploads/conformers``): ``primary_calculation`` + and ``additional_calculations`` sit at the top level:: + + {"primary_calculation": {"calculation_id": ..., "type": ...}, + "additional_calculations": [...]} + + 2. **Computed-species bundle** (``/uploads/computed-species``): the + same fields are nested under ``conformers[0]``:: + + {"species_entry_id": ..., "conformers": [ + {"key": "...", "primary_calculation": {...}, + "additional_calculations": [...]}, ...]} + + Only the first conformer's refs are surfaced — today's ARC bundles + carry exactly one conformer per species, so multi-conformer + responses would need a richer outcome shape, which we'd add when + the producer side starts emitting them. + + Older server builds omit these fields entirely; the caller treats + their absence as "no artifact targets known". 
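+
+    Minimal sketches of the mapping (shapes follow the code below)::
+
+        >>> _extract_calc_refs({"primary_calculation": {"calculation_id": 10}})
+        ({'calculation_id': 10}, [])
+        >>> _extract_calc_refs(None)
+        (None, [])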
+ """ + if not isinstance(response_data, Mapping): + return None, [] + if "conformers" in response_data and isinstance(response_data["conformers"], list): + confs = response_data["conformers"] + if confs and isinstance(confs[0], Mapping): + return _extract_calc_refs(confs[0]) + return None, [] + primary = response_data.get("primary_calculation") + if not isinstance(primary, Mapping): + primary = None + else: + primary = dict(primary) + additional_raw = response_data.get("additional_calculations") or [] + additional = [dict(ref) for ref in additional_raw if isinstance(ref, Mapping)] + return primary, additional + + +def _skip( + calculation_id: int, kind: str, reason: str +) -> "ArtifactUploadOutcome": + """Build a skipped outcome and log the reason once.""" + logger.info( + "TCKDB artifact upload skipped: calc=%s kind=%s reason=%s", + calculation_id, kind, reason, + ) + return ArtifactUploadOutcome( + status="skipped", + sidecar_path=None, + idempotency_key=None, + calculation_id=calculation_id, + kind=kind, + skip_reason=reason, + ) + + +def _close_quietly(client: Any, context: str) -> None: + """Close a TCKDB client and swallow close errors with a debug log.""" + close = getattr(client, "close", None) + if not callable(close): + return + try: + close() + except Exception: # pragma: no cover - close errors swallowed + logger.debug("TCKDB client close errored %s", context, exc_info=True) + + +__all__ = [ + "ARTIFACTS_ENDPOINT_TEMPLATE", + "ArtifactUploadOutcome", + "COMPUTED_SPECIES_ENDPOINT", + "COMPUTED_SPECIES_KIND", + "CONFORMER_UPLOAD_ENDPOINT", + "PAYLOAD_KIND", + "TCKDBAdapter", + "UploadOutcome", +] diff --git a/arc/tckdb/adapter_test.py b/arc/tckdb/adapter_test.py index 4ead0b7c36..d12415cce6 100644 --- a/arc/tckdb/adapter_test.py +++ b/arc/tckdb/adapter_test.py @@ -9,19 +9,21 @@ import json import os +import pathlib import shutil import tempfile import unittest -from types import SimpleNamespace from unittest import mock from arc.tckdb.adapter import ( + ARTIFACTS_ENDPOINT_TEMPLATE, + ArtifactUploadOutcome, CONFORMER_UPLOAD_ENDPOINT, PAYLOAD_KIND, TCKDBAdapter, UploadOutcome, ) -from arc.tckdb.config import TCKDBConfig +from arc.tckdb.config import TCKDBArtifactConfig, TCKDBConfig # --------------------------------------------------------------------------- @@ -57,29 +59,35 @@ def close(self): self.closed = True -def _fake_species(label="ethanol", smiles="CCO", charge=0, multiplicity=1, is_ts=False): - return SimpleNamespace( - label=label, - smiles=smiles, - charge=charge, - multiplicity=multiplicity, - is_ts=is_ts, - final_xyz="C 0.0 0.0 0.0\nH 1.0 0.0 0.0", - ) +def _fake_record(label="ethanol", smiles="CCO", charge=0, multiplicity=1, is_ts=False): + """Mimics one entry from output.yml's `species:` (or `transition_states:`) list. + Note ``ess_versions`` is keyed by job type (``opt``/``freq``/``sp``/``neb``), + matching ``arc/output.py::_get_ess_versions``. 
+ """ + return { + "label": label, + "smiles": smiles, + "charge": charge, + "multiplicity": multiplicity, + "is_ts": is_ts, + "converged": True, + "xyz": "C 0.0 0.0 0.0\nH 1.0 0.0 0.0", + "opt_n_steps": 12, + "opt_final_energy_hartree": -154.123, + "ess_versions": {"opt": "Gaussian 16, Revision A.03"}, + } -def _fake_level(): - return SimpleNamespace( - method="wb97xd", - basis="def2-tzvp", - auxiliary_basis=None, - cabs=None, - dispersion=None, - solvation_method=None, - solvent=None, - software="gaussian", - software_version="16", - ) + +def _fake_output_doc(): + """Mimics the top-level fields of output.yml that the adapter reads.""" + return { + "schema_version": "1.0", + "project": "test_project", + "arc_version": "1.2.3", + "arc_git_commit": "deadbeef", + "opt_level": {"method": "wb97xd", "basis": "def2-tzvp", "software": "gaussian"}, + } # --------------------------------------------------------------------------- @@ -98,7 +106,7 @@ def test_disabled_returns_none_and_writes_nothing(self): cfg = TCKDBConfig(enabled=False, base_url="http://x", payload_dir=self.tmp) client = _StubClient(response=_StubResponse({"ok": True})) adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) self.assertIsNone(outcome) self.assertEqual(os.listdir(self.tmp), []) self.assertEqual(client.calls, []) @@ -126,7 +134,7 @@ def test_payload_written_before_failed_upload(self): client = _StubClient(raise_exc=RuntimeError("network down")) adapter = self._adapter(client) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) self.assertEqual(outcome.status, "failed") self.assertTrue(outcome.payload_path.exists()) @@ -142,7 +150,7 @@ def test_upload_success_updates_sidecar(self): client = _StubClient(response=_StubResponse({"conformer_observation_id": 42})) adapter = self._adapter(client) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) self.assertEqual(outcome.status, "uploaded") sc = json.loads(outcome.sidecar_path.read_text()) @@ -163,7 +171,7 @@ def test_upload_records_idempotency_replay(self): ) adapter = self._adapter(client) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) sc = json.loads(outcome.sidecar_path.read_text()) self.assertTrue(sc["idempotency_replayed"]) @@ -172,7 +180,7 @@ def test_payload_contains_no_db_ids(self): client = _StubClient(response=_StubResponse({"id": 1})) adapter = self._adapter(client) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) flat = outcome.payload_path.read_text() for forbidden in ( '"species_id"', @@ -190,16 +198,22 @@ def test_payload_validates_against_expected_shape(self): client = 
_StubClient(response=_StubResponse({"id": 1})) adapter = self._adapter(client) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) payload = json.loads(outcome.payload_path.read_text()) self.assertIn("species_entry", payload) self.assertIn("geometry", payload) self.assertIn("calculation", payload) self.assertEqual(payload["species_entry"]["smiles"], "CCO") - self.assertEqual(payload["geometry"]["xyz_text"], "C 0.0 0.0 0.0\nH 1.0 0.0 0.0") + self.assertEqual( + payload["geometry"]["xyz_text"], + "2\nethanol\nC 0.0 0.0 0.0\nH 1.0 0.0 0.0", + ) self.assertEqual(payload["calculation"]["type"], "opt") self.assertEqual(payload["calculation"]["software_release"]["name"], "gaussian") - self.assertEqual(payload["calculation"]["software_release"]["version"], "16") + self.assertEqual( + payload["calculation"]["software_release"]["version"], + "Gaussian 16, Revision A.03", + ) self.assertEqual(payload["calculation"]["level_of_theory"]["method"], "wb97xd") self.assertEqual(payload["calculation"]["level_of_theory"]["basis"], "def2-tzvp") @@ -218,7 +232,7 @@ def test_upload_skipped_writes_payload_no_call(self): ) client = _StubClient(response=_StubResponse({"ok": True})) adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) self.assertEqual(outcome.status, "skipped") self.assertTrue(outcome.payload_path.exists()) sc = json.loads(outcome.sidecar_path.read_text()) @@ -245,7 +259,7 @@ def test_strict_mode_raises_and_records(self): adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): with self.assertRaises(RuntimeError): - adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) # Sidecar still written, status failed files = os.listdir(os.path.join(self.tmp, "conformer_calculation")) sidecar = [f for f in files if f.endswith(".meta.json")][0] @@ -263,7 +277,7 @@ def test_non_strict_does_not_raise(self): client = _StubClient(raise_exc=RuntimeError("503")) adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) self.assertEqual(outcome.status, "failed") @@ -285,10 +299,11 @@ def test_idempotency_key_stable_and_distinct(self): client_a = _StubClient(response=_StubResponse({"id": 1})) adapter = TCKDBAdapter(self.cfg, client_factory=lambda c, k: client_a) with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): - o1 = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) - o2 = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) - o3 = adapter.submit_conformer( - species=_fake_species(label="methanol", smiles="CO"), level=_fake_level() + o1 = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) + o2 = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) + o3 = adapter.submit_from_output( + 
output_doc=_fake_output_doc(), + species_record=_fake_record(label="methanol", smiles="CO"), ) self.assertEqual(o1.idempotency_key, o2.idempotency_key) self.assertNotEqual(o1.idempotency_key, o3.idempotency_key) @@ -314,7 +329,7 @@ def test_missing_api_key_records_failure(self): client = _StubClient(response=_StubResponse({"ok": True})) adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) os.environ.pop("DEFINITELY_NOT_SET_X_X", None) - outcome = adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) self.assertEqual(outcome.status, "failed") self.assertEqual(client.calls, []) # never called the network sc = json.loads(outcome.sidecar_path.read_text()) @@ -332,7 +347,1091 @@ def test_missing_api_key_strict_raises(self): adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) os.environ.pop("DEFINITELY_NOT_SET_X_X", None) with self.assertRaises(ValueError): - adapter.submit_conformer(species=_fake_species(), level=_fake_level()) + adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) + + +# --------------------------------------------------------------------------- +# Artifact upload tests +# --------------------------------------------------------------------------- + + +_GAUSSIAN_LOG_HEADER = ( + b" Gaussian, Inc., Pittsburgh PA, All Rights Reserved.\n" + b" Cite this work as:\n Gaussian 16, Revision A.03\n" +) + + +class TestArtifactUpload(unittest.TestCase): + """Tests for TCKDBAdapter.submit_artifacts_for_calculation.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-art-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.project_dir = os.path.join(self.tmp, "project") + os.makedirs(self.project_dir) + self.payload_dir = "tckdb_payloads" # relative -> resolved under project_dir + self.log_path = os.path.join(self.project_dir, "calcs", "Species", "ethanol", "opt", "output.log") + os.makedirs(os.path.dirname(self.log_path)) + with open(self.log_path, "wb") as fh: + fh.write(_GAUSSIAN_LOG_HEADER) + + def _cfg(self, *, artifacts=None, strict=False, api_key_env="X_TCKDB_API_KEY"): + return TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.payload_dir, + api_key_env=api_key_env, + project_label="proj-A", + strict=strict, + artifacts=artifacts or TCKDBArtifactConfig(upload=True, kinds=("output_log",)), + ) + + def _adapter(self, client, cfg=None): + return TCKDBAdapter( + cfg or self._cfg(), + project_directory=self.project_dir, + client_factory=lambda c, k: client, + ) + + def _submit( + self, + adapter, + *, + log_path=None, + calculation_id=42, + calculation_type="opt", + kind="output_log", + ): + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + return adapter.submit_artifacts_for_calculation( + output_doc=_fake_output_doc(), + species_record=_fake_record(), + calculation_id=calculation_id, + calculation_type=calculation_type, + file_path=log_path or self.log_path, + kind=kind, + ) + + def test_disabled_artifact_upload_returns_skipped_no_call(self): + cfg = self._cfg(artifacts=TCKDBArtifactConfig(upload=False)) + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client, cfg=cfg) + outcome = self._submit(adapter) + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + self.assertIn("artifacts.upload", outcome.skip_reason) + + def 
test_kind_not_in_config_returns_skipped(self): + cfg = self._cfg(artifacts=TCKDBArtifactConfig(upload=True, kinds=("input",))) + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client, cfg=cfg) + outcome = self._submit(adapter) + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + self.assertIn("not in config.kinds", outcome.skip_reason) + + def test_missing_log_file_returns_skipped(self): + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client) + outcome = self._submit(adapter, log_path=os.path.join(self.project_dir, "no_such_log.log")) + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + self.assertIn("file missing", outcome.skip_reason) + + def test_oversized_log_returns_skipped(self): + cfg = self._cfg( + artifacts=TCKDBArtifactConfig(upload=True, kinds=("output_log",), max_size_mb=1) + ) + # Write a 2 MB file + big_log = os.path.join(self.project_dir, "big.log") + with open(big_log, "wb") as fh: + fh.write(b"\x00" * (2 * 1024 * 1024)) + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client, cfg=cfg) + outcome = self._submit(adapter, log_path=big_log) + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + self.assertIn(">1 MB cap", outcome.skip_reason) + + def test_successful_upload_writes_sidecar_and_posts(self): + client = _StubClient(response=_StubResponse( + {"calculation_id": 42, "artifacts": [{"id": 7}]}, + status_code=201, + )) + adapter = self._adapter(client) + outcome = self._submit(adapter) + self.assertEqual(outcome.status, "uploaded") + self.assertEqual(outcome.calculation_id, 42) + self.assertEqual(outcome.kind, "output_log") + self.assertEqual(len(client.calls), 1) + call = client.calls[0] + self.assertEqual(call["method"], "POST") + self.assertEqual(call["path"], "/calculations/42/artifacts") + # Body shape + artifacts = call["json"]["artifacts"] + self.assertEqual(len(artifacts), 1) + a = artifacts[0] + self.assertEqual(a["kind"], "output_log") + self.assertEqual(a["filename"], "output.log") + self.assertEqual(a["bytes"], len(_GAUSSIAN_LOG_HEADER)) + self.assertEqual(len(a["sha256"]), 64) + self.assertRegex(a["sha256"], r"^[0-9a-f]{64}$") + self.assertTrue(a["content_base64"]) + # Sidecar + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["status"], "uploaded") + self.assertEqual(sc["calculation_id"], 42) + self.assertEqual(sc["kind"], "output_log") + self.assertEqual(sc["bytes"], len(_GAUSSIAN_LOG_HEADER)) + + def test_strict_failure_raises(self): + cfg = self._cfg(strict=True) + client = _StubClient(raise_exc=RuntimeError("422 ESS signature missing")) + adapter = self._adapter(client, cfg=cfg) + with self.assertRaises(RuntimeError): + self._submit(adapter) + # Sidecar still recorded as failed + sidecar_files = [ + f for f in os.listdir(os.path.join(self.project_dir, self.payload_dir, "calculation_artifacts")) + if f.endswith(".artifact.meta.json") + ] + self.assertEqual(len(sidecar_files), 1) + sc = json.loads(open(os.path.join(self.project_dir, self.payload_dir, + "calculation_artifacts", sidecar_files[0])).read()) + self.assertEqual(sc["status"], "failed") + self.assertIn("422", sc["last_error"]) + + def test_non_strict_failure_returns_outcome(self): + client = _StubClient(raise_exc=RuntimeError("422 ESS signature missing")) + adapter = self._adapter(client) + outcome = self._submit(adapter) + 
self.assertEqual(outcome.status, "failed") + self.assertIn("422", outcome.error) + # Sidecar marks failure + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["status"], "failed") + + def test_idempotency_replay_recorded(self): + client = _StubClient(response=_StubResponse( + {"calculation_id": 42, "artifacts": [{"id": 7}]}, + status_code=201, + replayed=True, + )) + adapter = self._adapter(client) + outcome = self._submit(adapter) + self.assertEqual(outcome.status, "uploaded") + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertTrue(sc["idempotency_replayed"]) + + def test_idempotency_key_stable_for_same_inputs(self): + client = _StubClient(response=_StubResponse( + {"calculation_id": 42, "artifacts": []}, status_code=201 + )) + adapter = self._adapter(client) + o1 = self._submit(adapter) + o2 = self._submit(adapter) + self.assertEqual(o1.idempotency_key, o2.idempotency_key) + # Both calls sent the same key header + keys = {c["idempotency_key"] for c in client.calls} + self.assertEqual(keys, {o1.idempotency_key}) + + def test_idempotency_key_distinct_for_different_calc(self): + client = _StubClient(response=_StubResponse( + {"calculation_id": 99, "artifacts": []}, status_code=201 + )) + adapter = self._adapter(client) + o1 = self._submit(adapter, calculation_id=42) + o2 = self._submit(adapter, calculation_id=99) + self.assertNotEqual(o1.idempotency_key, o2.idempotency_key) + + def test_endpoint_template_matches_artifact_request(self): + client = _StubClient(response=_StubResponse( + {"calculation_id": 314, "artifacts": []}, status_code=201 + )) + adapter = self._adapter(client) + outcome = self._submit(adapter, calculation_id=314) + self.assertEqual(client.calls[0]["path"], ARTIFACTS_ENDPOINT_TEMPLATE.format(calculation_id=314)) + self.assertEqual(outcome.status, "uploaded") + + def test_input_kind_uploads_with_correct_marshalling(self): + # Input deck file (input.gjf) gets uploaded with kind="input". + # Use a config that allows both kinds, then upload an input file. + cfg = self._cfg( + artifacts=TCKDBArtifactConfig( + upload=True, + kinds=("output_log", "input"), + ) + ) + input_path = os.path.join(self.project_dir, "calcs", "Species", "ethanol", "opt", "input.gjf") + with open(input_path, "wb") as fh: + fh.write(b"%mem=42GB\n# wb97xd/def2tzvp opt\n") + client = _StubClient(response=_StubResponse( + {"calculation_id": 42, "artifacts": []}, status_code=201 + )) + adapter = self._adapter(client, cfg=cfg) + outcome = self._submit(adapter, log_path=input_path, kind="input") + self.assertEqual(outcome.status, "uploaded") + self.assertEqual(outcome.kind, "input") + self.assertEqual(client.calls[0]["json"]["artifacts"][0]["kind"], "input") + self.assertEqual(client.calls[0]["json"]["artifacts"][0]["filename"], "input.gjf") + + def test_unimplemented_kind_defensive_skip(self): + # Even if the user somehow gets `checkpoint` into config.kinds + # AND a caller passes kind="checkpoint", the adapter refuses + # rather than uploading bytes that may not match the kind's + # semantic contract. 
+ cfg = self._cfg( + artifacts=TCKDBArtifactConfig( + upload=True, + kinds=("output_log", "checkpoint"), # parse-time warning only + ) + ) + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client, cfg=cfg) + outcome = self._submit(adapter, kind="checkpoint") + self.assertEqual(outcome.status, "skipped") + self.assertIn("no upload path yet", outcome.skip_reason) + self.assertEqual(client.calls, []) + + +class TestUploadOutcomeCalcRefs(unittest.TestCase): + """Conformer upload exposes primary/additional calc refs from response.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + + def test_calc_refs_extracted_from_response(self): + cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + ) + client = _StubClient(response=_StubResponse({ + "id": 1, + "primary_calculation": {"calculation_id": 10, "type": "opt", "request_index": None}, + "additional_calculations": [ + {"calculation_id": 11, "type": "freq", "request_index": 0}, + {"calculation_id": 12, "type": "sp", "request_index": 1}, + ], + })) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) + self.assertEqual(outcome.status, "uploaded") + self.assertEqual(outcome.primary_calculation["calculation_id"], 10) + self.assertEqual(outcome.primary_calculation["type"], "opt") + self.assertEqual(len(outcome.additional_calculations), 2) + self.assertEqual(outcome.additional_calculations[0]["calculation_id"], 11) + self.assertEqual(outcome.additional_calculations[1]["type"], "sp") + + def test_calc_refs_default_when_response_omits_them(self): + cfg = TCKDBConfig( + enabled=True, base_url="http://x", payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + ) + client = _StubClient(response=_StubResponse({"id": 1})) + adapter = TCKDBAdapter(cfg, client_factory=lambda c, k: client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_from_output(output_doc=_fake_output_doc(), species_record=_fake_record()) + self.assertIsNone(outcome.primary_calculation) + self.assertEqual(outcome.additional_calculations, []) + + +class TestAdditionalCalculations(unittest.TestCase): + """opt/freq/sp chain: payload must carry additional_calculations for freq+sp.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + ) + + def _adapter(self, client): + return TCKDBAdapter(self.cfg, client_factory=lambda c, k: client) + + def _submit(self, *, output_doc, record): + client = _StubClient(response=_StubResponse({"id": 1})) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_from_output(output_doc=output_doc, species_record=record) + return outcome, client, json.loads(outcome.payload_path.read_text()) + + def test_opt_only_record_has_no_additional_calculations(self): + """1. 
opt-only record produces no additional_calculations key.""" + record = _fake_record() # no freq_*, no sp_energy_hartree + _, _, payload = self._submit(output_doc=_fake_output_doc(), record=record) + self.assertNotIn("additional_calculations", payload) + + def test_opt_plus_freq_record_yields_one_freq_additional(self): + """2. opt+freq record produces one freq additional calculation.""" + record = _fake_record() + record["freq_n_imag"] = 0 + record["zpe_hartree"] = 0.024131 + _, _, payload = self._submit(output_doc=_fake_output_doc(), record=record) + additional = payload["additional_calculations"] + self.assertEqual(len(additional), 1) + freq = additional[0] + self.assertEqual(freq["type"], "freq") + self.assertEqual(freq["freq_result"]["n_imag"], 0) + self.assertAlmostEqual(freq["freq_result"]["zpe_hartree"], 0.024131) + self.assertNotIn("imag_freq_cm1", freq["freq_result"]) + # No sp data → no sp calc + self.assertEqual([c["type"] for c in additional], ["freq"]) + + def test_opt_plus_sp_record_yields_one_sp_additional(self): + """3. opt+sp record produces one sp additional calculation.""" + record = _fake_record() + record["sp_energy_hartree"] = -154.987 + _, _, payload = self._submit(output_doc=_fake_output_doc(), record=record) + additional = payload["additional_calculations"] + self.assertEqual(len(additional), 1) + sp = additional[0] + self.assertEqual(sp["type"], "sp") + self.assertAlmostEqual(sp["sp_result"]["electronic_energy_hartree"], -154.987) + + def test_opt_freq_sp_record_yields_two_additional_calculations(self): + """4. opt+freq+sp record produces two additional calculations.""" + record = _fake_record() + record["freq_n_imag"] = 1 + record["imag_freq_cm1"] = -512.3 + record["zpe_hartree"] = 0.0399 + record["sp_energy_hartree"] = -155.111 + _, _, payload = self._submit(output_doc=_fake_output_doc(), record=record) + additional = payload["additional_calculations"] + self.assertEqual([c["type"] for c in additional], ["freq", "sp"]) + freq, sp = additional + self.assertEqual(freq["freq_result"]["n_imag"], 1) + self.assertAlmostEqual(freq["freq_result"]["imag_freq_cm1"], -512.3) + self.assertAlmostEqual(freq["freq_result"]["zpe_hartree"], 0.0399) + self.assertAlmostEqual(sp["sp_result"]["electronic_energy_hartree"], -155.111) + + def test_distinct_levels_preserved_per_calculation(self): + """5. 
freq/sp use their own level-of-theory when output_doc has them.""" + doc = _fake_output_doc() + doc["freq_level"] = {"method": "wb97xd", "basis": "6-31g*", "software": "gaussian"} + doc["sp_level"] = {"method": "ccsd(t)-f12a", "basis": "cc-pvtz-f12", "software": "molpro"} + record = _fake_record() + record["freq_n_imag"] = 0 + record["zpe_hartree"] = 0.024131 + record["sp_energy_hartree"] = -154.987 + # Job-type-keyed ess_versions: freq from gaussian, sp from molpro + record["ess_versions"] = { + "opt": "Gaussian 16, Revision A.03", + "freq": "Gaussian 16, Revision A.03", + "sp": "Molpro 2022.3", + } + _, _, payload = self._submit(output_doc=doc, record=record) + primary = payload["calculation"] + freq, sp = payload["additional_calculations"] + # opt uses opt_level + ess_versions['opt'] + self.assertEqual(primary["level_of_theory"]["method"], "wb97xd") + self.assertEqual(primary["level_of_theory"]["basis"], "def2-tzvp") + self.assertEqual(primary["software_release"]["version"], "Gaussian 16, Revision A.03") + # freq uses freq_level (distinct basis) + ess_versions['freq'] + self.assertEqual(freq["level_of_theory"]["method"], "wb97xd") + self.assertEqual(freq["level_of_theory"]["basis"], "6-31g*") + self.assertEqual(freq["software_release"]["name"], "gaussian") + self.assertEqual(freq["software_release"]["version"], "Gaussian 16, Revision A.03") + # sp uses sp_level (different method+software) + ess_versions['sp'] + self.assertEqual(sp["level_of_theory"]["method"], "ccsd(t)-f12a") + self.assertEqual(sp["level_of_theory"]["basis"], "cc-pvtz-f12") + self.assertEqual(sp["software_release"]["name"], "molpro") + self.assertEqual(sp["software_release"]["version"], "Molpro 2022.3") + + def test_freq_sp_levels_fall_back_to_opt_level_when_missing(self): + """Option B: missing freq_level/sp_level falls back to opt_level.""" + doc = _fake_output_doc() + # freq_level / sp_level absent (the common ARC case) + record = _fake_record() + record["freq_n_imag"] = 0 + record["sp_energy_hartree"] = -154.5 + _, _, payload = self._submit(output_doc=doc, record=record) + freq, sp = payload["additional_calculations"] + self.assertEqual(freq["level_of_theory"]["method"], "wb97xd") + self.assertEqual(freq["level_of_theory"]["basis"], "def2-tzvp") + self.assertEqual(sp["level_of_theory"]["method"], "wb97xd") + # ess_versions has only 'opt' → both freq and sp fall back to that + self.assertEqual(freq["software_release"]["version"], "Gaussian 16, Revision A.03") + self.assertEqual(sp["software_release"]["version"], "Gaussian 16, Revision A.03") + + def test_ess_versions_uses_job_type_key_not_software_name(self): + """7. ess_versions lookup must use job-type keys ('opt'/'freq'/'sp').""" + # Record with job-type-keyed ess_versions (matches arc/output.py). + # If the adapter were still using software-name keys, version would be missing. 
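+        # Hypothetical wrong shape {"gaussian": "16.A03"} (software-name keyed)
+        # would resolve no version at all; the job-type-keyed dict below must.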
+ record = _fake_record() + record["freq_n_imag"] = 0 + record["sp_energy_hartree"] = -154.5 + record["ess_versions"] = { + "opt": "OPT_VER", + "freq": "FREQ_VER", + "sp": "SP_VER", + } + _, _, payload = self._submit(output_doc=_fake_output_doc(), record=record) + self.assertEqual(payload["calculation"]["software_release"]["version"], "OPT_VER") + freq, sp = payload["additional_calculations"] + self.assertEqual(freq["software_release"]["version"], "FREQ_VER") + self.assertEqual(sp["software_release"]["version"], "SP_VER") + + def test_idempotency_key_changes_when_freq_sp_added(self): + """Adding freq/sp to a previously opt-only record must change the idempotency key.""" + record_opt_only = _fake_record() + record_full = _fake_record() + record_full["freq_n_imag"] = 0 + record_full["zpe_hartree"] = 0.024 + record_full["sp_energy_hartree"] = -154.5 + o1, _, _ = self._submit(output_doc=_fake_output_doc(), record=record_opt_only) + o2, _, _ = self._submit(output_doc=_fake_output_doc(), record=record_full) + self.assertNotEqual(o1.idempotency_key, o2.idempotency_key) + + # ------------------------------------------------------------------ + # SP origin metadata: reused-from-opt vs independently executed. + # ------------------------------------------------------------------ + + def test_sp_distinct_level_has_no_reused_marker(self): + """sp_level differing from opt_level → no tckdb_origin marker on SP.""" + doc = _fake_output_doc() + doc["sp_level"] = {"method": "ccsd(t)-f12a", "basis": "cc-pvtz-f12", "software": "molpro"} + record = _fake_record() + record["sp_energy_hartree"] = -154.987 + record["ess_versions"] = {"opt": "Gaussian 16, Revision A.03", "sp": "Molpro 2022.3"} + _, _, payload = self._submit(output_doc=doc, record=record) + sp = next(c for c in payload["additional_calculations"] if c["type"] == "sp") + self.assertNotIn( + "parameters_json", sp, + msg="distinct sp_level must not produce a tckdb_origin marker", + ) + + def test_sp_equal_level_marks_reused(self): + """sp_level structurally equal to opt_level → SP carries tckdb_origin=reused_result.""" + doc = _fake_output_doc() + # Same dict contents as opt_level — explicit duplication, not just absence. 
+ doc["sp_level"] = dict(doc["opt_level"]) + record = _fake_record() + record["sp_energy_hartree"] = -154.123 + _, _, payload = self._submit(output_doc=doc, record=record) + sp = next(c for c in payload["additional_calculations"] if c["type"] == "sp") + origin = sp["parameters_json"]["tckdb_origin"] + self.assertEqual(origin["origin_kind"], "reused_result") + self.assertFalse(origin["independent_ess_job"]) + self.assertEqual(origin["reused_from"]["calculation_type"], "opt") + self.assertEqual(origin["producer"], "ARC") + + def test_sp_missing_level_falls_back_and_marks_reused(self): + """Missing sp_level (the common ARC case) is treated as equal-to-opt → reused.""" + doc = _fake_output_doc() # no sp_level key + record = _fake_record() + record["sp_energy_hartree"] = -154.5 + _, _, payload = self._submit(output_doc=doc, record=record) + sp = next(c for c in payload["additional_calculations"] if c["type"] == "sp") + origin = sp["parameters_json"]["tckdb_origin"] + self.assertEqual(origin["origin_kind"], "reused_result") + self.assertEqual(origin["reused_from"]["calculation_type"], "opt") + self.assertFalse(origin["independent_ess_job"]) + + def test_opt_freq_reused_sp_keeps_per_calc_invariants(self): + """opt + freq + reused SP: primary unchanged, each row has only its own result.""" + doc = _fake_output_doc() # only opt_level → sp falls back, freq falls back + record = _fake_record() + record["freq_n_imag"] = 0 + record["zpe_hartree"] = 0.024131 + record["sp_energy_hartree"] = -154.5 + _, _, payload = self._submit(output_doc=doc, record=record) + + # Primary opt unchanged: still type=opt, has opt_result, no parameters_json. + primary = payload["calculation"] + self.assertEqual(primary["type"], "opt") + self.assertIn("opt_result", primary) + self.assertNotIn("freq_result", primary) + self.assertNotIn("sp_result", primary) + self.assertNotIn("parameters_json", primary) + + freq = next(c for c in payload["additional_calculations"] if c["type"] == "freq") + sp = next(c for c in payload["additional_calculations"] if c["type"] == "sp") + + # freq has only freq_result and no origin marker. + self.assertIn("freq_result", freq) + self.assertNotIn("sp_result", freq) + self.assertNotIn("opt_result", freq) + self.assertNotIn("parameters_json", freq) + + # sp has only sp_result and the reused-result marker. + self.assertIn("sp_result", sp) + self.assertNotIn("freq_result", sp) + self.assertNotIn("opt_result", sp) + self.assertEqual( + sp["parameters_json"]["tckdb_origin"]["origin_kind"], "reused_result" + ) + + def test_no_sp_energy_no_sp_additional_calculation(self): + """No sp_energy_hartree on the record → no SP row, regardless of sp_level.""" + doc = _fake_output_doc() + doc["sp_level"] = {"method": "ccsd(t)-f12a", "basis": "cc-pvtz-f12", "software": "molpro"} + record = _fake_record() # no sp_energy_hartree + _, _, payload = self._submit(output_doc=doc, record=record) + types = [c["type"] for c in payload.get("additional_calculations", [])] + self.assertNotIn("sp", types) + + +class TestMalformedAdditionalCalcFields(unittest.TestCase): + """6. 
Malformed optional freq/sp fields skip the calc with a warning.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + ) + + def _record_with_opt_freq_sp(self): + record = _fake_record() + record["freq_n_imag"] = 0 + record["zpe_hartree"] = 0.0399 + record["sp_energy_hartree"] = -154.5 + return record + + def _submit(self, record): + client = _StubClient(response=_StubResponse({"id": 1})) + adapter = TCKDBAdapter(self.cfg, client_factory=lambda c, k: client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_from_output( + output_doc=_fake_output_doc(), species_record=record + ) + return outcome, json.loads(outcome.payload_path.read_text()) + + def test_malformed_freq_field_skips_freq_but_keeps_opt_and_sp(self): + """Malformed freq value drops the freq calc only; opt + sp survive.""" + record = self._record_with_opt_freq_sp() + record["zpe_hartree"] = "not_a_number" + + with self.assertLogs("arc", level="WARNING") as logs: + outcome, payload = self._submit(record) + + self.assertEqual(outcome.status, "uploaded") + # Primary opt calculation present and intact + self.assertEqual(payload["calculation"]["type"], "opt") + self.assertIn("opt_result", payload["calculation"]) + # freq dropped, sp preserved + additional = payload.get("additional_calculations", []) + types = [c["type"] for c in additional] + self.assertNotIn("freq", types) + self.assertIn("sp", types) + # Warning names the calc type and the offending field + self.assertTrue( + any("freq" in m and "zpe_hartree" in m for m in logs.output), + msg=f"expected warning naming freq/zpe_hartree; got {logs.output}", + ) + + def test_malformed_sp_field_skips_sp_but_keeps_opt_and_freq(self): + """Malformed sp value drops the sp calc only; opt + freq survive.""" + record = self._record_with_opt_freq_sp() + record["sp_energy_hartree"] = "not-a-float" + + with self.assertLogs("arc", level="WARNING") as logs: + outcome, payload = self._submit(record) + + self.assertEqual(outcome.status, "uploaded") + self.assertEqual(payload["calculation"]["type"], "opt") + self.assertIn("opt_result", payload["calculation"]) + additional = payload.get("additional_calculations", []) + types = [c["type"] for c in additional] + self.assertIn("freq", types) + self.assertNotIn("sp", types) + self.assertTrue( + any("sp" in m and "sp_energy_hartree" in m for m in logs.output), + msg=f"expected warning naming sp/sp_energy_hartree; got {logs.output}", + ) + + def test_both_malformed_yields_two_warnings_and_no_additional_calcs(self): + """Malformed freq AND sp: two warnings, payload has only opt.""" + record = self._record_with_opt_freq_sp() + record["zpe_hartree"] = "not_a_number" + record["sp_energy_hartree"] = "not-a-float" + + with self.assertLogs("arc", level="WARNING") as logs: + outcome, payload = self._submit(record) + + self.assertEqual(outcome.status, "uploaded") + self.assertEqual(payload["calculation"]["type"], "opt") + # Either no key, or empty list — both are "no additional calcs" + self.assertEqual(payload.get("additional_calculations", []), []) + + freq_warnings = [m for m in logs.output if "freq" in m and "zpe_hartree" in m] + sp_warnings = [m for m in logs.output if "sp" in m and "sp_energy_hartree" in m] + self.assertEqual( + len(freq_warnings), 1, + msg=f"expected 
exactly one freq warning; got {logs.output}",
+        )
+        self.assertEqual(
+            len(sp_warnings), 1,
+            msg=f"expected exactly one sp warning; got {logs.output}",
+        )
+
+
+# ---------------------------------------------------------------------------
+# Computed-species bundle (POST /uploads/computed-species)
+# ---------------------------------------------------------------------------
+
+
+_FORBIDDEN_BUNDLE_ID_FIELDS = {
+    "existing_calculation_id",
+    "existing_conformer_id",
+    "existing_conformer_observation_id",
+    "existing_species_entry_id",
+    "source_calculation_id",
+    "source_conformer_observation_id",
+}
+
+
+def _walk_for_keys(value, keys):
+    """Yield every key found anywhere in a JSON-like value that is a member of ``keys``."""
+    if isinstance(value, dict):
+        for k, v in value.items():
+            if k in keys:
+                yield k
+            yield from _walk_for_keys(v, keys)
+    elif isinstance(value, list):
+        for v in value:
+            yield from _walk_for_keys(v, keys)
+
+
+def _full_record():
+    """A species record with opt+freq+sp results plus thermo populated."""
+    record = _fake_record()
+    record["freq_n_imag"] = 0
+    record["zpe_hartree"] = 0.024131
+    record["sp_energy_hartree"] = -154.987
+    record["thermo"] = {
+        "h298_kj_mol": -235.1,
+        "s298_j_mol_k": 282.6,
+        "tmin_k": 100.0,
+        "tmax_k": 5000.0,
+        "nasa_low": {
+            "tmin_k": 100.0,
+            "tmax_k": 1000.0,
+            "coeffs": [4.0, -1e-3, 2e-6, -1e-9, 4e-13, -29000.0, 1.0],
+        },
+        "nasa_high": {
+            "tmin_k": 1000.0,
+            "tmax_k": 5000.0,
+            "coeffs": [3.5, 1e-3, -2e-7, 1e-11, -3e-15, -28500.0, 5.0],
+        },
+        "cp_data": [
+            {"temperature_k": 300.0, "cp_j_mol_k": 33.6},
+            {"temperature_k": 400.0, "cp_j_mol_k": 35.2},
+            {"temperature_k": 500.0, "cp_j_mol_k": 37.0},
+        ],
+    }
+    return record
+
+
+class TestComputedSpeciesBundle(unittest.TestCase):
+    """Producer-side tests for the /uploads/computed-species bundle path."""
+
+    def setUp(self):
+        self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-bundle-")
+        self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True)
+        self.cfg = TCKDBConfig(
+            enabled=True,
+            base_url="http://localhost:8000/api/v1",
+            payload_dir=self.tmp,
+            api_key_env="X_TCKDB_API_KEY",
+            project_label="proj-A",
+            upload_mode="computed_species",
+        )
+
+    def _adapter(self, client, *, project_directory=None, cfg=None):
+        return TCKDBAdapter(
+            cfg or self.cfg,
+            project_directory=project_directory,
+            client_factory=lambda c, k: client,
+        )
+
+    def _submit(self, *, output_doc=None, record=None, client=None,
+                project_directory=None, cfg=None):
+        client = client or _StubClient(response=_StubResponse({
+            "species_entry_id": 7,
+            "conformers": [{
+                "key": "conf0",
+                "conformer_group_id": 3,
+                "conformer_observation_id": 11,
+                "primary_calculation": {"key": "opt", "calculation_id": 100, "type": "opt", "role": "primary"},
+                "additional_calculations": [
+                    {"key": "freq", "calculation_id": 101, "type": "freq", "role": "additional"},
+                    {"key": "sp", "calculation_id": 102, "type": "sp", "role": "additional"},
+                ],
+            }],
+            "thermo": {"thermo_id": 9},
+        }))
+        adapter = self._adapter(client, project_directory=project_directory, cfg=cfg)
+        with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}):
+            outcome = adapter.submit_computed_species_from_output(
+                output_doc=output_doc or _fake_output_doc(),
+                species_record=record or _full_record(),
+            )
+        return outcome, client, json.loads(outcome.payload_path.read_text())
+
+    # ---------------- 1: payload contains species_entry + one conformer
+    def test_payload_contains_species_entry_and_one_conformer(self):
+        _, _, payload = self._submit()
+
self.assertIn("species_entry", payload) + self.assertEqual(payload["species_entry"]["smiles"], "CCO") + self.assertEqual(payload["species_entry"]["charge"], 0) + self.assertEqual(payload["species_entry"]["multiplicity"], 1) + self.assertEqual(len(payload["conformers"]), 1) + self.assertEqual(payload["conformers"][0]["key"], "conf0") + + # ---------------- 2: primary opt maps correctly + def test_primary_opt_calculation_maps_correctly(self): + _, _, payload = self._submit() + primary = payload["conformers"][0]["primary_calculation"] + self.assertEqual(primary["key"], "opt") + self.assertEqual(primary["type"], "opt") + self.assertEqual(primary["quality"], "raw") + self.assertEqual(primary["level_of_theory"]["method"], "wb97xd") + self.assertEqual(primary["software_release"]["name"], "gaussian") + self.assertIn("opt_result", primary) + self.assertEqual(primary["opt_result"]["n_steps"], 12) + + # ---------------- 3: freq+sp included when fields exist + def test_freq_and_sp_included_when_fields_exist(self): + _, _, payload = self._submit() + keys = [c["key"] for c in payload["conformers"][0]["additional_calculations"]] + self.assertEqual(keys, ["freq", "sp"]) + freq = payload["conformers"][0]["additional_calculations"][0] + sp = payload["conformers"][0]["additional_calculations"][1] + self.assertEqual(freq["freq_result"]["n_imag"], 0) + self.assertAlmostEqual(freq["freq_result"]["zpe_hartree"], 0.024131) + self.assertAlmostEqual(sp["sp_result"]["electronic_energy_hartree"], -154.987) + + # ---------------- 4: dependencies point to opt by local key + def test_dependencies_point_to_opt_by_local_key(self): + _, _, payload = self._submit() + freq, sp = payload["conformers"][0]["additional_calculations"] + self.assertEqual( + freq["depends_on"], + [{"parent_calculation_key": "opt", "role": "freq_on"}], + ) + self.assertEqual( + sp["depends_on"], + [{"parent_calculation_key": "opt", "role": "single_point_on"}], + ) + # Primary opt has no dependencies in the bundle namespace. + self.assertNotIn("depends_on", payload["conformers"][0]["primary_calculation"]) + + # ---------------- 5: missing freq/sp omits calc + dep + def test_missing_freq_sp_omits_calc_and_dep(self): + record = _fake_record() # no freq_*, no sp_* + _, _, payload = self._submit(record=record) + self.assertEqual(payload["conformers"][0]["additional_calculations"], []) + # Thermo also has no freq/sp source links, since neither calc was included. + self.assertNotIn("thermo", payload) + + # ---------------- 6: thermo scalar fields + def test_thermo_scalar_fields_map_correctly(self): + _, _, payload = self._submit() + thermo = payload["thermo"] + self.assertAlmostEqual(thermo["h298_kj_mol"], -235.1) + self.assertAlmostEqual(thermo["s298_j_mol_k"], 282.6) + self.assertAlmostEqual(thermo["tmin_k"], 100.0) + self.assertAlmostEqual(thermo["tmax_k"], 5000.0) + + # ---------------- 7: NASA coefficients + def test_nasa_coefficients_map_correctly(self): + _, _, payload = self._submit() + nasa = payload["thermo"]["nasa"] + self.assertAlmostEqual(nasa["t_low"], 100.0) + self.assertAlmostEqual(nasa["t_mid"], 1000.0) + self.assertAlmostEqual(nasa["t_high"], 5000.0) + self.assertAlmostEqual(nasa["a1"], 4.0) + self.assertAlmostEqual(nasa["a7"], 1.0) + self.assertAlmostEqual(nasa["b1"], 3.5) + self.assertAlmostEqual(nasa["b7"], 5.0) + + # ---------------- 8: malformed NASA skips block, keeps scalar thermo + def test_malformed_nasa_skips_block_keeps_scalar(self): + record = _full_record() + # Drop one coefficient → not 7 → NASA block must be skipped. 
+ record["thermo"]["nasa_low"]["coeffs"] = [1.0] * 6 + with self.assertLogs("arc", level="WARNING") as logs: + _, _, payload = self._submit(record=record) + self.assertNotIn("nasa", payload["thermo"]) + # Scalar thermo retained. + self.assertAlmostEqual(payload["thermo"]["h298_kj_mol"], -235.1) + self.assertAlmostEqual(payload["thermo"]["s298_j_mol_k"], 282.6) + # Cp points retained. + self.assertEqual(len(payload["thermo"]["points"]), 3) + self.assertTrue(any("NASA block skipped" in m for m in logs.output)) + + # ---------------- 9: Cp points + def test_cp_points_map_correctly(self): + _, _, payload = self._submit() + points = payload["thermo"]["points"] + self.assertEqual([p["temperature_k"] for p in points], [300.0, 400.0, 500.0]) + self.assertAlmostEqual(points[0]["cp_j_mol_k"], 33.6) + + # ---------------- 10: thermo source links use local keys + def test_thermo_source_calculations_use_local_keys(self): + _, _, payload = self._submit() + sources = payload["thermo"]["source_calculations"] + self.assertEqual( + sources, + [ + {"calculation_key": "freq", "role": "freq"}, + {"calculation_key": "sp", "role": "sp"}, + ], + ) + + # ---------------- 11: payload has no DB ids anywhere + def test_payload_has_no_existing_or_source_calculation_id(self): + _, _, payload = self._submit() + forbidden_hits = list(_walk_for_keys(payload, _FORBIDDEN_BUNDLE_ID_FIELDS)) + self.assertEqual(forbidden_hits, [], msg=f"forbidden DB id keys present: {forbidden_hits}") + + # ---------------- 12: artifacts attach under correct calc when enabled + def test_artifacts_attach_under_correct_calc_when_enabled(self): + # Create a project dir with three real log files, configure artifacts on. + proj = pathlib.Path(self.tmp) / "project" + proj.mkdir() + for name in ("opt.log", "freq.log", "sp.log"): + (proj / name).write_bytes( + # Fake but valid Gaussian header so any future signature check passes; + # the producer doesn't validate, but real fakes are cheaper than mocks. 
+ b" Entering Gaussian System, Link 0\n" + b"x" * 256 + ) + record = _full_record() + record["opt_log"] = "opt.log" + record["freq_log"] = "freq.log" + record["sp_log"] = "sp.log" + cfg_with_art = TCKDBConfig( + enabled=True, + base_url=self.cfg.base_url, + payload_dir=str(proj / "tckdb_payloads"), + api_key_env=self.cfg.api_key_env, + project_label=self.cfg.project_label, + upload_mode="computed_species", + artifacts=TCKDBArtifactConfig(upload=True, kinds=("output_log",), max_size_mb=50), + ) + _, _, payload = self._submit( + record=record, + project_directory=str(proj), + cfg=cfg_with_art, + ) + primary = payload["conformers"][0]["primary_calculation"] + freq, sp = payload["conformers"][0]["additional_calculations"] + self.assertEqual(len(primary["artifacts"]), 1) + self.assertEqual(primary["artifacts"][0]["kind"], "output_log") + self.assertEqual(primary["artifacts"][0]["filename"], "opt.log") + self.assertEqual(len(freq["artifacts"]), 1) + self.assertEqual(freq["artifacts"][0]["filename"], "freq.log") + self.assertEqual(len(sp["artifacts"]), 1) + self.assertEqual(sp["artifacts"][0]["filename"], "sp.log") + + # ---------------- 13: artifacts disabled = no artifact lists emitted + def test_artifact_disabled_produces_no_artifacts(self): + _, _, payload = self._submit() # default cfg has artifacts.upload=False + primary = payload["conformers"][0]["primary_calculation"] + self.assertNotIn("artifacts", primary) + for calc in payload["conformers"][0]["additional_calculations"]: + self.assertNotIn("artifacts", calc) + + # ---------------- 12b: input-deck artifacts inline alongside output_log + def test_input_artifacts_inline_when_kind_enabled_and_path_recorded(self): + """``_input`` from output.yml drives the inline ``input`` artifact.""" + proj = pathlib.Path(self.tmp) / "project_inputs" + proj.mkdir() + # Produce all six artifact files: opt log + deck, freq log + deck, sp log + deck. + # output_log files need a fake ESS header so any future signature check + # would pass; deck contents are arbitrary text. + for name in ("opt.log", "freq.log", "sp.log"): + (proj / name).write_bytes( + b" Entering Gaussian System, Link 0\n" + b"x" * 256 + ) + for name in ("opt.gjf", "freq.gjf", "sp.gjf"): + (proj / name).write_text("# opt freq=hpmodes wb97xd/def2tzvp\n") + record = _full_record() + record["opt_log"] = "opt.log" + record["freq_log"] = "freq.log" + record["sp_log"] = "sp.log" + # The new schema-extension keys from arc/output.py. + record["opt_input"] = "opt.gjf" + record["freq_input"] = "freq.gjf" + record["sp_input"] = "sp.gjf" + cfg = TCKDBConfig( + enabled=True, + base_url=self.cfg.base_url, + payload_dir=str(proj / "tckdb_payloads"), + api_key_env=self.cfg.api_key_env, + project_label=self.cfg.project_label, + upload_mode="computed_species", + artifacts=TCKDBArtifactConfig( + upload=True, kinds=("output_log", "input"), max_size_mb=50, + ), + ) + _, _, payload = self._submit(record=record, project_directory=str(proj), cfg=cfg) + primary = payload["conformers"][0]["primary_calculation"] + freq, sp = payload["conformers"][0]["additional_calculations"] + # Each calc should now have BOTH kinds inlined. 
+        for calc, expected_files in (
+            (primary, ("opt.log", "opt.gjf")),
+            (freq, ("freq.log", "freq.gjf")),
+            (sp, ("sp.log", "sp.gjf")),
+        ):
+            self.assertEqual(len(calc["artifacts"]), 2,
+                             msg=f"{calc['key']}: expected log+input, got {calc['artifacts']}")
+            kinds = [a["kind"] for a in calc["artifacts"]]
+            self.assertEqual(sorted(kinds), ["input", "output_log"])
+            filenames = {a["filename"] for a in calc["artifacts"]}
+            self.assertEqual(filenames, set(expected_files))
+
+    def test_input_artifact_omitted_when_kind_not_configured(self):
+        """``input`` not in config.artifacts.kinds → only output_log emitted, even if path recorded."""
+        proj = pathlib.Path(self.tmp) / "project_no_input_kind"
+        proj.mkdir()
+        (proj / "opt.log").write_bytes(b" Entering Gaussian System, Link 0\n" + b"x" * 256)
+        (proj / "opt.gjf").write_text("# opt\n")
+        record = _full_record()
+        record["opt_log"] = "opt.log"
+        record["opt_input"] = "opt.gjf"  # path is set, but kind isn't enabled
+        record.pop("freq_log", None)
+        record.pop("sp_log", None)
+        record["freq_n_imag"] = None  # drop freq/sp so we only test the opt calc
+        record["zpe_hartree"] = None
+        record.pop("sp_energy_hartree", None)
+        record.pop("thermo", None)  # avoid thermo's source-link assertions
+        cfg = TCKDBConfig(
+            enabled=True,
+            base_url=self.cfg.base_url,
+            payload_dir=str(proj / "tckdb_payloads"),
+            api_key_env=self.cfg.api_key_env,
+            project_label=self.cfg.project_label,
+            upload_mode="computed_species",
+            artifacts=TCKDBArtifactConfig(
+                upload=True, kinds=("output_log",), max_size_mb=50,
+            ),
+        )
+        _, _, payload = self._submit(record=record, project_directory=str(proj), cfg=cfg)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertEqual(len(primary["artifacts"]), 1)
+        self.assertEqual(primary["artifacts"][0]["kind"], "output_log")
+
+    def test_input_artifact_omitted_when_path_field_null(self):
+        """``input`` enabled but record has no ``_input`` → just output_log emitted."""
+        proj = pathlib.Path(self.tmp) / "project_input_null"
+        proj.mkdir()
+        (proj / "opt.log").write_bytes(b" Entering Gaussian System, Link 0\n" + b"x" * 256)
+        record = _full_record()
+        record["opt_log"] = "opt.log"
+        record["opt_input"] = None  # explicitly None — file wasn't kept
+        record.pop("freq_log", None)
+        record.pop("sp_log", None)
+        record["freq_n_imag"] = None
+        record["zpe_hartree"] = None
+        record.pop("sp_energy_hartree", None)
+        record.pop("thermo", None)
+        cfg = TCKDBConfig(
+            enabled=True,
+            base_url=self.cfg.base_url,
+            payload_dir=str(proj / "tckdb_payloads"),
+            api_key_env=self.cfg.api_key_env,
+            project_label=self.cfg.project_label,
+            upload_mode="computed_species",
+            artifacts=TCKDBArtifactConfig(
+                upload=True, kinds=("output_log", "input"), max_size_mb=50,
+            ),
+        )
+        _, _, payload = self._submit(record=record, project_directory=str(proj), cfg=cfg)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertEqual(len(primary["artifacts"]), 1)
+        self.assertEqual(primary["artifacts"][0]["kind"], "output_log")
+
+    # ---------------- 14/15: sidecar contract
+    def test_sidecar_payload_kind_and_endpoint(self):
+        outcome, _, _ = self._submit()
+        sc = json.loads(outcome.sidecar_path.read_text())
+        self.assertEqual(sc["payload_kind"], "computed_species")
+        self.assertEqual(sc["endpoint"], "/uploads/computed-species")
+        self.assertEqual(sc["bundle_format_version"], "0")
+        # Sidecar must live under the dedicated computed_species subdir
+        # so replay can sweep it without colliding with conformer sidecars.
+ self.assertIn("computed_species", str(outcome.sidecar_path)) + + # ---------------- 16: stable idempotency key + def test_idempotency_key_stable_for_identical_payload(self): + out1, _, _ = self._submit() + out2, _, _ = self._submit() + self.assertEqual(out1.idempotency_key, out2.idempotency_key) + # And distinct from a payload-altering change. + modified_record = _full_record() + modified_record["sp_energy_hartree"] = -200.0 + out3, _, _ = self._submit(record=modified_record) + self.assertNotEqual(out1.idempotency_key, out3.idempotency_key) + + # ---------------- 17: payload written before live upload + def test_payload_written_before_live_upload(self): + client = _StubClient(raise_exc=RuntimeError("network down")) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=_fake_output_doc(), + species_record=_full_record(), + ) + self.assertEqual(outcome.status, "failed") + self.assertTrue(outcome.payload_path.exists(), + "payload must hit disk before any network call") + # Sidecar exists and records the failure. + self.assertTrue(outcome.sidecar_path.exists()) + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["status"], "failed") + self.assertIn("network down", sc["last_error"]) + + # ---------------- 18: live upload success marks sidecar uploaded + def test_live_upload_success_marks_sidecar_uploaded(self): + outcome, client, _ = self._submit() + self.assertEqual(outcome.status, "uploaded") + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["status"], "uploaded") + self.assertEqual(sc["response_status_code"], 201) + # And the call really went to the bundle endpoint. + self.assertEqual(client.calls[0]["path"], "/uploads/computed-species") + # Idempotency-Key was sent. + self.assertEqual(client.calls[0]["idempotency_key"], outcome.idempotency_key) + + # ---------------- 19: live upload failure preserves payload, marks failed + def test_live_upload_failure_marks_failed_and_preserves_payload(self): + client = _StubClient(raise_exc=RuntimeError("HTTP 503")) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=_fake_output_doc(), + species_record=_full_record(), + ) + # Payload still on disk verbatim — replay-ready. + self.assertTrue(outcome.payload_path.exists()) + replay_payload = json.loads(outcome.payload_path.read_text()) + self.assertIn("species_entry", replay_payload) + self.assertIn("conformers", replay_payload) + self.assertEqual(outcome.status, "failed") + self.assertIn("HTTP 503", outcome.error) + + # ---------------- 20: producer reads only output_doc + species_record (mapping arg shape) + def test_producer_consumes_only_output_doc_and_record_mappings(self): + # Pass plain dicts (no ARC class instances). If the adapter ever + # reaches into ARC live objects, this test would surface that + # via a missing-attribute crash. + plain_doc = dict(_fake_output_doc()) + plain_record = dict(_full_record()) + outcome, _, _ = self._submit(output_doc=plain_doc, record=plain_record) + self.assertEqual(outcome.status, "uploaded") if __name__ == "__main__": diff --git a/arc/tckdb/config.py b/arc/tckdb/config.py index 8fd7c5ba0c..05f9b4fdba 100644 --- a/arc/tckdb/config.py +++ b/arc/tckdb/config.py @@ -9,17 +9,101 @@ at upload time. 
""" -from __future__ import annotations - +import logging import os +from collections.abc import Mapping from dataclasses import dataclass, field -from typing import Any, Mapping +from typing import Any + + +logger = logging.getLogger("arc") DEFAULT_PAYLOAD_DIR = "tckdb_payloads" DEFAULT_TIMEOUT_SECONDS = 30.0 DEFAULT_API_KEY_ENV = "TCKDB_API_KEY" +# Upload-mode switch. ``conformer`` (default) keeps the existing +# /uploads/conformers + per-artifact path. ``computed_species`` builds +# one self-contained bundle and posts it to /uploads/computed-species. +# A run can use either; mixing per-species is intentionally not +# supported — pick one mode per ARC run. +UPLOAD_MODE_CONFORMER = "conformer" +UPLOAD_MODE_COMPUTED_SPECIES = "computed_species" +VALID_UPLOAD_MODES = frozenset({UPLOAD_MODE_CONFORMER, UPLOAD_MODE_COMPUTED_SPECIES}) + +# Mirrors the server-side ArtifactKind enum +# (backend/app/db/models/common.py:147 in TCKDB_v2). Keeping the source +# of truth here keeps the adapter loud-failing on unknown kinds at +# config parse time rather than at HTTP 422 time. +VALID_ARTIFACT_KINDS = frozenset({ + "input", + "output_log", + "checkpoint", + "formatted_checkpoint", + "ancillary", +}) + +# Subset of VALID_ARTIFACT_KINDS that ARC actually has the codepath to +# produce. Listing a valid-but-not-implemented kind in input.yml is +# permitted (so users can opt into future kinds early) but warned at +# parse time so the silent-zero-uploads outcome is visible. +# +# As new upload paths land, add their kinds here. Today: +# - output_log: ESS log file, from record["opt_log"]/["freq_log"]/["sp_log"] +# - input: ESS input deck (input.gjf / ZMAT / input.in), sibling of opt_log +IMPLEMENTED_ARTIFACT_KINDS = frozenset({"output_log", "input"}) + +DEFAULT_ARTIFACT_KINDS: tuple[str, ...] = ("output_log",) +DEFAULT_ARTIFACT_MAX_SIZE_MB = 50 + + +@dataclass(frozen=True) +class TCKDBArtifactConfig: + """Per-artifact upload knobs. + + Defaults are conservative: artifact upload is opt-in (``upload=False``) + and only ``output_log`` is in scope. Users opt in by adding an + ``artifacts:`` sub-block to the ``tckdb`` config. + """ + + upload: bool = False + kinds: tuple[str, ...] = DEFAULT_ARTIFACT_KINDS + max_size_mb: int = DEFAULT_ARTIFACT_MAX_SIZE_MB + + @classmethod + def from_dict(cls, raw: Mapping[str, Any] | None) -> "TCKDBArtifactConfig": + if not raw: + return cls() + kinds_raw = raw.get("kinds", DEFAULT_ARTIFACT_KINDS) + if isinstance(kinds_raw, str): + kinds_raw = (kinds_raw,) + kinds = tuple(str(k) for k in kinds_raw) + unknown = [k for k in kinds if k not in VALID_ARTIFACT_KINDS] + if unknown: + raise ValueError( + f"tckdb.artifacts.kinds contains unknown kind(s): {unknown}. " + f"Valid kinds: {sorted(VALID_ARTIFACT_KINDS)}." + ) + not_implemented = [k for k in kinds if k not in IMPLEMENTED_ARTIFACT_KINDS] + if not_implemented: + logger.warning( + "tckdb.artifacts.kinds includes kind(s) the TCKDB server accepts " + "but ARC doesn't yet produce uploads for: %s. Currently implemented: %s. " + "These will be silently skipped at upload time.", + not_implemented, sorted(IMPLEMENTED_ARTIFACT_KINDS), + ) + max_size_mb = int(raw.get("max_size_mb", DEFAULT_ARTIFACT_MAX_SIZE_MB)) + if max_size_mb <= 0: + raise ValueError( + f"tckdb.artifacts.max_size_mb must be > 0; got {max_size_mb}." 
+ ) + return cls( + upload=bool(raw.get("upload", False)), + kinds=kinds, + max_size_mb=max_size_mb, + ) + @dataclass(frozen=True) class TCKDBConfig: @@ -44,6 +128,8 @@ class TCKDBConfig: default=None, metadata={"help": "Optional ARC project/run label baked into idempotency keys."}, ) + upload_mode: str = UPLOAD_MODE_CONFORMER + artifacts: TCKDBArtifactConfig = field(default_factory=TCKDBArtifactConfig) @classmethod def from_dict(cls, raw: Mapping[str, Any] | None) -> "TCKDBConfig | None": @@ -59,6 +145,12 @@ def from_dict(cls, raw: Mapping[str, Any] | None) -> "TCKDBConfig | None": base_url = raw.get("base_url") if not isinstance(base_url, str) or not base_url: raise ValueError("tckdb.base_url is required when tckdb.enabled is true.") + upload_mode = str(raw.get("upload_mode", UPLOAD_MODE_CONFORMER)) + if upload_mode not in VALID_UPLOAD_MODES: + raise ValueError( + f"tckdb.upload_mode must be one of {sorted(VALID_UPLOAD_MODES)}; " + f"got {upload_mode!r}." + ) return cls( enabled=True, base_url=base_url, @@ -68,6 +160,8 @@ def from_dict(cls, raw: Mapping[str, Any] | None) -> "TCKDBConfig | None": strict=bool(raw.get("strict", False)), timeout_seconds=float(raw.get("timeout_seconds", DEFAULT_TIMEOUT_SECONDS)), project_label=raw.get("project_label"), + upload_mode=upload_mode, + artifacts=TCKDBArtifactConfig.from_dict(raw.get("artifacts")), ) def resolve_api_key(self) -> str | None: diff --git a/arc/tckdb/config_test.py b/arc/tckdb/config_test.py index d33bb69267..d9abbc35ad 100644 --- a/arc/tckdb/config_test.py +++ b/arc/tckdb/config_test.py @@ -3,15 +3,21 @@ """Unit tests for arc.tckdb.config.""" +import logging import os import unittest from unittest import mock from arc.tckdb.config import ( DEFAULT_API_KEY_ENV, + DEFAULT_ARTIFACT_KINDS, + DEFAULT_ARTIFACT_MAX_SIZE_MB, DEFAULT_PAYLOAD_DIR, DEFAULT_TIMEOUT_SECONDS, + IMPLEMENTED_ARTIFACT_KINDS, + TCKDBArtifactConfig, TCKDBConfig, + VALID_ARTIFACT_KINDS, ) @@ -72,6 +78,133 @@ def test_resolve_api_key_missing(self): os.environ.pop("DOES_NOT_EXIST_X", None) self.assertIsNone(cfg.resolve_api_key()) + def test_artifacts_defaults_when_omitted(self): + cfg = TCKDBConfig.from_dict({"enabled": True, "base_url": "http://x"}) + self.assertIsNotNone(cfg) + self.assertFalse(cfg.artifacts.upload) + self.assertEqual(cfg.artifacts.kinds, DEFAULT_ARTIFACT_KINDS) + self.assertEqual(cfg.artifacts.max_size_mb, DEFAULT_ARTIFACT_MAX_SIZE_MB) + + def test_artifacts_full_block(self): + cfg = TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "artifacts": { + "upload": True, + "kinds": ["output_log", "input"], + "max_size_mb": 25, + }, + }) + self.assertTrue(cfg.artifacts.upload) + self.assertEqual(cfg.artifacts.kinds, ("output_log", "input")) + self.assertEqual(cfg.artifacts.max_size_mb, 25) + + def test_artifacts_partial_block_uses_defaults(self): + cfg = TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "artifacts": {"upload": True}, + }) + self.assertTrue(cfg.artifacts.upload) + self.assertEqual(cfg.artifacts.kinds, DEFAULT_ARTIFACT_KINDS) + self.assertEqual(cfg.artifacts.max_size_mb, DEFAULT_ARTIFACT_MAX_SIZE_MB) + + def test_artifacts_unknown_kind_rejected(self): + with self.assertRaises(ValueError) as ctx: + TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "artifacts": {"kinds": ["output_log", "bogus"]}, + }) + self.assertIn("bogus", str(ctx.exception)) + + def test_artifacts_max_size_zero_rejected(self): + with self.assertRaises(ValueError): + TCKDBConfig.from_dict({ + "enabled": True, + 
"base_url": "http://x", + "artifacts": {"max_size_mb": 0}, + }) + + def test_artifacts_max_size_negative_rejected(self): + with self.assertRaises(ValueError): + TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "artifacts": {"max_size_mb": -5}, + }) + + def test_artifacts_kinds_string_normalized_to_tuple(self): + cfg = TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "artifacts": {"kinds": "output_log"}, + }) + self.assertEqual(cfg.artifacts.kinds, ("output_log",)) + + def test_valid_artifact_kinds_matches_server_enum(self): + # Sanity check: keep the ARC-side allowlist in sync with the + # server's ArtifactKind enum. + self.assertEqual( + VALID_ARTIFACT_KINDS, + frozenset({"input", "output_log", "checkpoint", + "formatted_checkpoint", "ancillary"}), + ) + + +class TestTCKDBArtifactConfig(unittest.TestCase): + + def test_dataclass_defaults(self): + c = TCKDBArtifactConfig() + self.assertFalse(c.upload) + self.assertEqual(c.kinds, DEFAULT_ARTIFACT_KINDS) + self.assertEqual(c.max_size_mb, DEFAULT_ARTIFACT_MAX_SIZE_MB) + + +class TestImplementedKinds(unittest.TestCase): + """Server-accepted vs ARC-implemented kinds split.""" + + def test_implemented_is_subset_of_valid(self): + self.assertTrue(IMPLEMENTED_ARTIFACT_KINDS.issubset(VALID_ARTIFACT_KINDS)) + + def test_implemented_kinds_today(self): + # Pin the current implementation surface so adding a kind is a + # deliberate, test-failing change. + self.assertEqual( + IMPLEMENTED_ARTIFACT_KINDS, + frozenset({"output_log", "input"}), + ) + + def test_unimplemented_kind_warns_but_does_not_raise(self): + with self.assertLogs("arc", level="WARNING") as cm: + cfg = TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "artifacts": {"kinds": ["output_log", "checkpoint"]}, + }) + self.assertIsNotNone(cfg) + self.assertEqual(cfg.artifacts.kinds, ("output_log", "checkpoint")) + joined = "\n".join(cm.output) + self.assertIn("checkpoint", joined) + self.assertIn("doesn't yet produce", joined) + + def test_implemented_only_kinds_no_warning(self): + # No warning when every requested kind is in IMPLEMENTED. + # We assert by checking the warning logger was either silent or + # only emitted unrelated lines (no "doesn't yet produce" phrase). + logger_name = "arc" + with self.assertLogs(logger_name, level="WARNING") as cm: + # Force at least one log line so assertLogs doesn't raise: + logging.getLogger(logger_name).warning("sentinel") + TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "artifacts": {"kinds": ["output_log", "input"]}, + }) + joined = "\n".join(cm.output) + self.assertNotIn("doesn't yet produce", joined) + if __name__ == "__main__": + import logging unittest.main() diff --git a/arc/tckdb/idempotency.py b/arc/tckdb/idempotency.py index 1004ecc0f6..a3c388e294 100644 --- a/arc/tckdb/idempotency.py +++ b/arc/tckdb/idempotency.py @@ -6,13 +6,13 @@ across retries, distinct across logically-different inputs. """ -from __future__ import annotations - import hashlib import json from dataclasses import dataclass from typing import Any +from tckdb_client import make_idempotency_key + @dataclass(frozen=True) class IdempotencyInputs: @@ -72,10 +72,6 @@ def build_idempotency_key(inputs: IdempotencyInputs) -> str: the result against the server constraint ``^[A-Za-z0-9._:-]{16,200}$``; callers don't need to pre-clean parts. """ - # Lazy import so arc.tckdb is importable when the adapter is unused - # and tckdb-client is not installed. 
-    from tckdb_client import make_idempotency_key
-
     parts: list[str] = ["arc"]
     if inputs.project_label:
         parts.append(inputs.project_label)
@@ -90,4 +86,57 @@ def build_idempotency_key(inputs: IdempotencyInputs) -> str:
     return make_idempotency_key(*parts)


-__all__ = ["IdempotencyInputs", "build_idempotency_key"]
+@dataclass(frozen=True)
+class ArtifactIdempotencyInputs:
+    """Stable inputs that identify one logical artifact upload event.
+
+    Distinct from :class:`IdempotencyInputs`: the chemistry-upload key
+    is scoped to a (species, conformer) — but artifact uploads target a
+    concrete TCKDB calculation row, and the same calculation can carry
+    multiple artifacts of different kinds. So the artifact key tail is
+    ``(calculation_id, artifact_kind, artifact_sha256)``.
+
+    The artifact's bytes-hash is part of the key so that re-uploading
+    different content for the same kind under the same calculation
+    produces a different key (i.e. a new upload event), while a literal
+    retry of the same bytes replays.
+    """
+
+    project_label: str | None
+    species_label: str
+    calculation_id: int
+    artifact_kind: str
+    artifact_sha256: str
+
+
+def build_artifact_idempotency_key(inputs: ArtifactIdempotencyInputs) -> str:
+    """Compose a stable per-artifact idempotency key.
+
+    Shape:
+        ``arc:<project_label>:<species_label>:artifact:<calculation_id>:<artifact_kind>:<sha256[:16]>``
+
+    The artifact sha256 is truncated to 16 hex chars to keep the key
+    well under the 200-char server cap while preserving collision
+    resistance for any plausible run.
+    """
+    parts: list[str] = ["arc"]
+    if inputs.project_label:
+        parts.append(inputs.project_label)
+    parts.extend(
+        [
+            inputs.species_label,
+            "artifact",
+            str(inputs.calculation_id),
+            inputs.artifact_kind,
+            inputs.artifact_sha256[:16],
+        ]
+    )
+    return make_idempotency_key(*parts)
+
+
+__all__ = [
+    "ArtifactIdempotencyInputs",
+    "IdempotencyInputs",
+    "build_artifact_idempotency_key",
+    "build_idempotency_key",
+]
diff --git a/arc/tckdb/idempotency_test.py b/arc/tckdb/idempotency_test.py
index 135e745ad7..7cae1424d2 100644
--- a/arc/tckdb/idempotency_test.py
+++ b/arc/tckdb/idempotency_test.py
@@ -6,7 +6,12 @@
 import re
 import unittest

-from arc.tckdb.idempotency import IdempotencyInputs, build_idempotency_key
+from arc.tckdb.idempotency import (
+    ArtifactIdempotencyInputs,
+    IdempotencyInputs,
+    build_artifact_idempotency_key,
+    build_idempotency_key,
+)


 _KEY_PATTERN = re.compile(r"^[A-Za-z0-9._:\-]{16,200}$")
@@ -61,5 +66,67 @@ def test_payload_dict_ordering_does_not_change_key(self):
         self.assertEqual(a, b)


+def _artifact_inputs(**overrides):
+    base = dict(
+        project_label="projA",
+        species_label="ethanol",
+        calculation_id=42,
+        artifact_kind="output_log",
+        artifact_sha256="0" * 64,
+    )
+    base.update(overrides)
+    return ArtifactIdempotencyInputs(**base)
+
+
+class TestArtifactIdempotency(unittest.TestCase):
+
+    def test_key_matches_server_pattern(self):
+        key = build_artifact_idempotency_key(_artifact_inputs())
+        self.assertRegex(key, _KEY_PATTERN)
+
+    def test_key_stable_across_calls(self):
+        a = build_artifact_idempotency_key(_artifact_inputs())
+        b = build_artifact_idempotency_key(_artifact_inputs())
+        self.assertEqual(a, b)
+
+    def test_key_distinct_across_calc_id(self):
+        a = build_artifact_idempotency_key(_artifact_inputs(calculation_id=1))
+        b = build_artifact_idempotency_key(_artifact_inputs(calculation_id=2))
+        self.assertNotEqual(a, b)
+
+    def test_key_distinct_across_kind(self):
+        a = build_artifact_idempotency_key(_artifact_inputs(artifact_kind="output_log"))
+        b =
build_artifact_idempotency_key(_artifact_inputs(artifact_kind="input")) + self.assertNotEqual(a, b) + + def test_key_distinct_across_sha(self): + a = build_artifact_idempotency_key(_artifact_inputs(artifact_sha256="a" * 64)) + b = build_artifact_idempotency_key(_artifact_inputs(artifact_sha256="b" * 64)) + self.assertNotEqual(a, b) + + def test_key_distinct_across_species(self): + a = build_artifact_idempotency_key(_artifact_inputs(species_label="ethanol")) + b = build_artifact_idempotency_key(_artifact_inputs(species_label="methanol")) + self.assertNotEqual(a, b) + + def test_key_distinct_across_project(self): + a = build_artifact_idempotency_key(_artifact_inputs(project_label="A")) + b = build_artifact_idempotency_key(_artifact_inputs(project_label="B")) + self.assertNotEqual(a, b) + + def test_artifact_key_distinct_from_conformer_key(self): + # Identical species/project; artifact key includes "artifact" + # marker plus calc_id+kind+sha — should never collide with the + # conformer key shape. + artifact_key = build_artifact_idempotency_key(_artifact_inputs()) + conformer_key = build_idempotency_key(_inputs()) + self.assertNotEqual(artifact_key, conformer_key) + self.assertIn(":artifact:", artifact_key) + + def test_key_handles_no_project_label(self): + key = build_artifact_idempotency_key(_artifact_inputs(project_label=None)) + self.assertRegex(key, _KEY_PATTERN) + + if __name__ == "__main__": unittest.main() diff --git a/arc/tckdb/payload_writer.py b/arc/tckdb/payload_writer.py index e0c5e38c48..42615cb50b 100644 --- a/arc/tckdb/payload_writer.py +++ b/arc/tckdb/payload_writer.py @@ -10,8 +10,6 @@ disk rather than no trace at all. """ -from __future__ import annotations - import json import os import re @@ -35,6 +33,13 @@ def _safe_label(label: str) -> str: return cleaned[:120] +#: Bundle format the sidecars on disk conform to. Read by +#: ``tckdb-client``'s replay tool, which only attempts an upload when +#: this value is in its ``supported_format_versions`` list. Bump *only* +#: as a coordinated change with the client. +BUNDLE_FORMAT_VERSION = "0" + + @dataclass class SidecarMetadata: """On-disk record of one upload attempt; updated in place after upload.""" @@ -43,6 +48,7 @@ class SidecarMetadata: endpoint: str idempotency_key: str payload_kind: str + bundle_format_version: str = BUNDLE_FORMAT_VERSION created_at: str = field(default_factory=_utcnow_iso) uploaded_at: str | None = None status: str = "pending" @@ -65,6 +71,51 @@ class WrittenPayload: sidecar: SidecarMetadata +@dataclass +class ArtifactSidecarMetadata: + """On-disk record of one artifact upload event; updated in place after upload. + + Distinct from :class:`SidecarMetadata`: artifact uploads target a + concrete TCKDB calculation row (calculation_id) and carry the bytes + hash so a replay tool can re-verify before retransmission. + + ``payload_kind`` is fixed to ``"calculation_artifact"`` so the replay + tool can dispatch on the same field that conformer sidecars use — + without it, every artifact sidecar lands in the ``__unknown__`` + bucket and is skipped. 
+ """ + + endpoint: str + idempotency_key: str + calculation_id: int + kind: str + filename: str + sha256: str + bytes: int + source_path: str | None = None + payload_kind: str = "calculation_artifact" + bundle_format_version: str = BUNDLE_FORMAT_VERSION + created_at: str = field(default_factory=_utcnow_iso) + uploaded_at: str | None = None + status: str = "pending" + response_status_code: int | None = None + response_body: Any = None + idempotency_replayed: bool | None = None + last_error: str | None = None + base_url: str | None = None + + def to_json(self) -> dict[str, Any]: + return asdict(self) + + +@dataclass(frozen=True) +class WrittenArtifact: + """Handle returned by :meth:`PayloadWriter.write_artifact` for downstream sidecar updates.""" + + sidecar_path: Path + sidecar: ArtifactSidecarMetadata + + class PayloadWriter: """File-system surface for the conformer-calculation upload payload. @@ -79,8 +130,11 @@ class PayloadWriter: """ SUBDIR = "conformer_calculation" + ARTIFACT_SUBDIR = "calculation_artifacts" + COMPUTED_SPECIES_SUBDIR = "computed_species" PAYLOAD_SUFFIX = ".payload.json" SIDECAR_SUFFIX = ".meta.json" + ARTIFACT_SIDECAR_SUFFIX = ".artifact.meta.json" def __init__(self, root_dir: str | os.PathLike[str]): self._root = Path(root_dir) @@ -98,14 +152,20 @@ def write( idempotency_key: str, payload_kind: str = "conformer_calculation", base_url: str | None = None, + subdir: str | None = None, ) -> WrittenPayload: """Write payload JSON and an initial ``pending`` sidecar atomically. + ``subdir`` selects the on-disk bucket (``conformer_calculation`` + for the conformer endpoint, ``computed_species`` for the bundle + endpoint). When omitted, the default ``SUBDIR`` is used to keep + existing callers untouched. + Returns a :class:`WrittenPayload` carrying both paths and the in-memory sidecar dataclass. Callers update the sidecar via :meth:`update_sidecar` after the upload resolves. """ - directory = self._root / self.SUBDIR + directory = self._root / (subdir or self.SUBDIR) directory.mkdir(parents=True, exist_ok=True) safe = _safe_label(label) payload_path = directory / f"{safe}{self.PAYLOAD_SUFFIX}" @@ -132,6 +192,55 @@ def update_sidecar(self, sidecar_path: Path, sidecar: SidecarMetadata) -> None: """Rewrite the sidecar in place with the latest status.""" self._write_json_atomic(sidecar_path, sidecar.to_json()) + def write_artifact_sidecar( + self, + *, + species_label: str, + calculation_id: int, + kind: str, + filename: str, + sha256: str, + bytes_: int, + endpoint: str, + idempotency_key: str, + source_path: str | None = None, + base_url: str | None = None, + ) -> WrittenArtifact: + """Write a ``pending`` artifact sidecar before the network call. + + One sidecar per (species, calculation, kind) tuple. The same + species can have multiple sidecars across calculations (opt / + freq / sp) and across kinds (output_log / checkpoint). The + kind+calc-id+species combination keeps filenames distinct and + makes the replay path scriptable. 
+ """ + directory = self._root / self.ARTIFACT_SUBDIR + directory.mkdir(parents=True, exist_ok=True) + safe_species = _safe_label(species_label) + safe_kind = _safe_label(kind) + sidecar_name = f"{safe_species}.calc{calculation_id}.{safe_kind}{self.ARTIFACT_SIDECAR_SUFFIX}" + sidecar_path = directory / sidecar_name + + sidecar = ArtifactSidecarMetadata( + endpoint=endpoint, + idempotency_key=idempotency_key, + calculation_id=calculation_id, + kind=kind, + filename=filename, + sha256=sha256, + bytes=bytes_, + source_path=source_path, + base_url=base_url, + ) + self._write_json_atomic(sidecar_path, sidecar.to_json()) + return WrittenArtifact(sidecar_path=sidecar_path, sidecar=sidecar) + + def update_artifact_sidecar( + self, sidecar_path: Path, sidecar: ArtifactSidecarMetadata + ) -> None: + """Rewrite the artifact sidecar in place with the latest status.""" + self._write_json_atomic(sidecar_path, sidecar.to_json()) + @staticmethod def _write_json_atomic(path: Path, data: Any) -> None: """Write JSON via tmp+rename so a crash mid-write cannot leave a partial file.""" @@ -142,4 +251,10 @@ def _write_json_atomic(path: Path, data: Any) -> None: os.replace(tmp, path) -__all__ = ["PayloadWriter", "SidecarMetadata", "WrittenPayload"] +__all__ = [ + "ArtifactSidecarMetadata", + "PayloadWriter", + "SidecarMetadata", + "WrittenArtifact", + "WrittenPayload", +] diff --git a/arc/tckdb/payload_writer_test.py b/arc/tckdb/payload_writer_test.py index 31997f4073..1d7c847bea 100644 --- a/arc/tckdb/payload_writer_test.py +++ b/arc/tckdb/payload_writer_test.py @@ -9,7 +9,11 @@ import tempfile import unittest -from arc.tckdb.payload_writer import PayloadWriter, SidecarMetadata +from arc.tckdb.payload_writer import ( + ArtifactSidecarMetadata, + PayloadWriter, + SidecarMetadata, +) class TestPayloadWriter(unittest.TestCase): @@ -37,6 +41,11 @@ def test_write_creates_payload_and_sidecar(self): self.assertEqual(sc["endpoint"], "/uploads/conformers") self.assertIsNone(sc["uploaded_at"]) self.assertIn("created_at", sc) + # Replay-tool contract: every conformer sidecar must declare its + # payload_kind (for dispatch) and bundle_format_version (for the + # version gate). Missing either makes the sidecar unreplayable. 
+ self.assertEqual(sc["payload_kind"], "conformer_calculation") + self.assertEqual(sc["bundle_format_version"], "0") def test_write_sanitizes_label(self): result = self.writer.write( @@ -87,5 +96,80 @@ def test_payload_unchanged_after_sidecar_update(self): self.assertEqual(before, after) +class TestArtifactSidecar(unittest.TestCase): + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-artifact-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.writer = PayloadWriter(self.tmp) + + def _write(self, **overrides): + defaults = dict( + species_label="ethanol", + calculation_id=42, + kind="output_log", + filename="opt.log", + sha256="a" * 64, + bytes_=1024, + endpoint="/calculations/42/artifacts", + idempotency_key="arc:proj:ethanol:artifact:42:output_log:abcdef0123456789", + source_path="/runs/proj/calcs/Species/ethanol/opt_a0/output.log", + base_url="http://localhost:8000/api/v1", + ) + defaults.update(overrides) + return self.writer.write_artifact_sidecar(**defaults) + + def test_artifact_sidecar_initialized_pending(self): + result = self._write() + self.assertTrue(result.sidecar_path.exists()) + on_disk = json.loads(result.sidecar_path.read_text()) + self.assertEqual(on_disk["status"], "pending") + self.assertEqual(on_disk["calculation_id"], 42) + self.assertEqual(on_disk["kind"], "output_log") + self.assertEqual(on_disk["sha256"], "a" * 64) + self.assertEqual(on_disk["bytes"], 1024) + self.assertIn("created_at", on_disk) + self.assertIsNone(on_disk["uploaded_at"]) + self.assertIsNone(on_disk["last_error"]) + # Replay-tool contract: artifact sidecars must declare their + # payload_kind so the replay dispatcher can route them to the + # artifact handler instead of the __unknown__ bucket. + self.assertEqual(on_disk["payload_kind"], "calculation_artifact") + self.assertEqual(on_disk["bundle_format_version"], "0") + + def test_artifact_sidecar_filename_includes_calc_and_kind(self): + result = self._write(calculation_id=99, kind="input") + name = result.sidecar_path.name + self.assertIn("ethanol", name) + self.assertIn("calc99", name) + self.assertIn("input", name) + self.assertTrue(name.endswith(".artifact.meta.json")) + + def test_distinct_sidecars_for_distinct_kinds(self): + a = self._write(kind="output_log") + b = self._write(kind="input") + self.assertNotEqual(a.sidecar_path, b.sidecar_path) + self.assertTrue(a.sidecar_path.exists()) + self.assertTrue(b.sidecar_path.exists()) + + def test_distinct_sidecars_for_distinct_calculations(self): + a = self._write(calculation_id=1) + b = self._write(calculation_id=2) + self.assertNotEqual(a.sidecar_path, b.sidecar_path) + + def test_update_artifact_sidecar_in_place(self): + result = self._write() + sc = result.sidecar + sc.status = "uploaded" + sc.uploaded_at = "2026-04-27T12:00:00Z" + sc.response_status_code = 201 + sc.response_body = {"calculation_id": 42, "artifacts": [{"id": 7}]} + self.writer.update_artifact_sidecar(result.sidecar_path, sc) + on_disk = json.loads(result.sidecar_path.read_text()) + self.assertEqual(on_disk["status"], "uploaded") + self.assertEqual(on_disk["response_status_code"], 201) + self.assertEqual(on_disk["response_body"]["calculation_id"], 42) + + if __name__ == "__main__": unittest.main() diff --git a/arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/input.gjf b/arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/input.gjf new file mode 100644 index 0000000000..75b9ea7702 --- /dev/null +++ b/arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/input.gjf @@ -0,0 +1,12 @@ 
+%chk=check.chk +%mem=14336mb +%NProcShared=12 + +#P opt=(calcfc) cbs-qb3 IOp(2/9=2000) + +spc1 + +0 1 +O 0.00000000 0.00000000 1.00000000 + + diff --git a/arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/submit.sh b/arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/submit.sh new file mode 100644 index 0000000000..53f5ef4237 --- /dev/null +++ b/arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a472/submit.sh @@ -0,0 +1,56 @@ +#!/bin/bash -l + +#PBS -q alon_q +#PBS -N a472 +#PBS -l select=1:ncpus=12:mem=15770000000:mpiprocs=12 +#PBS -o out.txt +#PBS -e err.txt + +. ~/.bashrc + +PBS_O_WORKDIR="/home/calvin.p/runs/arc_projects/calvin.p/runs/ARC_Projects/test/spc1/opt_a472" +cd "$PBS_O_WORKDIR" + +source /usr/local/g09/setup.sh + +GAUSS_SCRDIR="/gtmp/calvin.p/scratch/g09/$PBS_JOBID" + +mkdir -p "$GAUSS_SCRDIR" + +export GAUSS_SCRDIR="$GAUSS_SCRDIR" + +touch initial_time + +cd "$GAUSS_SCRDIR" + +cp "$PBS_O_WORKDIR/input.gjf" "$GAUSS_SCRDIR" + +cleanup() { + echo "Cleaning scratch: $GAUSS_SCRDIR" + cp -f "$GAUSS_SCRDIR"/check.chk "$PBS_O_WORKDIR"/ 2>/dev/null + cp -f "$GAUSS_SCRDIR"/*.rwf "$PBS_O_WORKDIR"/ 2>/dev/null + rm -rf "$GAUSS_SCRDIR" +} +trap cleanup EXIT TERM INT + + + +if [ -f "$PBS_O_WORKDIR/check.chk" ]; then + cp "$PBS_O_WORKDIR/check.chk" "$GAUSS_SCRDIR/" +fi + +g09 < input.gjf > input.log + +cp input.* "$PBS_O_WORKDIR/" + +if [ -f check.chk ]; then + cp check.chk "$PBS_O_WORKDIR/" +fi + +rm -vrf "$GAUSS_SCRDIR" + +cd "$PBS_O_WORKDIR" + +touch final_time + + \ No newline at end of file diff --git a/arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/input.gjf b/arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/input.gjf new file mode 100644 index 0000000000..64eccf99b7 --- /dev/null +++ b/arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/input.gjf @@ -0,0 +1,12 @@ +%chk=check.chk +%mem=14336mb +%NProcShared=12 + +#P opt=(calcfc) cbs-qb3 IOp(2/9=2000) + +spc1_and_2_others + +0 1 +O 0.00000000 0.00000000 1.00000000 + + diff --git a/arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/submit.sh b/arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/submit.sh new file mode 100644 index 0000000000..8c5dd49b30 --- /dev/null +++ b/arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a472/submit.sh @@ -0,0 +1,56 @@ +#!/bin/bash -l + +#PBS -q alon_q +#PBS -N a472 +#PBS -l select=1:ncpus=12:mem=15770000000:mpiprocs=12 +#PBS -o out.txt +#PBS -e err.txt + +. 
~/.bashrc + +PBS_O_WORKDIR="/home/calvin.p/runs/arc_projects/calvin.p/runs/ARC_Projects/test/spc1_and_2_others/conf_opt_a472" +cd "$PBS_O_WORKDIR" + +source /usr/local/g09/setup.sh + +GAUSS_SCRDIR="/gtmp/calvin.p/scratch/g09/$PBS_JOBID" + +mkdir -p "$GAUSS_SCRDIR" + +export GAUSS_SCRDIR="$GAUSS_SCRDIR" + +touch initial_time + +cd "$GAUSS_SCRDIR" + +cp "$PBS_O_WORKDIR/input.gjf" "$GAUSS_SCRDIR" + +cleanup() { + echo "Cleaning scratch: $GAUSS_SCRDIR" + cp -f "$GAUSS_SCRDIR"/check.chk "$PBS_O_WORKDIR"/ 2>/dev/null + cp -f "$GAUSS_SCRDIR"/*.rwf "$PBS_O_WORKDIR"/ 2>/dev/null + rm -rf "$GAUSS_SCRDIR" +} +trap cleanup EXIT TERM INT + + + +if [ -f "$PBS_O_WORKDIR/check.chk" ]; then + cp "$PBS_O_WORKDIR/check.chk" "$GAUSS_SCRDIR/" +fi + +g09 < input.gjf > input.log + +cp input.* "$PBS_O_WORKDIR/" + +if [ -f check.chk ]; then + cp check.chk "$PBS_O_WORKDIR/" +fi + +rm -vrf "$GAUSS_SCRDIR" + +cd "$PBS_O_WORKDIR" + +touch final_time + + \ No newline at end of file diff --git a/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/opt_101/err.txt b/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/opt_101/err.txt new file mode 100644 index 0000000000..17a55b3536 --- /dev/null +++ b/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/opt_101/err.txt @@ -0,0 +1,17 @@ +=>> PBS: job killed: walltime 86415 exceeded limit 86400 +Error: software termination + rax fffffffffffffffc, rbx 00007ffc0d4f90d0, rcx ffffffffffffffff + rdx 0000000000000000, rsp 00007ffc0d4f9098, rbp 0000000000000001 + rsi 00007ffc0d4f90d0, rdi 0000000000038f1b, r8 00002b7af22a5700 + r9 0000000000000000, r10 0000000000000000, r11 0000000000000246 + r12 00007ffc0d4f90f0, r13 000000000000008f, r14 0000000000000000 + r15 00007ffc0d4fff40 +Error: software termination + rax 0000000000024fa8, rbx 00002ae812e9f2c0, rcx 0000000000035498 + rdx 00002ae8c4888bd0, rsp 00007ffde70fb680, rbp 00007ffde70fbf70 + rsi 00002ae8c48be068, rdi 00002ae8c48f3508, r8 00002ae8c49289b0 + r9 0000000000006a93, r10 0000000000006a95, r11 00002ae812ed4768 + r12 00002ae812f66508, r13 00002ae812f9b9b0, r14 0000000000006a92 + r15 00002ae81311f478 + --- traceback not available + --- traceback not available diff --git a/arc/testing/test_JobAdapter_scan/calcs/Species/methanol_and_5_others/scan_a472/input.gjf b/arc/testing/test_JobAdapter_scan/calcs/Species/methanol_and_5_others/scan_a472/input.gjf new file mode 100644 index 0000000000..28a57c66a0 --- /dev/null +++ b/arc/testing/test_JobAdapter_scan/calcs/Species/methanol_and_5_others/scan_a472/input.gjf @@ -0,0 +1,20 @@ +%chk=check.chk +%mem=14336mb +%NProcShared=8 + +#P opt=(calcfc,maxStep=5,modredundant,noeigentest) integral=(grid=ultrafine, Acc2E=12) guess=mix wb97xd/def2-tzvp IOp(2/9=2000) scf=(direct,tight) + +methanol_and_5_others + +0 1 +C -0.36848162 -0.03889184 -0.02265038 +O 0.98355090 -0.39758578 -0.24093779 +H -0.56493882 0.00874072 1.05124761 +H -1.01426265 -0.79474153 -0.47572777 +H -0.56797130 0.93149369 -0.48381953 +H 1.53210349 0.29098475 0.17188786 + +D 2 3 4 5 S 45 8.0 + + + diff --git a/docs/tckdb-integration.md b/docs/tckdb-integration.md index b539a94472..e1cd35106a 100644 --- a/docs/tckdb-integration.md +++ b/docs/tckdb-integration.md @@ -44,6 +44,10 @@ tckdb: strict: false timeout_seconds: 30 project_label: "my-project" # optional; baked into idempotency key + artifacts: # optional sub-block; opt-in + upload: true + kinds: ["output_log", "input"] + max_size_mb: 50 ``` | Field | Default | Notes | @@ -56,6 +60,75 @@ tckdb: | `strict` | `false` | If `true`, upload failure 
raises. |
 | `timeout_seconds` | `30` | Per-request timeout. |
 | `project_label` | `null` | Optional run/project tag baked into the idempotency key. |
+| `artifacts` | _(see below)_ | Optional sub-block controlling per-file attachments. |
+
+### Artifact sub-block
+
+Artifacts are files (ESS logs, input decks, …) attached to an existing
+TCKDB calculation row. The conformer payload is uploaded first; on
+success the adapter can then push artifacts against the returned
+`calculation_id`s.
+
+```yaml
+tckdb:
+  ...
+  artifacts:
+    upload: true
+    kinds: ["output_log", "input"]
+    max_size_mb: 50
+```
+
+| Field | Default | Notes |
+| -------------- | ------------------ | ------------------------------------------------------------------------------ |
+| `upload` | `false` | Opt-in switch. When `false`, no artifact network calls. |
+| `kinds` | `["output_log"]` | Which `ArtifactKind`s to upload. Validated at config-parse time. |
+| `max_size_mb` | `50` | Per-file cap. Larger files are skipped (sidecar `skipped`, reason recorded). |
+
+**Valid kinds** (mirror the server-side `ArtifactKind` enum):
+`input`, `output_log`, `checkpoint`, `formatted_checkpoint`, `ancillary`.
+
+**Currently implemented in ARC:**
+- `output_log` — ESS log from `record["opt_log"] / ["freq_log"] / ["sp_log"]`.
+- `input` — ESS input deck (`input.gjf` / `ZMAT` / `input.in`), sibling of `opt_log`.
+
+Listing a valid-but-not-implemented kind (e.g. `checkpoint`) is allowed
+so users can opt into future kinds early. Config parsing logs a warning,
+and the adapter skips cleanly at upload time rather than 422-ing.
+
+#### Artifact endpoint and idempotency
+
+- Endpoint: `POST /calculations/{calculation_id}/artifacts` with the
+  file bytes base64-encoded in the request body.
+- Idempotency key shape:
+  ```
+  arc:<project_label>:<species_label>:artifact:<calculation_id>:<kind>:<sha256[:16]>
+  ```
+  Identical retry → server replays. Different bytes for the same
+  `(calc_id, kind)` → new key, new upload event.
+
+#### Artifact sidecar layout
+
+Artifact sidecars live alongside conformer payloads:
+
+```
+<project_directory>/tckdb_payloads/
+  calculation_artifacts/
+    <species>.calc<calculation_id>.<kind>.artifact.meta.json
+```
+
+#### Artifact skip / failure semantics
+
+`status` ∈ `uploaded | failed | skipped`. Documented skip reasons:
+
+- `artifacts.upload is False`
+- `kind 'X' not in config.kinds`
+- `kind 'X' is server-accepted but ARC has no upload path yet`
+- `file missing: '...'`
+- `file is <n> bytes (> <max_size_mb> MB cap)`
+
+Strict / non-strict failure behavior matches the conformer path:
+non-strict logs a warning and records the error in the sidecar; strict
+re-raises after updating the sidecar.
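+
+#### Example artifact sidecar
+
+For orientation, this is roughly what an artifact sidecar looks like on
+disk after a successful upload. The field set follows
+`ArtifactSidecarMetadata`; the values are illustrative (taken from the
+unit-test fixtures) and the record is abridged — `source_path`,
+`created_at`, `response_body`, and `base_url` are omitted:
+
+```json
+{
+  "endpoint": "/calculations/42/artifacts",
+  "idempotency_key": "arc:proj:ethanol:artifact:42:output_log:abcdef0123456789",
+  "calculation_id": 42,
+  "kind": "output_log",
+  "filename": "opt.log",
+  "sha256": "<64-hex digest of the file bytes>",
+  "bytes": 1024,
+  "payload_kind": "calculation_artifact",
+  "bundle_format_version": "0",
+  "status": "uploaded",
+  "uploaded_at": "2026-04-27T12:00:00Z",
+  "response_status_code": 201,
+  "idempotency_replayed": false,
+  "last_error": null
+}
+```
+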
### API key From dbffca6ba41b17d31840fbccf6be7b5e32d6f5d5 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 5 May 2026 17:02:05 +0300 Subject: [PATCH 09/12] Further updates - NEB, GSM --- ARC.py | 248 +- arc/job/adapter.py | 89 + arc/job/adapter_test.py | 70 + arc/job/adapters/ts/orca_neb.py | 7 +- arc/job/adapters/ts/xtb_gsm.py | 7 + arc/job/adapters/ts/xtbgsm_test.py | 57 + arc/job/ssh.py | 19 +- arc/output.py | 910 ++++- arc/output_test.py | 1237 ++++++- arc/parser/adapter.py | 16 + arc/parser/adapters/gaussian.py | 487 ++- arc/parser/adapters/orca.py | 121 +- arc/parser/constraints_test.py | 247 ++ arc/parser/parser.py | 86 +- arc/parser/parser_test.py | 44 + arc/scheduler.py | 86 +- arc/scheduler_test.py | 58 + arc/scripts/get_species_corrections.py | 239 ++ arc/scripts/save_arkane_thermo.py | 53 +- arc/scripts_test.py | 21 +- arc/settings/settings.py | 2 +- arc/settings/submit.py | 69 +- arc/settings/submit_test.py | 26 +- arc/species/species.py | 13 +- arc/statmech/arkane.py | 12 +- arc/tckdb/adapter.py | 3147 ++++++++++++++++- arc/tckdb/adapter_test.py | 4386 +++++++++++++++++++++++- arc/tckdb/cli.py | 175 + arc/tckdb/cli_test.py | 258 ++ arc/tckdb/config.py | 178 +- arc/tckdb/config_test.py | 269 ++ arc/tckdb/constraints.py | 195 ++ arc/tckdb/payload_writer.py | 1 + arc/tckdb/sweep.py | 316 ++ docs/output_yml_schema.md | 16 +- 35 files changed, 12626 insertions(+), 539 deletions(-) create mode 100644 arc/parser/constraints_test.py create mode 100644 arc/scripts/get_species_corrections.py create mode 100644 arc/tckdb/cli.py create mode 100644 arc/tckdb/cli_test.py create mode 100644 arc/tckdb/constraints.py create mode 100644 arc/tckdb/sweep.py diff --git a/ARC.py b/ARC.py index 5b4b21901b..bba6da4c2c 100644 --- a/ARC.py +++ b/ARC.py @@ -11,7 +11,8 @@ from arc.common import read_yaml_file from arc.main import ARC -from arc.tckdb.config import TCKDBConfig, UPLOAD_MODE_COMPUTED_SPECIES +from arc.tckdb.config import TCKDBConfig +from arc.tckdb.sweep import run_upload_sweep def parse_command_line_arguments(command_line_args=None): @@ -78,250 +79,15 @@ def main(): if tckdb_config is not None: from arc.tckdb.adapter import TCKDBAdapter adapter = TCKDBAdapter(tckdb_config, project_directory=arc_object.project_directory) - _run_tckdb_upload_sweep(arc_object, adapter, tckdb_config) + run_upload_sweep( + adapter=adapter, + project_directory=arc_object.project_directory, + tckdb_config=tckdb_config, + ) finally: from arc.job.ssh_pool import reset_default_pool reset_default_pool() -def _run_tckdb_upload_sweep(arc_object, adapter, tckdb_config): - """End-of-run sweep: build/write/upload one TCKDB payload per converged species. - - Reads ``/output/output.yml`` (the consolidated run summary - from ``arc/output.py``) and dispatches per ``tckdb_config.upload_mode``: - - - ``"conformer"`` (default): one ``/uploads/conformers`` POST per - species, followed by per-artifact POSTs to - ``/calculations/{id}/artifacts`` for each configured kind. - - ``"computed_species"``: one ``/uploads/computed-species`` bundle - POST per species, with artifacts inlined under each calc; no - separate artifact sweep. - - Both paths share the same per-species iteration, error handling, - and summary print shape. TS records are deferred regardless of mode. - """ - output_path = os.path.join(arc_object.project_directory, 'output', 'output.yml') - if not os.path.exists(output_path): - # Most common cause: the run was interrupted before - # write_output_yml ran. 
Skip cleanly rather than scrape live - # objects — the replay path expects output.yml as the contract. - print(f'TCKDB upload skipped: {output_path} not found (run did not complete?)') - return - - output_doc = read_yaml_file(path=output_path) - species_records = list(output_doc.get('species') or []) - ts_records = list(output_doc.get('transition_states') or []) - # Both modes cover minima only; TS records are deferred to a future - # TS-specific adapter method targeting /uploads/transition-states - # (different schema, no SMILES requirement). - n_ts_deferred = sum(1 for r in ts_records if r.get('converged')) - - is_bundle_mode = tckdb_config.upload_mode == UPLOAD_MODE_COMPUTED_SPECIES - - counts = {'uploaded': 0, 'skipped': 0, 'failed': 0} - artifact_counts = {'uploaded': 0, 'skipped': 0, 'failed': 0} - failures = [] - artifact_failures = [] - n_attempted = 0 - for record in species_records: - label = record.get('label') or '' - if not record.get('converged'): - continue - n_attempted += 1 - try: - if is_bundle_mode: - # Single bundle carries species_entry + conformer + - # opt/freq/sp + (optional) thermo + inlined artifacts. - outcome = adapter.submit_computed_species_from_output( - output_doc=output_doc, species_record=record, - ) - else: - outcome = adapter.submit_from_output( - output_doc=output_doc, species_record=record, - ) - except Exception as exc: - counts['failed'] += 1 - failures.append((label, f'{type(exc).__name__}: {exc}')) - continue - if outcome is None: - continue - counts[outcome.status] = counts.get(outcome.status, 0) + 1 - if outcome.status == 'failed': - failures.append((label, outcome.error or 'unknown error')) - elif ( - outcome.status == 'uploaded' - and not is_bundle_mode - and tckdb_config.artifacts.upload - ): - # Artifact sweep is conformer-mode only — the bundle path - # carries artifacts inline under each calc. - _sweep_artifacts_for_species( - adapter=adapter, - arc_object=arc_object, - output_doc=output_doc, - species_record=record, - outcome=outcome, - counts=artifact_counts, - failures=artifact_failures, - kinds=_implementable_kinds_from_config(tckdb_config), - ) - - mode_label = 'computed-species bundle' if is_bundle_mode else 'conformer/calculation' - print(f'TCKDB v0 ({mode_label}, {n_attempted} converged species):') - print(f' uploaded: {counts["uploaded"]} skipped: {counts["skipped"]} failed: {counts["failed"]}') - if not is_bundle_mode and tckdb_config.artifacts.upload: - # Bundle mode rolls artifacts into the same upload, so a - # standalone artifact summary line would be misleading. - print( - f' artifacts: uploaded {artifact_counts["uploaded"]} ' - f'skipped {artifact_counts["skipped"]} failed {artifact_counts["failed"]}' - ) - if n_ts_deferred: - print(f' ({n_ts_deferred} converged TS deferred — TS-specific adapter not yet implemented)') - for label, err in failures: - print(f' failed: {label} — {err}') - for label, kind, err in artifact_failures: - print(f' failed artifact: {label} ({kind}) — {err}') - - -_CALC_TYPE_TO_LOG_KEY = { - 'opt': 'opt_log', - 'freq': 'freq_log', - 'sp': 'sp_log', -} - -# Companion mapping for input-deck paths, emitted by ``arc/output.py`` -# alongside the log paths. Per-job, with per-job software → per-job -# filename, and only set when the deck file is on disk. -_CALC_TYPE_TO_INPUT_KEY = { - 'opt': 'opt_input', - 'freq': 'freq_input', - 'sp': 'sp_input', -} - - -def _implementable_kinds_from_config(tckdb_config): - """Intersect user-configured kinds with ARC's IMPLEMENTED_ARTIFACT_KINDS. 
- - The config-parse step warns about valid-but-not-implemented kinds; - this filter is the runtime side of the same gate, so the sweep - silently skips them rather than calling the adapter (which would - skip with a defensive log message anyway). - """ - from arc.tckdb.config import IMPLEMENTED_ARTIFACT_KINDS - return tuple(k for k in tckdb_config.artifacts.kinds if k in IMPLEMENTED_ARTIFACT_KINDS) - - -def _resolve_artifact_path(*, kind, calc_type, species_record, output_doc): - """Resolve the local file path to upload for a (kind, calc_type) pair. - - Returns ``None`` if there's nothing to upload for this combination - (e.g. unsupported calc type, file not on disk, engine unknown). - - For ``output_log``, the path is keyed off the species_record's - log fields (``opt_log`` / ``freq_log`` / ``sp_log``). - - For ``input``, the input deck (``input.gjf``, ``ZMAT``, ``input.in``, - etc.) is always written as a sibling of the output log, so we - derive its name from ``arc.imports.settings['input_filenames']`` - keyed on the engine in ``output_doc['opt_level']['software']``. - """ - log_key = _CALC_TYPE_TO_LOG_KEY.get(str(calc_type).lower()) - if log_key is None: - return None - log_path = species_record.get(log_key) - if not log_path: - return None - if kind == 'output_log': - return log_path - if kind == 'input': - # Prefer the path emitted directly by ``arc/output.py``: it's - # per-job (so a Gaussian opt + Molpro sp run picks the right - # deck per calc), and existence on disk has already been - # verified at output-write time. - input_field = _CALC_TYPE_TO_INPUT_KEY.get(str(calc_type).lower()) - if input_field: - recorded = species_record.get(input_field) - if recorded: - return recorded - # Back-compat: older output.yml files predating the - # ``_input`` schema extension. Derive from the opt-level - # software via settings['input_filenames']. Same logic as before - # — kept so old runs can still upload input decks via the - # primitive endpoint. - from arc.imports import settings as _arc_settings - opt_level = output_doc.get('opt_level') or {} - engine = (opt_level.get('software') or '').lower() if isinstance(opt_level, dict) else '' - input_filenames = _arc_settings.get('input_filenames', {}) - input_name = input_filenames.get(engine) - if not input_name: - return None - return os.path.join(os.path.dirname(log_path), input_name) - return None - - -def _sweep_artifacts_for_species( - *, - adapter, - arc_object, - output_doc, - species_record, - outcome, - counts, - failures, - kinds, -): - """For one converged species' conformer upload, push artifacts of each kind to each calc. - - Iterates the calc refs returned by the conformer upload (primary + - additional) and, for each, iterates the configured kinds. Resolves - the right local file path per (kind, calc_type) and dispatches to - ``adapter.submit_artifacts_for_calculation``. Updates ``counts`` and - ``failures`` in place. - """ - label = species_record.get('label') or '' - refs = [] - if outcome.primary_calculation: - refs.append(outcome.primary_calculation) - refs.extend(outcome.additional_calculations or []) - if not refs: - # Older server response without calc refs — skip artifact upload - # for this species rather than guess at IDs. 
- return - for ref in refs: - calc_id = ref.get('calculation_id') - calc_type = ref.get('type') - if calc_id is None or calc_type is None: - continue - for kind in kinds: - file_path = _resolve_artifact_path( - kind=kind, - calc_type=calc_type, - species_record=species_record, - output_doc=output_doc, - ) - if file_path is None: - counts['skipped'] = counts.get('skipped', 0) + 1 - continue - try: - art_outcome = adapter.submit_artifacts_for_calculation( - output_doc=output_doc, - species_record=species_record, - calculation_id=int(calc_id), - calculation_type=str(calc_type), - file_path=file_path, - kind=kind, - ) - except Exception as exc: - counts['failed'] = counts.get('failed', 0) + 1 - failures.append((label, kind, f'{type(exc).__name__}: {exc}')) - continue - if art_outcome is None: - continue - counts[art_outcome.status] = counts.get(art_outcome.status, 0) + 1 - if art_outcome.status == 'failed': - failures.append((label, art_outcome.kind, art_outcome.error or 'unknown error')) - - if __name__ == '__main__': main() diff --git a/arc/job/adapter.py b/arc/job/adapter.py index 1198fd1fef..9ddb4bb366 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -372,6 +372,21 @@ def write_submit_script(self) -> None: if default_queue and default_queue not in self.attempted_queues: self.attempted_queues.append(default_queue) + # Validate that submit_scripts has an entry for this (server, software) + # combination AND that the entry's directives match the cluster_soft + # configured on this server. Mismatches are a real-world footgun: the + # repo's example templates were authored for specific machines (Slurm + # at MIT Supercloud, etc.) and will silently produce a Slurm script + # under an HTCondor filename when the dict key happens to be 'local' + # but the user's local server is HTCondor-based. We fail loud here + # so users get a clear pointer to ~/.arc/submit.py instead of a + # cryptic scheduler rejection later. + _validate_submit_script_for_server( + server=self.server, + software=self.job_adapter, + cluster_soft=servers[self.server]['cluster_soft'], + ) + submit_script = submit_scripts[self.server][self.job_adapter] queue = self.queue if self.queue is not None else default_queue @@ -1130,3 +1145,77 @@ def save_output_file(self, if key is not None: content[key] = val save_yaml_file(path=yml_out_path, content=content) + + +# Marker → cluster_soft expectations. The mapping is one-way (a marker +# implies a cluster_soft) so we can detect the wrong-template-for-this- +# cluster footgun without needing to enumerate every directive each +# scheduler accepts. +_SUBMIT_SCRIPT_MARKERS: tuple[tuple[str, str, str], ...] = ( + ("#SBATCH", "Slurm", "Slurm directive"), + ("#PBS", "PBS", "PBS directive"), + ("#$ -", "OGE", "OGE/SGE directive"), + ("Universe", "HTCondor", "HTCondor classad"), +) + + +def _validate_submit_script_for_server( + server: str, software: str, cluster_soft: str +) -> None: + """Fail-fast validation for ``submit_scripts[server][software]``. + + Two checks: + + 1. **Existence.** If no template is registered for the + ``(server, software)`` pair, raise ``JobError`` with a directory + of available templates so the user can see exactly what's + missing. Without this guard, callers used to hit a confusing + ``KeyError`` on ``submit_scripts[server][software]`` deep in the + formatting code, with the Python traceback pointing at the + dictionary lookup rather than at the configuration mistake. + + 2. 
**Cluster-software match.** Each scheduler's directives have a + distinctive prefix (``#SBATCH``, ``#PBS``, ``#$ -``, ``Universe`` + for HTCondor). If the template's directives don't match the + ``cluster_soft`` declared on the server, raise: producing a + Slurm script for a PBS cluster (or vice versa) would either be + rejected by the scheduler or — worse — silently misinterpreted. + This catches the real-world incident where the repo's example + template ``submit_scripts['local']['orca']`` ships Slurm syntax + under a key whose name suggests a generic local server. + + The validator is intentionally narrow — it only checks the *first* + matching marker per script, which is sufficient to detect the + cluster_soft mismatch without trying to validate every directive + a scheduler accepts. + """ + server_scripts = submit_scripts.get(server) + if server_scripts is None: + available = sorted(submit_scripts.keys()) + raise JobError( + f"submit_scripts has no entry for server {server!r} " + f"(available: {available}). Add a template for {server!r} " + f"to ~/.arc/submit.py, or point the ESS at a server that " + f"already has one." + ) + if software not in server_scripts: + available = sorted(server_scripts.keys()) + raise JobError( + f"submit_scripts[{server!r}] has no entry for software " + f"{software!r} (available: {available}). Add a template " + f"under submit_scripts[{server!r}][{software!r}] in " + f"~/.arc/submit.py." + ) + template = server_scripts[software] + for marker, expected_cluster_soft, description in _SUBMIT_SCRIPT_MARKERS: + if marker in template: + if expected_cluster_soft != cluster_soft: + raise JobError( + f"submit_scripts[{server!r}][{software!r}] uses " + f"{description} ({marker!r}) which implies " + f"cluster_soft={expected_cluster_soft!r}, but " + f"servers[{server!r}]['cluster_soft']={cluster_soft!r}. " + f"Edit ~/.arc/submit.py to use a template matching " + f"the server's actual cluster software." + ) + break diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index 4cfef9afc7..86d830da5b 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ -779,5 +779,75 @@ def test_set_default_pool_replaces_instance(self): self.assertIs(self._pool_module.get_default_pool(), replacement) +class TestSubmitScriptValidator(unittest.TestCase): + """ + Standalone tests for ``_validate_submit_script_for_server`` — kept + in its own class so the validator's coverage doesn't depend on + ``TestJobAdapter.setUpClass``, which builds full job adapters that + require server names (e.g. ``server3``) declared in the user's + ``~/.arc/settings.py``. Bare validator tests should always run. 
+ """ + + def test_missing_server_raises(self): + from arc.exceptions import JobError + from arc.job.adapter import _validate_submit_script_for_server + with self.assertRaises(JobError) as ctx: + _validate_submit_script_for_server( + server='nonexistent_host', software='gaussian', cluster_soft='Slurm', + ) + self.assertIn("nonexistent_host", str(ctx.exception)) + self.assertIn("available", str(ctx.exception)) + + def test_missing_software_raises(self): + from arc.exceptions import JobError + from arc.job.adapter import _validate_submit_script_for_server + with self.assertRaises(JobError) as ctx: + _validate_submit_script_for_server( + server='EXAMPLE_pbs', software='nonexistent_ess', cluster_soft='PBS', + ) + self.assertIn("nonexistent_ess", str(ctx.exception)) + + def test_cluster_soft_mismatch_raises(self): + # The user's actual incident: 'local' (HTCondor) lookup hits + # the EXAMPLE_slurm_supercloud Slurm template via the back-compat + # alias, which would silently produce a Slurm script under + # HTCondor's submit.sub filename. Validator must catch this. + from arc.exceptions import JobError + from arc.job.adapter import _validate_submit_script_for_server + with self.assertRaises(JobError) as ctx: + _validate_submit_script_for_server( + server='EXAMPLE_slurm_supercloud', software='gaussian', + cluster_soft='HTCondor', + ) + self.assertIn("Slurm", str(ctx.exception)) + self.assertIn("HTCondor", str(ctx.exception)) + self.assertIn("submit.py", str(ctx.exception)) + + def test_matching_cluster_soft_passes_silently(self): + # Slurm template + cluster_soft='Slurm' is consistent. + from arc.job.adapter import _validate_submit_script_for_server + _validate_submit_script_for_server( + server='EXAMPLE_slurm_supercloud', software='gaussian', + cluster_soft='Slurm', + ) + # PBS template + cluster_soft='PBS' is consistent. + _validate_submit_script_for_server( + server='EXAMPLE_pbs', software='gaussian', cluster_soft='PBS', + ) + + def test_back_compat_aliases_resolve(self): + # The 'local'/'pbs_sample'/'server3' back-compat aliases must + # validate the same as their EXAMPLE_* canonical keys. 
+ from arc.job.adapter import _validate_submit_script_for_server + # 'local' → EXAMPLE_slurm_supercloud (Slurm) + _validate_submit_script_for_server( + server='local', software='gaussian', cluster_soft='Slurm', + ) + # 'pbs_sample' → EXAMPLE_pbs (PBS) + _validate_submit_script_for_server( + server='pbs_sample', software='gaussian', cluster_soft='PBS', + ) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/adapters/ts/orca_neb.py b/arc/job/adapters/ts/orca_neb.py index 0647fd3169..a51c20f46f 100644 --- a/arc/job/adapters/ts/orca_neb.py +++ b/arc/job/adapters/ts/orca_neb.py @@ -39,15 +39,15 @@ %%maxcore ${memory} %%pal nprocs ${cpus} end -%%neb +%%neb Interpolation ${interpolation} NImages ${nnodes} PrintLevel 3 PreOpt ${preopt} - NEB_END_XYZFILE "${abs_path}/product.xyz" + NEB_END_XYZFILE "product.xyz" END -* XYZFILE ${charge} ${multiplicity} ${abs_path}/reactant.xyz +* XYZFILE ${charge} ${multiplicity} reactant.xyz """ @@ -222,7 +222,6 @@ def write_input_file(self) -> None: input_dict['cpus'] = self.cpu_cores input_dict['charge'] = self.charge input_dict['multiplicity'] = self.multiplicity - input_dict['abs_path'] = self.local_path # NEB specific parameters neb_settings = orca_neb_settings.get('keyword', {}) diff --git a/arc/job/adapters/ts/xtb_gsm.py b/arc/job/adapters/ts/xtb_gsm.py index bcf7d73bda..baa06819cf 100644 --- a/arc/job/adapters/ts/xtb_gsm.py +++ b/arc/job/adapters/ts/xtb_gsm.py @@ -393,6 +393,13 @@ def process_run(self): tsg.initial_xyz = traj[int((len(traj) - 1) / 2) + 1] tsg.execution_time = self.final_time - self.initial_time tsg.success = True + # Provenance for the TCKDB path_search adapter: the GSM + # stringfile is the result-bearing artifact of a successful + # GSM run (the ESS log/string is what the consumer needs to + # anchor a parent calc). The scheduler reads this attribute + # to populate ``output[label]['paths']['gsm']`` (separate + # from ``paths['neb']`` — distinct method, distinct slot). + tsg.log_path = self.stringfile_path self.reactions[0].ts_species.ts_guesses.append(tsg) def cleanup_files(self): diff --git a/arc/job/adapters/ts/xtbgsm_test.py b/arc/job/adapters/ts/xtbgsm_test.py index ed6a3373fc..e70364ceeb 100644 --- a/arc/job/adapters/ts/xtbgsm_test.py +++ b/arc/job/adapters/ts/xtbgsm_test.py @@ -5,6 +5,7 @@ This module contains unit tests of the arc.job.adapters.ts.xtb_gsm module """ +import datetime import os import shutil import unittest @@ -133,6 +134,62 @@ def test_execute_incore(self): traj = parse_trajectory(self.job_2.stringfile_path) self.assertEqual(len(traj), 9) + def test_process_run_records_log_path_on_success(self): + """``process_run`` records the GSM stringfile as ``tsg.log_path`` + on success, so the scheduler can route it to ``paths['gsm']`` and + the TCKDB adapter can emit a ``path_search`` parent calc with + ``method=gsm``. The test side-steps a real DE-GSM run by reusing + the stringfile produced by ``test_execute_incore`` (or any prior + execution); if that fixture isn't on disk the test is skipped + rather than reimplemented as a unit test against a stub. + """ + if not os.path.isfile(self.job_2.stringfile_path): + # First-time runs produce the file via test_execute_incore. + self.job_2.execute() + # Reset the TS species' guess list so we can assert on the new entry. 
+ self.job_2.reactions[0].ts_species.ts_guesses = [] + self.job_2.initial_time = datetime.datetime.now() + self.job_2.final_time = datetime.datetime.now() + self.job_2.process_run() + guesses = self.job_2.reactions[0].ts_species.ts_guesses + self.assertEqual(len(guesses), 1) + tsg = guesses[0] + self.assertTrue(tsg.success) + self.assertEqual(tsg.method, 'xTB-GSM') + self.assertEqual(tsg.log_path, self.job_2.stringfile_path) + + def test_process_run_no_stringfile_no_log_path(self): + """When the GSM stringfile isn't on disk (failed run), the + adapter must not invent provenance: ``tsg.log_path`` stays unset + / falsy, the gate downstream stays closed, and no path_search + parent calc is emitted by the TCKDB adapter. + """ + # Build a fresh job in a temp dir guaranteed not to contain a + # stringfile, so process_run's success branch is skipped. + tmp_proj = os.path.join(ARC_TESTING_PATH, 'test_xTBAGSMdapter_no_stringfile') + shutil.rmtree(tmp_proj, ignore_errors=True) + job = xTBGSMAdapter(execution_type='incore', + project='test_no_stringfile', + job_type='tsg', + project_directory=tmp_proj, + reactions=[ARCReaction( + r_species=[ARCSpecies(label='HNO', smiles='N=O')], + p_species=[ARCSpecies(label='HON', smiles='[N-]=[OH+]')])], + ) + job.reactions[0].ts_species = ARCSpecies(label='TS_no_string', is_ts=True) + # Sanity: stringfile must NOT exist for this test to be meaningful. + self.assertFalse(os.path.isfile(job.stringfile_path)) + job.initial_time = datetime.datetime.now() + job.final_time = datetime.datetime.now() + job.process_run() + guesses = job.reactions[0].ts_species.ts_guesses + self.assertEqual(len(guesses), 1) + tsg = guesses[0] + self.assertFalse(tsg.success) + # Default TSGuess.log_path is None; we never set it on failure. + self.assertFalse(bool(getattr(tsg, 'log_path', None))) + shutil.rmtree(tmp_proj, ignore_errors=True) + @classmethod def tearDownClass(cls): """ diff --git a/arc/job/ssh.py b/arc/job/ssh.py index c21cf9f3f0..460abf1ce1 100644 --- a/arc/job/ssh.py +++ b/arc/job/ssh.py @@ -183,13 +183,18 @@ def download_file(self, Raises: ServerError: If the file cannot be downloaded with maximum times to try """ - if not self._check_file_exists(remote_file_path): - # Check if a file exists - # This doesn't have a real impact now to avoid screwing up ESS trsh - # but introduce an opportunity for better troubleshooting. - # The current behavior is that if the remote path does not exist - # an empty file will be created at the local path - logger.debug(f'{remote_file_path} does not exist on {self.server}.') + # PBS/SGE epilogues sometimes flush stdout/stderr to the work dir a + # second or two after qstat reports the job has left the queue. Briefly + # retry the existence check so we don't loud-warn on that race. 
+ for attempt in range(3): + if self._check_file_exists(remote_file_path): + break + if attempt < 2: + time.sleep(1.0) + else: + logger.debug(f'{remote_file_path} does not exist on {self.server}; ' + f'skipping download.') + return try: self._sftp.get(remotepath=remote_file_path, localpath=local_file_path) diff --git a/arc/output.py b/arc/output.py index c3df70d2f3..1442b7c1c8 100644 --- a/arc/output.py +++ b/arc/output.py @@ -18,10 +18,21 @@ from arc.constants import E_h_kJmol from arc.imports import settings from arc.job.local import execute_command -from arc.parser.parser import parse_1d_scan_energies, parse_e_elect, parse_ess_version, parse_opt_steps, parse_zpe_correction +from arc.parser.parser import ( + parse_1d_scan_energies, + parse_1d_scan_full_result, + parse_e_elect, + parse_ess_version, + parse_geometry, + parse_opt_steps, + parse_scan_args, + parse_zpe_correction, +) from arc.species.converter import xyz_to_str +from arc.species.vectors import calculate_dihedral_angle from arc.statmech.arkane import ( AEC_SECTION_START, AEC_SECTION_END, + ARKANE_TUNNELING_METHOD, MBAC_SECTION_START, MBAC_SECTION_END, PBAC_SECTION_START, PBAC_SECTION_END, find_best_across_files, get_qm_corrections_files, @@ -119,11 +130,18 @@ def write_output_yml( # ---- species and TSs -------------------------------------------------------- point_groups = _compute_point_groups(species_dict, project_directory) + species_corrections = _compute_species_corrections( + species_dict, arkane_level_of_theory, bac_type, project_directory, + ) doc['species'] = [] doc['transition_states'] = [] for spc in species_dict.values(): d = _spc_to_dict(spc, output_dict, project_directory, point_groups, irc_requested=irc_requested, software_by_job=software_by_job) + d['applied_energy_corrections'] = _build_applied_corrections_for_species( + spc.label, species_corrections, arkane_level_of_theory, bac_type, + aec_table=aec, bac_table=bac, + ) if spc.is_ts: doc['transition_states'].append(d) else: @@ -242,24 +260,106 @@ def _parse_zpe(freq_path: str | None, project_directory: str) -> float | None: def _parse_opt_log(geo_path: str | None, project_directory: str) -> tuple: """ - Parse opt_n_steps and opt_final_energy_hartree from the geometry opt log. - - Returns: - (opt_n_steps, opt_final_energy_hartree) — either may be None. + Parse n_steps, final electronic energy, and final geometry from an opt log. + + Returns a 3-tuple ``(n_steps, final_energy_hartree, final_xyz_str)``; + any element may be ``None`` if that piece couldn't be extracted (the + others are still attempted independently). ``final_xyz_str`` is in + the same atom-only format as ``xyz_to_str`` produces — symbol + + coords, one atom per line, no count header. + + The geometry is parsed via :func:`parse_geometry`, which dispatches + to per-ESS adapters; logs from any supported ESS work without + branching here. Used for both fine and coarse opt logs — the coarse + geometry surfaces as ``coarse_opt_output_xyz`` in output.yml so the + TCKDB bundle can chain ``opt_coarse → opt`` with the right geometry + on each side. """ if not geo_path: - return None, None + return None, None, None if not os.path.isabs(geo_path): geo_path = os.path.join(project_directory, geo_path) if not os.path.isfile(geo_path): - return None, None + return None, None, None + + # Each parse is best-effort and independent — if one fails we still + # surface the others. 
The final-energy parse historically failed-fast + # for the whole tuple, but that's not the right shape now that the + # geometry parse is also in the mix. + n_steps = None + e_hartree = None + final_xyz = None try: n_steps = parse_opt_steps(geo_path) - e_elect_kj = parse_e_elect(geo_path) # returns kJ/mol - e_elect_hartree = e_elect_kj / E_h_kJmol if e_elect_kj is not None else None - return n_steps, e_elect_hartree except Exception: - return None, None + logger.debug("Could not parse n_steps from %s", geo_path, exc_info=True) + try: + e_kj = parse_e_elect(geo_path) + e_hartree = e_kj / E_h_kJmol if e_kj is not None else None + except Exception: + logger.debug("Could not parse final energy from %s", geo_path, exc_info=True) + try: + xyz_dict = parse_geometry(geo_path) + if xyz_dict is not None: + final_xyz = xyz_to_str(xyz_dict) + except Exception: + logger.debug("Could not parse final geometry from %s", geo_path, exc_info=True) + return n_steps, e_hartree, final_xyz + + +def _parse_calc_constraints( + input_rel_path: str | None, + log_path: str | None, + software: str | None, + project_directory: str, +) -> list[dict]: + """Best-effort parse of held-fixed constraints for one calculation. + + Prefers the ESS input deck (``input_rel_path``) over the log because + the deck holds the exact ModRedundant / ``%geom Constraints`` block + ARC emitted; the log echoes it but adds parser surface area. Returns + an empty list when no deck/log is available, the software is + unsupported, or the file can't be parsed. + + Never raises: any failure inside the parser is logged as a warning by + the parser itself, and we shrug it off here so output.yml generation + stays robust to malformed decks. + """ + if not software: + return [] + sw = str(software).lower() + + candidates: list[str] = [] + if input_rel_path: + abs_input = os.path.join(project_directory, input_rel_path) \ + if not os.path.isabs(input_rel_path) else input_rel_path + if os.path.isfile(abs_input): + candidates.append(abs_input) + if log_path and os.path.isfile(log_path): + candidates.append(log_path) + + if not candidates: + return [] + + try: + if sw == 'gaussian': + from arc.parser.adapters.gaussian import parse_gaussian_constraints + for path in candidates: + parsed = parse_gaussian_constraints(path) + if parsed: + return parsed + return [] + if sw == 'orca': + from arc.parser.adapters.orca import parse_orca_constraints + for path in candidates: + parsed = parse_orca_constraints(path) + if parsed: + return parsed + return [] + except Exception as exc: + logger.warning("Constraint extraction failed for %s (software=%s): %s", + candidates[0], sw, exc) + return [] def _input_filename_for(software: str | None) -> str | None: @@ -413,6 +513,243 @@ def _safe(fn, default=None): return default +_BAC_KIND_BY_TYPE = {'p': 'bac_petersson', 'm': 'bac_melius'} + + +def _compute_species_corrections( + species_dict: dict, + arkane_level_of_theory, + bac_type: str | None, + project_directory: str, +) -> dict[str, dict]: + """Compute per-species AEC/BAC totals + components by delegating to + Arkane's correction functions through ``arc/scripts/get_species_corrections.py`` + in the RMG conda environment. + + Returns a dict ``{label: {'aec': {...}, 'bac': {...}}}``. Species whose + inputs are insufficient (no xyz, no bonds, etc.) are omitted; per-species + Arkane errors land as ``aec_error``/``bac_error`` keys and are dropped + silently when the result is consumed (the correction is omitted for that + species). 
Returns ``{}`` on any whole-batch failure so that downstream + output.yml writing proceeds without corrections rather than aborting. + """ + if arkane_level_of_theory is None: + return {} + from arc.common import NUMBER_BY_SYMBOL + + # Resolve the same matched LevelOfTheory key string that + # _get_energy_corrections uses, via fuzzy match against the RMG QM + # corrections data files. This is what the rmg_env script's regex + # parser knows how to reconstruct. + try: + qm_corr_files = get_qm_corrections_files() + lot_str = find_best_across_files( + arkane_level_of_theory, qm_corr_files, + AEC_SECTION_START, AEC_SECTION_END, + ) + except Exception: + lot_str = None + if not lot_str: + return {} + + species_inputs: list[dict] = [] + for label, spc in species_dict.items(): + xyz = spc.final_xyz if spc.final_xyz is not None else spc.initial_xyz + if xyz is None: + continue + symbols = list(xyz.get('symbols') or []) + coords = [list(row) for row in (xyz.get('coords') or [])] + if not symbols or not coords: + continue + nums = [NUMBER_BY_SYMBOL.get(s) for s in symbols] + if any(n is None for n in nums): + continue + atoms: dict[str, int] = {} + for s in symbols: + atoms[s] = atoms.get(s, 0) + 1 + bonds = dict(getattr(spc, 'bond_corrections', None) or {}) + species_inputs.append({ + 'label': label, + 'atoms': atoms, + 'bonds': bonds, + 'coords': coords, + 'nums': nums, + 'multiplicity': int(getattr(spc, 'multiplicity', None) or 1), + }) + + if not species_inputs: + return {} + + rmg_env = settings.get('RMG_ENV_NAME', 'rmg_env') + script_path = os.path.join(ARC_PATH, 'arc', 'scripts', 'get_species_corrections.py') + + tmp_dir = os.path.join(project_directory, 'output') + os.makedirs(tmp_dir, exist_ok=True) + fd_in, tmp_in = tempfile.mkstemp(dir=tmp_dir, suffix='.spc_corr_input.yml') + fd_out, tmp_out = tempfile.mkstemp(dir=tmp_dir, suffix='.spc_corr_output.yml') + try: + os.close(fd_in) + os.close(fd_out) + save_yaml_file(path=tmp_in, content={ + 'level_of_theory': lot_str, + 'bac_type': bac_type, + 'species': species_inputs, + }) + + commands = [ + 'bash -lc "set -euo pipefail; ' + 'if command -v micromamba >/dev/null 2>&1; then ' + f' micromamba run -n {rmg_env} python {script_path} {tmp_in} {tmp_out}; ' + 'elif command -v conda >/dev/null 2>&1; then ' + f' conda run -n {rmg_env} python {script_path} {tmp_in} {tmp_out}; ' + 'elif command -v mamba >/dev/null 2>&1; then ' + f' mamba run -n {rmg_env} python {script_path} {tmp_in} {tmp_out}; ' + 'else ' + ' echo \'micromamba/conda/mamba required\' >&2; exit 1; ' + 'fi"', + ] + _, stderr = execute_command(command=commands, executable='/bin/bash') + if stderr: + logger.warning(f'get_species_corrections.py stderr: {stderr}') + + result = read_yaml_file(tmp_out) or {} + out: dict[str, dict] = {} + for entry in (result.get('species') or []): + label = entry.get('label') + if label is None: + continue + out[label] = {k: v for k, v in entry.items() if k != 'label'} + return out + except Exception as e: + logger.warning(f'Per-species correction computation failed: {e}') + return {} + finally: + for p in (tmp_in, tmp_out): + try: + os.unlink(p) + except OSError: + logger.debug(f'Failed to remove temporary file {p!r}', exc_info=True) + + +def _aec_atom_params(aec_table: dict | None) -> list[dict]: + """Translate ARC's run-level ``atom_energy_corrections`` mapping into + TCKDB ``SchemeAtomParamPayload`` shape, sorted by element for + deterministic output.yml. 
Returns ``[]`` when the table is empty or + missing — TCKDB then has no scheme params to persist, but the applied + correction row still lands.""" + if not aec_table: + return [] + return [ + {'element': str(elem), 'value': float(value)} + for elem, value in sorted(aec_table.items()) + ] + + +def _pbac_bond_params(bac_table: dict | None) -> list[dict]: + """Translate ARC's run-level ``bond_additivity_corrections`` mapping + into TCKDB ``SchemeBondParamPayload`` shape, sorted by bond_key for + deterministic output.yml.""" + if not bac_table: + return [] + return [ + {'bond_key': str(key), 'value': float(value)} + for key, value in sorted(bac_table.items()) + ] + + +def _build_applied_corrections_for_species( + label: str, + species_corrections: dict[str, dict], + arkane_level_of_theory, + bac_type: str | None, + *, + aec_table: dict | None = None, + bac_table: dict | None = None, +) -> list[dict]: + """Build the per-species ``applied_energy_corrections`` list for output.yml. + + Translates the rmg_env script's per-species totals + components into the + output.yml shape (mirroring TCKDB's ``AppliedEnergyCorrectionUploadPayload`` + contract): each entry has ``application_role``, ``value``, ``value_unit``, + ``scheme``, and (optional) ``components``. Failure rows from the script + (``aec_error``/``bac_error``) are silently dropped — that species simply + has the failing role omitted. + + ``aec_table`` and ``bac_table`` are the run-level parameter dicts ARC + already retrieves from the RMG database (see ``_get_energy_corrections``). + Attached to each scheme as ``atom_params`` / ``bond_params`` so TCKDB + populates the ``energy_correction_scheme_atom_param`` / + ``energy_correction_scheme_bond_param`` reference tables — without + them the applied correction lands but the scheme rows that link to + parameter values stay empty. mBAC schemes never carry params: the + Melius parameter table doesn't fit ``SchemeBondParamPayload``'s + bond-key shape (it's atom-pair indexed and includes mol-level + corrections), and there's no safe coercion at the producer. + """ + entry = species_corrections.get(label) or {} + applied: list[dict] = [] + lot_dict = _level_to_dict(arkane_level_of_theory) + + aec_block = entry.get('aec') + if aec_block and aec_block.get('value') is not None: + scheme: dict = { + 'kind': 'atom_energy', + 'name': 'atom_energy', + 'level_of_theory': lot_dict, + 'units': aec_block.get('value_unit', 'hartree'), + 'version': None, + 'source_literature': None, + 'note': 'Per-species AEC computed by Arkane.', + } + atom_params = _aec_atom_params(aec_table) + if atom_params: + scheme['atom_params'] = atom_params + applied.append({ + 'application_role': 'aec_total', + 'value': float(aec_block['value']), + 'value_unit': aec_block.get('value_unit', 'hartree'), + 'scheme': scheme, + 'components': aec_block.get('components') or [], + }) + + bac_block = entry.get('bac') + if bac_block and bac_block.get('value') is not None and bac_type in _BAC_KIND_BY_TYPE: + scheme_kind = _BAC_KIND_BY_TYPE[bac_type] + components = bac_block.get('components') + # Drop components when any bond is missing a parameter — partial + # decompositions would not sum to the total and would mislead + # downstream consumers. 
+ if components and any(c.get('parameter_value') is None for c in components): + components = None + scheme = { + 'kind': scheme_kind, + 'name': scheme_kind, + 'level_of_theory': lot_dict, + 'units': bac_block.get('value_unit', 'kcal_mol'), + 'version': None, + 'source_literature': None, + 'note': f'Per-species BAC computed by Arkane (bac_type={bac_type}).', + } + # Petersson BAC has a clean (bond_key → value) parameter table; + # Melius does not — its parameters are atom-pair / length / + # neighbor / molecular and need ``component_params``, which ARC + # doesn't currently surface. Per spec we omit params for mBAC + # rather than fabricate or coerce. + if bac_type == 'p': + bond_params = _pbac_bond_params(bac_table) + if bond_params: + scheme['bond_params'] = bond_params + applied.append({ + 'application_role': 'bac_total', + 'value': float(bac_block['value']), + 'value_unit': bac_block.get('value_unit', 'kcal_mol'), + 'scheme': scheme, + 'components': components or [], + }) + + return applied + + def _compute_point_groups(species_dict: dict, project_directory: str) -> dict[str, str | None]: """ Compute point groups for all species via the ``symmetry`` binary in the RMG env. @@ -526,6 +863,10 @@ def _spc_to_dict(spc, output_dict: dict, project_directory: str, xyz = spc.final_xyz if spc.final_xyz is not None else spc.initial_xyz d['xyz'] = xyz_to_str(xyz) if xyz is not None else None + # ``opt_input_xyz`` and the coarse-opt xyz fields are populated below, + # AFTER coarse-opt parsing (we need ``coarse_final_xyz`` to set the + # fine opt's input correctly when coarse ran). + # ── is monoatomic? (drives null-vs-value for freq/statmech) ───────────── is_mono = spc.is_monoatomic() is True @@ -541,10 +882,14 @@ def _spc_to_dict(spc, output_dict: dict, project_directory: str, d['opt_converged'] = entry.get('job_types', {}).get('opt') if converged else None # ── coarse opt (null if fine grid wasn't used or job didn't run) ──────── + # When coarse ran, its parsed final geometry becomes both the coarse + # opt's output and (semantically) the fine opt's input — see the + # ``opt_input_xyz`` resolution below. coarse_path = paths.get('geo_coarse') or None + coarse_final_xyz: str | None = None if converged and coarse_path: d['coarse_opt_log'] = _make_rel_path(coarse_path, project_directory) - d['coarse_opt_n_steps'], d['coarse_opt_final_energy_hartree'] = \ + d['coarse_opt_n_steps'], d['coarse_opt_final_energy_hartree'], coarse_final_xyz = \ _parse_opt_log(coarse_path, project_directory) else: d['coarse_opt_log'] = None @@ -552,8 +897,41 @@ def _spc_to_dict(spc, output_dict: dict, project_directory: str, d['coarse_opt_final_energy_hartree'] = None # ── fine opt (or only opt if no fine grid) ───────────────────────────── - d['opt_n_steps'], d['opt_final_energy_hartree'] = _parse_opt_log( - paths.get('geo') or None, project_directory) if converged else (None, None) + # We discard the geometry here — ``xyz`` (set above) already carries + # the fine opt's final geometry via ``spc.final_xyz``, and re-parsing + # the log would just produce the same content with different rounding. 
+ if converged: + d['opt_n_steps'], d['opt_final_energy_hartree'], _ = _parse_opt_log( + paths.get('geo') or None, project_directory + ) + else: + d['opt_n_steps'], d['opt_final_energy_hartree'] = None, None + + # ── opt input/output geometry semantics ──────────────────────────────── + # When coarse ran (a real two-stage opt), the geometry chain is: + # spc.initial_xyz → coarse_opt → coarse_final_xyz → opt → xyz + # When coarse didn't run (single-stage opt), there's no intermediate + # and the chain is just: + # spc.initial_xyz → opt → xyz + # + # ``opt_input_xyz`` always means "what was submitted to the FINE opt" + # — the coarse output if coarse ran, else the species' initial xyz. + # ``coarse_opt_input_xyz`` and ``coarse_opt_output_xyz`` are non-null + # only when a coarse opt actually ran AND its log was parseable. + initial_xyz_str = xyz_to_str(spc.initial_xyz) if spc.initial_xyz is not None else None + if coarse_final_xyz is not None: + # Real two-stage opt with parseable coarse output. + d['coarse_opt_input_xyz'] = initial_xyz_str + d['coarse_opt_output_xyz'] = coarse_final_xyz + d['opt_input_xyz'] = coarse_final_xyz + else: + # Either no coarse stage, or coarse ran but its geometry wasn't + # parseable. Producers downstream (TCKDB bundle) won't emit a + # standalone ``opt_coarse`` calc in that case — they need the + # output geometry to chain. Honest-empty beats fake-provenance. + d['coarse_opt_input_xyz'] = None + d['coarse_opt_output_xyz'] = None + d['opt_input_xyz'] = initial_xyz_str # ── freq results ──────────────────────────────────────────────────────── if is_mono: @@ -588,6 +966,23 @@ def _spc_to_dict(spc, output_dict: dict, project_directory: str, paths.get('sp') or None, software_by_job.get('sp'), project_directory, ) + # ── held-fixed coordinate constraints (for TCKDB) ────────────────────── + # Best-effort: parse the input deck (preferred — exact ARC-emitted form) + # falling back to the log when no deck is on disk. Failures here never + # fail output.yml generation; they just emit ``[]`` for that calc. + d['opt_constraints'] = _parse_calc_constraints( + d.get('opt_input'), paths.get('geo') or None, + software_by_job.get('opt'), project_directory, + ) + d['freq_constraints'] = _parse_calc_constraints( + d.get('freq_input'), paths.get('freq') or None, + software_by_job.get('freq'), project_directory, + ) + d['sp_constraints'] = _parse_calc_constraints( + d.get('sp_input'), paths.get('sp') or None, + software_by_job.get('sp'), project_directory, + ) + # ── ESS software version (from SP log, or fall back to geo/freq log) ── d['ess_versions'] = _get_ess_versions(paths, project_directory) if converged else None @@ -595,7 +990,24 @@ def _spc_to_dict(spc, output_dict: dict, project_directory: str, d['chosen_ts_method'] = getattr(spc, 'chosen_ts_method', None) d['successful_ts_methods'] = getattr(spc, 'successful_methods', None) or None d['neb_log'] = _make_rel_path(paths.get('neb') or None, project_directory) - d['irc_logs'] = [_make_rel_path(p, project_directory) for p in (paths.get('irc') or [])] + # Path-search log slot for xtb_gsm-chosen TS guesses. Distinct + # from ``neb_log`` so the TCKDB adapter can dispatch a method- + # aware ``path_search`` parent calc (``method=neb`` vs + # ``method=gsm``) without inspecting the file. Null when the + # chosen TS-guess method is not GSM (or when GSM ran but didn't + # surface a stringfile path). 
+ d['gsm_log'] = _make_rel_path(paths.get('gsm') or None, project_directory) + irc_paths = list(paths.get('irc') or []) + d['irc_logs'] = [_make_rel_path(p, project_directory) for p in irc_paths] + # Per-log direction in lockstep with ``irc_logs``: 'forward' / + # 'reverse' / null when the scheduler couldn't observe it (older + # projects, restarts predating the parallel-list tracking). + # Padded to ``len(irc_logs)`` so consumers can zip the two lists + # without index-out-of-range checks. + irc_dirs_raw = list(paths.get('irc_directions') or []) + d['irc_log_directions'] = ( + irc_dirs_raw + [None] * (len(irc_paths) - len(irc_dirs_raw)) + )[:len(irc_paths)] if not irc_requested: d['irc_converged'] = None else: @@ -615,6 +1027,16 @@ def _spc_to_dict(spc, output_dict: dict, project_directory: str, else: d['statmech'] = None + # ── additional calculations (rotor scans, etc.) ───────────────────────── + # Bundle-local calcs that aren't part of the opt → freq → sp chain. The + # field is a list (possibly empty) so consumers can iterate without a + # None-guard. Currently populated only for converged non-monoatomic + # species with successful 1D rotor scans; everything else gets ``[]``. + if not is_mono and converged: + d['additional_calculations'] = _build_scan_calculations(spc, project_directory) + else: + d['additional_calculations'] = [] + return d @@ -653,17 +1075,23 @@ def _scalar(x): 'tmax_k': _scalar(thermo.Tmax), } - # ── Cp tabulation ──────────────────────────────────────────────────────── - cp = getattr(thermo, 'cp_data', None) - if cp is not None: - t['cp_data'] = cp + # ── Per-temperature thermochemistry ────────────────────────────────────── + # ``thermo_points`` carries the full TCKDB ``thermo_point`` shape + # (Cp + H + S + G at each tabulated T). Falls back to building a + # Cp-only point list from RMG's ``Tdata``/``Cpdata`` when only that + # legacy form is available — H/S/G stay omitted because we don't + # have the polynomial in scope here to evaluate them. The TCKDB + # adapter accepts either shape. + points = getattr(thermo, 'thermo_points', None) + if points is not None: + t['thermo_points'] = points elif thermo.Tdata is not None and thermo.Cpdata is not None: T_list = thermo.Tdata[0] if isinstance(thermo.Tdata, (list, tuple)) else thermo.Tdata Cp_list = thermo.Cpdata[0] if isinstance(thermo.Cpdata, (list, tuple)) else thermo.Cpdata - t['cp_data'] = [{'temperature_k': float(T), 'cp_j_mol_k': float(Cp)} - for T, Cp in zip(T_list, Cp_list)] + t['thermo_points'] = [{'temperature_k': float(T), 'cp_j_mol_k': float(Cp)} + for T, Cp in zip(T_list, Cp_list)] else: - t['cp_data'] = None + t['thermo_points'] = None # ── NASA polynomials ───────────────────────────────────────────────────── t['nasa_low'] = getattr(thermo, 'nasa_low', None) @@ -703,11 +1131,19 @@ def _statmech_to_dict(spc, project_directory: str, point_group: str | None = Non def _get_torsions(spc, project_directory: str) -> list[dict]: - """Build the torsions list from spc.rotors_dict.""" + """Build the torsions list from spc.rotors_dict. + + Each emitted torsion carries a ``source_scan_calculation_key`` (e.g. + ``"scan_rotor_3"``) when its scan log is on disk and parseable. The key + matches the bundle-local key used by :func:`_build_scan_calculations`, + so TCKDB consumers can resolve the torsion's underlying scan calc. + Rotors whose scan log is missing or fails to parse get the field set to + ``None`` rather than fabricating a key that points at no calc. 
+ """ if not getattr(spc, 'rotors_dict', None): return [] torsions = [] - for rotor in spc.rotors_dict.values(): + for rotor_index, rotor in spc.rotors_dict.items(): if rotor.get('success') is not True: continue scan = rotor.get('scan') # 4-atom dihedral defining atoms, 1-indexed @@ -715,22 +1151,27 @@ def _get_torsions(spc, project_directory: str) -> list[dict]: symmetry = rotor.get('symmetry', 1) rotor_type = rotor.get('type', 'HinderedRotor') treatment = 'free_rotor' if 'Free' in str(rotor_type) else 'hindered_rotor' + scan_key = ( + f'scan_rotor_{rotor_index}' + if _scan_log_is_parseable(rotor, project_directory) else None + ) torsions.append({ 'symmetry_number': symmetry, 'treatment': treatment, 'atom_indices': scan, 'pivot_atoms': pivots, 'barrier_kj_mol': _get_rotor_barrier(rotor, project_directory), + 'source_scan_calculation_key': scan_key, }) return torsions -def _get_rotor_barrier(rotor: dict, project_directory: str) -> float | None: - """ - Return max(V) - min(V) in kJ/mol from the 1D scan output file. +def _resolve_scan_path(rotor: dict, project_directory: str) -> str | None: + """Return an absolute, on-disk scan log path for ``rotor``, or ``None``. - parse_1d_scan_energies already zeroes the minimum, so max(energies) is the - barrier height directly. + Centralizes the rules used by both the barrier helper and the + scan-calc builder so they agree on which rotors qualify as "has a + scan log we can use." """ scan_path = rotor.get('scan_path', '') if not scan_path: @@ -739,15 +1180,417 @@ def _get_rotor_barrier(rotor: dict, project_directory: str) -> float | None: scan_path = os.path.join(project_directory, scan_path) if not os.path.isfile(scan_path): return None + return scan_path + + +def _scan_log_is_parseable(rotor: dict, project_directory: str) -> bool: + """Cheap presence-check for a usable 1D scan log on disk.""" + return _resolve_scan_path(rotor, project_directory) is not None + + +def _get_rotor_barrier(rotor: dict, project_directory: str) -> float | None: + """ + Return max(V) - min(V) in kJ/mol from the 1D scan output file. + + parse_1d_scan_energies already zeroes the minimum, so max(energies) is the + barrier height directly. + """ + scan_path = _resolve_scan_path(rotor, project_directory) + if scan_path is None: + return None try: energies, _ = parse_1d_scan_energies(log_file_path=scan_path) - if energies: + if energies is not None and len(energies): return float(max(energies)) except Exception: logger.debug(f"Failed to parse 1D rotor scan energies from '{scan_path}'", exc_info=True) return None +def _build_scan_calculations(spc, project_directory: str) -> list[dict]: + """Build the species' ``additional_calculations`` list from rotor scans. + + Emits one ``type='scan'`` calc per successful 1D rotor whose scan log + is on disk and parses cleanly. Each entry shape: + + { + 'key': 'scan_rotor_', + 'type': 'scan', + 'scan_result': { + 'dimension': 1, + 'is_relaxed': True, + 'zero_energy_reference_hartree': float | None, + 'coordinates': [{coordinate_index, coordinate_kind, atom*_index, + step_count, value_unit, symmetry_number}], + 'points': [{point_index, electronic_energy_hartree, + relative_energy_kj_mol, + coordinate_values: [{coordinate_index, + coordinate_value, value_unit}], + xyz: str | None}], + }, + } + + Rotors that don't produce a parseable scan_result are skipped — the + matching torsion's ``source_scan_calculation_key`` will be ``None`` + so consumers don't end up with dangling references. 
+ + Only 1D rotors are emitted here; ND rotors and any future + multi-dimensional shapes go through a separate path (deferred). + """ + if not getattr(spc, 'rotors_dict', None): + return [] + # The species's converged opt geometry is the input the rotor scan + # job was launched against — we pass it down so the scan-result + # builder can compute ``start_value`` (the dihedral the user + # requested as the scan's starting point). ``initial_xyz`` is a + # fallback for species that haven't (yet) populated ``final_xyz``; + # better to derive a start dihedral from the pre-opt geometry than + # to leave the field null when something usable exists. + input_xyz = getattr(spc, 'final_xyz', None) or getattr(spc, 'initial_xyz', None) + out: list[dict] = [] + for rotor_index, rotor in spc.rotors_dict.items(): + if rotor.get('success') is not True: + continue + if rotor.get('dimensions', 1) != 1: + continue + scan_result = _build_scan_result_for_rotor( + rotor, project_directory, input_xyz=input_xyz, + ) + if scan_result is None: + continue + scan_constraints = _parse_scan_constraints(rotor, project_directory) + entry = { + 'key': f'scan_rotor_{rotor_index}', + 'type': 'scan', + 'scan_result': scan_result, + } + if scan_constraints: + entry['constraints'] = scan_constraints + out.append(entry) + return out + + +def _parse_scan_constraints(rotor: dict, project_directory: str) -> list[dict]: + """Best-effort extraction of held-fixed constraints for a rotor scan. + + Dispatches to the per-software constraint parser based on the + rotor's ``scan_software`` hint (set by the scheduler when the scan + job completes). Returns the held-fixed constraints — the active + scan coordinate is excluded; it lives in ``scan_result.coordinates[]``. + + Software dispatch: + ``gaussian`` → :func:`parse_gaussian_constraints` + ``orca`` → :func:`parse_orca_constraints` + missing / empty → fall back to Gaussian (the only software ARC + currently emits ModRedundant for; this keeps + restart files / pre-existing rotor dicts from + losing their constraints) + anything else → ``[]`` with a debug log (no noisy warning; + the consumer only cares about the absence) + + Never raises: parser failures degrade to ``[]`` so a malformed log + doesn't break ``additional_calculations`` for the rest of the run. + """ + scan_path = _resolve_scan_path(rotor, project_directory) + if scan_path is None: + return [] + software = (rotor.get('scan_software') or '').strip().lower() + parser_fn = _scan_constraint_parser_for(software) + if parser_fn is None: + logger.debug("Scan-constraint extraction: no parser for software=%r " + "at '%s'; emitting [].", software, scan_path) + return [] + try: + return parser_fn(scan_path) + except Exception as exc: + logger.warning("Scan-constraint extraction failed for '%s' " + "(software=%r): %s", scan_path, software, exc) + return [] + + +def _scan_constraint_parser_for(software: str): + """Return the constraint-parser callable for ``software``, or None. + + Empty / missing ``software`` falls back to the Gaussian parser to + preserve the prior best-effort behavior for rotor dicts that predate + the ``scan_software`` field (older restart files, in-progress + refactors). Anything explicitly unknown (e.g., ``'qchem'``) returns + None so the caller can debug-log and emit an empty list without + parsing surprise files. 
+ """ + if not software: + from arc.parser.adapters.gaussian import parse_gaussian_constraints + return parse_gaussian_constraints + if software == 'gaussian': + from arc.parser.adapters.gaussian import parse_gaussian_constraints + return parse_gaussian_constraints + if software == 'orca': + from arc.parser.adapters.orca import parse_orca_constraints + return parse_orca_constraints + return None + + +def _safe_dihedral_for_scan_atoms( + xyz: dict | None, + scan_atoms: list[int], + scan_path: str, +) -> float | None: + """Compute the dihedral (degrees, 0-360) at the scan quartet, or ``None``. + + Wraps :func:`arc.species.vectors.calculate_dihedral_angle` with the + failure-tolerant contract this helper needs: missing input → quiet + ``None`` (caller falls through to "unknown"); raised exception or + NaN return (colinear atoms) → ``None`` plus a warning that names + the scan log so the operator can investigate. Never raises. + + ``scan_atoms`` must be 1-based per ARC convention; that's enforced + upstream where ``rotor['scan']`` is validated. + """ + if xyz is None: + return None + try: + angle = calculate_dihedral_angle(coords=xyz, torsion=scan_atoms, index=1) + except Exception as exc: + logger.warning( + "Scan start_value/end_value: dihedral calculation failed for " + "atoms=%r in '%s': %s; omitting start_value/end_value.", + scan_atoms, scan_path, exc, + ) + return None + if angle != angle: # NaN: colinear atoms in the input geometry. + logger.warning( + "Scan start_value/end_value: dihedral is NaN (colinear quartet?) " + "for atoms=%r in '%s'; omitting start_value/end_value.", + scan_atoms, scan_path, + ) + return None + return float(angle) + + +def _xyz_dict_to_tckdb_xyz_text(xyz_dict: dict | None) -> str | None: + """Serialize an ARC xyz dict into TCKDB's count-headered xyz_text. + + Mirrors the conformer-side normalization the adapter applies to the + species ``xyz`` field: bare atom-only string from + ``arc.species.converter.xyz_to_str`` plus a TCKDB-required + ``\\n\\n`` envelope. Returns ``None`` for + null/empty input so the caller can decide whether to omit the + field; serialization errors propagate as exceptions for the caller + to handle (uniform-drop on failure rather than per-point drop). + """ + if xyz_dict is None: + return None + bare = xyz_to_str(xyz_dict=xyz_dict) + if not bare: + return None + text = bare.strip() + if not text: + return None + return f"{len(text.splitlines())}\n\n{text}" + + +def _build_scan_result_for_rotor( + rotor: dict, + project_directory: str, + *, + input_xyz: dict | None = None, +) -> dict | None: + """Parse the rotor's scan log and shape it into a TCKDB ``scan_result`` dict. + + Returns ``None`` when: + * no scan log on disk (covered by ``_resolve_scan_path``) + * parser fails to extract angles or relative energies (the only + two fields that drive the shape — absolute Hartree and + geometries are nice-to-have) + * atom_indices isn't a 4-int 1-based dihedral quartet (the + coordinates entry would be unusable downstream) + + On the happy path the dict carries plain Python primitives only, so + YAML emission via ``save_yaml_file`` Just Works. 
Per-point + geometries are emitted under ``points[i].geometry.xyz_text`` when + the parser returns a per-step geometry list aligned with the + energy/angle list (TCKDB resolves these server-side into + ``calc_scan_point.geometry_id``); on length mismatch or + serialization failure the geometries are dropped uniformly across + all points and a warning is logged — partial coverage would imply + a per-point alignment we can't actually verify. + """ + scan_path = _resolve_scan_path(rotor, project_directory) + if scan_path is None: + return None + + try: + parsed = parse_1d_scan_full_result(log_file_path=scan_path) + except Exception: + logger.debug(f"Failed to parse 1D rotor scan result from '{scan_path}'", exc_info=True) + return None + + angles = parsed.get('angles_deg') + rel_energies = parsed.get('relative_energies_kj_mol') + if not angles or not rel_energies: + return None + if len(angles) != len(rel_energies): + logger.debug( + "Skipping scan_result for '%s': angles/energies length mismatch (%d vs %d)", + scan_path, len(angles), len(rel_energies), + ) + return None + + abs_energies = parsed.get('absolute_energies_hartree') + # absolute_energies and angles must align with relative_energies; if a + # parser disagreed on the count, drop absolute rather than misalign. + if abs_energies is not None and len(abs_energies) != len(rel_energies): + abs_energies = None + + # Per-point geometries: the parser wrapper returns one xyz dict per + # converged scan iteration. Attach to each point as + # ``geometry.xyz_text`` (TCKDB count-headered format) only when the + # list aligns 1:1 with the energy list; otherwise drop wholesale — + # mixing populated and missing entries would imply an alignment we + # can't verify, and the schema accepts ``geometry`` as omitted but + # not as ``null``. + geometries = parsed.get('geometries') + point_geometry_xyz_texts: list[str] | None = None + if geometries is not None: + if len(geometries) != len(rel_energies): + logger.warning( + "Scan-point geometry count (%d) does not match scan-point count (%d) " + "for '%s'; omitting per-point geometries from scan_result.", + len(geometries), len(rel_energies), scan_path, + ) + else: + try: + point_geometry_xyz_texts = [ + _xyz_dict_to_tckdb_xyz_text(g) for g in geometries + ] + except Exception as exc: + logger.warning( + "Scan-point geometry serialization failed for '%s': %s; " + "omitting per-point geometries from scan_result.", + scan_path, exc, + ) + point_geometry_xyz_texts = None + else: + # If any single point produced an empty/None text, drop + # all rather than emit asymmetric coverage. 
+                if any(t is None or not t for t in point_geometry_xyz_texts):
+                    logger.warning(
+                        "Scan-point geometry serialization yielded empty text for "
+                        "at least one point in '%s'; omitting per-point geometries.",
+                        scan_path,
+                    )
+                    point_geometry_xyz_texts = None
+
+    scan_atoms = rotor.get('scan')
+    if not (isinstance(scan_atoms, list) and len(scan_atoms) == 4
+            and all(isinstance(a, int) and a >= 1 for a in scan_atoms)
+            and len(set(scan_atoms)) == 4):
+        return None
+    a1, a2, a3, a4 = scan_atoms
+
+    symmetry = rotor.get('symmetry')
+    coord: dict[str, Any] = {
+        'coordinate_index': 1,
+        'coordinate_kind': 'dihedral',
+        'atom1_index': a1,
+        'atom2_index': a2,
+        'atom3_index': a3,
+        'atom4_index': a4,
+        'step_count': len(angles),
+        'value_unit': 'degree',
+    }
+    if isinstance(symmetry, int) and symmetry >= 1:
+        coord['symmetry_number'] = symmetry
+
+    # Requested grid metadata: ``parse_scan_args`` reads the
+    # ModRedundant header that Gaussian echoes back into its log
+    # (``D a b c d S <n_steps> <step_size>``), giving us the exact
+    # step size the user requested rather than one inferred from the
+    # completed point spacing. ORCA / other ESS raise
+    # ``NotImplementedError`` from the same parser; in those cases
+    # the grid fields stay absent rather than guessed at — TCKDB
+    # treats null as "unknown grid", which is honest.
+    requested_step_size: float | None = None
+    try:
+        scan_args = parse_scan_args(scan_path)
+        raw_step_size = scan_args.get('step_size')
+        if isinstance(raw_step_size, (int, float)) and raw_step_size > 0:
+            requested_step_size = float(raw_step_size)
+    except NotImplementedError:
+        # Non-Gaussian ESS: parser doesn't speak this log format yet.
+        pass
+    except Exception:
+        logger.debug(f"parse_scan_args failed for '{scan_path}'", exc_info=True)
+    if requested_step_size is not None:
+        coord['step_size'] = requested_step_size
+        # 1D dihedral torsion scans: the resolution IS the step size.
+        # ND or non-dihedral scans would need a different mapping;
+        # this whole helper is the 1D path so the equivalence holds.
+        coord['resolution_degrees'] = requested_step_size
+
+    # ``start_value``/``end_value`` describe the requested grid, not
+    # the completed-point spacing. Computing them honestly requires
+    # both the requested step size (above) AND the input dihedral
+    # (the geometry the user pointed the scan at). The latter comes
+    # from the species record — preferred — falling back to the
+    # first parsed scan-iteration geometry, which for Gaussian
+    # ModRedundant scans has the dihedral held fixed at the input
+    # value by construction (so it's not "inferring from outputs",
+    # it's reading a frozen DOF). ``end_value`` is then exact:
+    # ``start + step_size * (step_count - 1)``. We deliberately do
+    # NOT wrap into [-180, 180]: a full rotation must land at
+    # ``start + 360°``, not back at ``start`` — TCKDB's column has
+    # no range constraint, and continuity is what downstream
+    # consumers (rotor-treatment plotters, etc.) expect.
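+    # Worked example (numbers match the sBuOH fixture exercised in the
+    # tests, assuming a 60° input dihedral): a requested 8° step over 46
+    # completed points gives end_value = 60 + 8 * (46 - 1) = 420°, i.e.
+    # start + 360° for a full rotation.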
+ step_count_for_grid = len(rel_energies) + if ( + requested_step_size is not None + and step_count_for_grid >= 1 + ): + dihedral_source = input_xyz + if dihedral_source is None: + geom_list = parsed.get('geometries') + if isinstance(geom_list, list) and geom_list: + dihedral_source = geom_list[0] + start_value = _safe_dihedral_for_scan_atoms( + dihedral_source, scan_atoms, scan_path, + ) + if start_value is not None: + coord['start_value'] = start_value + coord['end_value'] = ( + start_value + requested_step_size * (step_count_for_grid - 1) + ) + + points: list[dict[str, Any]] = [] + for i, (angle, rel_e) in enumerate(zip(angles, rel_energies), start=1): + point: dict[str, Any] = { + 'point_index': i, + 'relative_energy_kj_mol': float(rel_e), + 'coordinate_values': [{ + 'coordinate_index': 1, + 'coordinate_value': float(angle), + 'value_unit': 'degree', + }], + } + if abs_energies is not None: + point['electronic_energy_hartree'] = float(abs_energies[i - 1]) + if point_geometry_xyz_texts is not None: + point['geometry'] = {'xyz_text': point_geometry_xyz_texts[i - 1]} + points.append(point) + + scan_result: dict[str, Any] = { + 'dimension': 1, + 'is_relaxed': True, + 'coordinates': [coord], + 'points': points, + } + zero_ref = parsed.get('zero_energy_reference_hartree') + if isinstance(zero_ref, (int, float)): + scan_result['zero_energy_reference_hartree'] = float(zero_ref) + + return scan_result + + def _rxn_to_dict(rxn) -> dict: """Convert an ARCReaction to a plain dict for output.yml.""" kinetics = rxn.kinetics @@ -770,6 +1613,13 @@ def _rxn_to_dict(rxn) -> dict: 'dEa': kinetics.get('dEa'), 'dEa_units': kinetics.get('dEa_units'), 'n_data_points': kinetics.get('n_data_points'), + # ARC always renders the same tunneling method into Arkane's + # input template (currently 'Eckart'); record it here so + # downstream consumers know which correction was applied to + # the fitted A/n/Ea. If Arkane's parsed kinetics carries an + # explicit tunneling marker in the future, prefer that; + # otherwise fall back to the template constant. 
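+        # Illustration (hypothetical parsed-kinetics dict, mirroring the
+        # unit tests): {'A': (1.0e10, 's^-1'), 'n': 0.0, 'Ea': (10.0, 'kJ/mol')}
+        # with no 'tunneling' key records 'Eckart' below, while an explicit
+        # kinetics['tunneling'] = 'Wigner' wins over the constant.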
+ 'tunneling': kinetics.get('tunneling') or ARKANE_TUNNELING_METHOD, } return { diff --git a/arc/output_test.py b/arc/output_test.py index ae8b112a50..4796ea76f7 100644 --- a/arc/output_test.py +++ b/arc/output_test.py @@ -12,7 +12,11 @@ from arc.level import Level from arc.common import ARC_TESTING_PATH from arc.output import ( + _build_applied_corrections_for_species, + _build_scan_calculations, + _build_scan_result_for_rotor, _compute_point_groups, + _compute_species_corrections, _get_arkane_git_commit, _get_energy_corrections, _get_ess_versions, @@ -172,7 +176,7 @@ def test_basic_thermo(self): self.assertEqual(result['s298_j_mol_k'], 230.1) self.assertEqual(result['tmin_k'], 300) self.assertEqual(result['tmax_k'], 3000) - self.assertIsNone(result['cp_data']) + self.assertIsNone(result['thermo_points']) self.assertIsNone(result['nasa_low']) self.assertIsNone(result['nasa_high']) @@ -185,11 +189,17 @@ def test_thermo_with_nasa(self): self.assertEqual(result['nasa_low'], nasa_low) self.assertEqual(result['nasa_high'], nasa_high) - def test_thermo_with_cp_data(self): - cp = [{'temperature_k': 300.0, 'cp_j_mol_k': 35.1}, {'temperature_k': 400.0, 'cp_j_mol_k': 40.5}] - thermo = ThermoData(H298=-10.0, S298=200.0, Tmin=(300, 'K'), Tmax=(2000, 'K'), cp_data=cp) + def test_thermo_with_thermo_points(self): + points = [ + {'temperature_k': 300.0, 'cp_j_mol_k': 35.1, + 'h_kj_mol': -50.0, 's_j_mol_k': 200.0, 'g_kj_mol': -110.0}, + {'temperature_k': 400.0, 'cp_j_mol_k': 40.5, + 'h_kj_mol': -45.2, 's_j_mol_k': 215.0, 'g_kj_mol': -131.2}, + ] + thermo = ThermoData(H298=-10.0, S298=200.0, Tmin=(300, 'K'), + Tmax=(2000, 'K'), thermo_points=points) result = _thermo_to_dict(thermo) - self.assertEqual(result['cp_data'], cp) + self.assertEqual(result['thermo_points'], points) def test_tmin_tmax_scalar(self): """Tmin/Tmax can be plain numbers (not tuples).""" @@ -366,22 +376,29 @@ class TestParseOptLog(unittest.TestCase): """Tests for _parse_opt_log and the Gaussian parse_opt_steps adapter.""" def test_gaussian_opt_log(self): - """Parse a real Gaussian opt log for step count and final energy.""" + """Parse a real Gaussian opt log for step count, final energy, and final xyz.""" opt_path = os.path.join(ARC_TESTING_PATH, 'opt', 'iC3H7.out') - n_steps, e_hartree = _parse_opt_log(opt_path, '/dummy') + n_steps, e_hartree, final_xyz = _parse_opt_log(opt_path, '/dummy') self.assertEqual(n_steps, 4) self.assertIsNotNone(e_hartree) self.assertAlmostEqual(e_hartree, -116.986089069, places=6) + # The geometry is parsed via the shared parse_geometry dispatcher; + # we just check it produced a non-empty atom-only string. 
+ self.assertIsNotNone(final_xyz) + self.assertTrue(len(final_xyz.splitlines()) >= 3, + msg=f"expected several atom lines, got {final_xyz!r}") def test_missing_file(self): - n_steps, e_hartree = _parse_opt_log('/nonexistent/file.log', '/tmp') + n_steps, e_hartree, final_xyz = _parse_opt_log('/nonexistent/file.log', '/tmp') self.assertIsNone(n_steps) self.assertIsNone(e_hartree) + self.assertIsNone(final_xyz) def test_none_path(self): - n_steps, e_hartree = _parse_opt_log(None, '/tmp') + n_steps, e_hartree, final_xyz = _parse_opt_log(None, '/tmp') self.assertIsNone(n_steps) self.assertIsNone(e_hartree) + self.assertIsNone(final_xyz) def test_parse_zpe_from_freq_log(self): """Parse ZPE from a real Gaussian freq log.""" @@ -693,6 +710,81 @@ def test_ts_without_mol(self): self.assertIsNone(result['smiles']) self.assertIsNone(result['formula']) + @staticmethod + def _minimal_ts_mock(label, chosen): + """Build a ``MagicMock`` species shaped enough for ``_spc_to_dict`` + to walk the TS branch and emit ``neb_log``/``gsm_log``.""" + spc = MagicMock() + spc.label = label + spc.original_label = None + spc.charge = 0 + spc.multiplicity = 2 + spc.is_ts = True + spc.mol = None + spc.final_xyz = {'symbols': ('C',), 'isotopes': (12,), 'coords': ((0, 0, 0),)} + spc.initial_xyz = None + spc.is_monoatomic.return_value = False + spc.e_elect = -100.0 + spc.e0 = -95.0 + spc._is_linear = False + spc.optical_isomers = 1 + spc.external_symmetry = 1 + spc.freqs = [-1500.0, 100.0] + spc.rotors_dict = None + spc.thermo = None + spc.rxn_label = 'A <=> B' + spc.chosen_ts_method = chosen + spc.successful_methods = [chosen] if chosen else [] + return spc + + def test_ts_emits_gsm_log_when_paths_gsm_set(self): + # When the scheduler routed an xtb_gsm log to ``paths['gsm']`` + # (separate slot from ``paths['neb']``), the TS record carries + # a ``gsm_log`` field populated with the run-relative path. The + # ``neb_log`` field stays empty/None for the same record so the + # TCKDB adapter's method-aware gate doesn't see cross-pollination. + spc = self._minimal_ts_mock(label='TS_gsm', chosen='xTB-GSM') + gsm_abs = '/abs/calcs/TS_gsm/gsm/stringfile.xyz0000' + output_dict = {'TS_gsm': { + 'convergence': True, + 'paths': {'irc': [], 'neb': '', 'gsm': gsm_abs}, + 'job_types': {}, + }} + result = _spc_to_dict(spc, output_dict, '/abs') + # ``_make_rel_path`` strips the project_directory prefix. + self.assertEqual(result['gsm_log'], + 'calcs/TS_gsm/gsm/stringfile.xyz0000') + self.assertIsNone(result['neb_log']) + + def test_ts_emits_neb_log_when_paths_neb_set(self): + # Mirror of the GSM test: ``paths['neb']`` populated → ``neb_log`` + # filled, ``gsm_log`` stays None. Guards against a regression + # that would emit both fields from the same path slot. + spc = self._minimal_ts_mock(label='TS_neb', chosen='orca_neb') + neb_abs = '/abs/calcs/TS_neb/neb/input.log' + output_dict = {'TS_neb': { + 'convergence': True, + 'paths': {'irc': [], 'neb': neb_abs, 'gsm': ''}, + 'job_types': {}, + }} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertEqual(result['neb_log'], 'calcs/TS_neb/neb/input.log') + self.assertIsNone(result['gsm_log']) + + def test_ts_emits_neither_log_when_paths_empty(self): + # Geometry-only TS guess (heuristics/AutoTST/user XYZ): both + # slots empty, both ``*_log`` fields end up None. The TCKDB + # adapter's gate then leaves ts_opt edge-less. 
+ spc = self._minimal_ts_mock(label='TS_geom', chosen='Heuristics') + output_dict = {'TS_geom': { + 'convergence': True, + 'paths': {'irc': [], 'neb': '', 'gsm': ''}, + 'job_types': {}, + }} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertIsNone(result['neb_log']) + self.assertIsNone(result['gsm_log']) + class TestRxnToDict(unittest.TestCase): """Tests for _rxn_to_dict.""" @@ -739,6 +831,48 @@ def test_with_kinetics(self): self.assertEqual(result['kinetics']['Ea_units'], 'kJ/mol') self.assertEqual(result['kinetics']['n'], 0.5) + def test_tunneling_method_defaults_to_arkane_template_constant(self): + # ARC writes ``tunneling='Eckart'`` into every Arkane reaction() + # block (see ARKANE_TUNNELING_METHOD). _rxn_to_dict must surface + # that decision in the kinetics block so downstream consumers + # (TCKDB, analysis) know which correction was applied to the fit. + from arc.statmech.arkane import ARKANE_TUNNELING_METHOD + rxn = MagicMock() + rxn.label = 'A <=> B' + rxn.reactants = ['A'] + rxn.products = ['B'] + rxn.family = None + rxn.multiplicity = 1 + rxn.ts_label = 'TS0' + rxn.kinetics = { + 'A': (1.0e10, 's^-1'), + 'n': 0.0, + 'Ea': (10.0, 'kJ/mol'), + 'Tmin': (300, 'K'), 'Tmax': (2000, 'K'), + } + result = _rxn_to_dict(rxn) + self.assertEqual(result['kinetics']['tunneling'], ARKANE_TUNNELING_METHOD) + + def test_tunneling_method_from_parsed_kinetics_wins(self): + # If Arkane ever surfaces an explicit tunneling marker on the + # parsed kinetics dict, prefer that over the template constant. + # Future-proofs the producer against per-reaction tunneling + # configs without forcing a template-constant change. + rxn = MagicMock() + rxn.label = 'A <=> B' + rxn.reactants = ['A'] + rxn.products = ['B'] + rxn.family = None + rxn.multiplicity = 1 + rxn.ts_label = 'TS0' + rxn.kinetics = { + 'A': (1.0e10, 's^-1'), 'n': 0.0, 'Ea': (10.0, 'kJ/mol'), + 'Tmin': (300, 'K'), 'Tmax': (2000, 'K'), + 'tunneling': 'Wigner', + } + result = _rxn_to_dict(rxn) + self.assertEqual(result['kinetics']['tunneling'], 'Wigner') + class TestSpcToDict(unittest.TestCase): """Tests for _spc_to_dict.""" @@ -1002,6 +1136,147 @@ def test_input_path_none_when_software_unknown(self): ) self.assertIsNone(result['opt_input']) + # ------------------------------------------------------------------ + # opt_input_xyz: pre-opt geometry surfaced for opt's input-geometry + # provenance. freq + sp share the conformer's converged geometry by + # ARC's invariant; only opt has a distinct input. + # ------------------------------------------------------------------ + + def test_opt_input_xyz_emitted_from_initial_xyz(self): + """``spc.initial_xyz`` lands as ``opt_input_xyz`` in xyz_to_str format.""" + spc = self._make_spc_mock() + spc.initial_xyz = { + 'symbols': ('C', 'H'), + 'isotopes': (12, 1), + 'coords': ((0.001, 0.002, 0.003), (1.090, 0.000, 0.000)), + } + output_dict = {'CH4': {'convergence': True, 'paths': {}, 'job_types': {}}} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertIsNotNone(result['opt_input_xyz']) + # Same atom-only string format as the existing ``xyz`` field. + lines = result['opt_input_xyz'].splitlines() + self.assertEqual(len(lines), 2) + self.assertTrue(lines[0].startswith('C ')) + self.assertTrue(lines[1].startswith('H ')) + + def test_opt_input_xyz_distinct_from_xyz_when_both_present(self): + """``xyz`` carries final, ``opt_input_xyz`` carries initial. 
Different + coordinates → different strings; the bundle's `input_geometries` + link for opt is genuinely separate from the conformer geometry.""" + spc = self._make_spc_mock() + spc.initial_xyz = { + 'symbols': ('C',), + 'isotopes': (12,), + 'coords': ((0.001, 0.0, 0.0),), + } + spc.final_xyz = { + 'symbols': ('C',), + 'isotopes': (12,), + 'coords': ((0.500, 0.0, 0.0),), + } + output_dict = {'CH4': {'convergence': True, 'paths': {}, 'job_types': {}}} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertIsNotNone(result['xyz']) + self.assertIsNotNone(result['opt_input_xyz']) + self.assertNotEqual(result['xyz'], result['opt_input_xyz']) + + def test_opt_input_xyz_none_when_initial_xyz_absent(self): + """Species with no initial_xyz set → ``opt_input_xyz`` is null.""" + spc = self._make_spc_mock() + spc.initial_xyz = None + output_dict = {'CH4': {'convergence': True, 'paths': {}, 'job_types': {}}} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertIsNone(result['opt_input_xyz']) + + def test_opt_input_xyz_emitted_independently_of_convergence(self): + """Opt's input is meaningful even on failed runs — surface it + regardless of the species convergence flag.""" + spc = self._make_spc_mock(converged=False) + spc.initial_xyz = { + 'symbols': ('O',), + 'isotopes': (16,), + 'coords': ((0.0, 0.0, 0.0),), + } + output_dict = {'CH4': {'convergence': False, 'paths': {}, 'job_types': {}}} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertIsNotNone(result['opt_input_xyz']) + + # ------------------------------------------------------------------ + # coarse → fine opt geometry chain. When coarse runs, opt_input_xyz + # changes meaning from "pre-everything" to "fine opt's input" = + # coarse opt's output. + # ------------------------------------------------------------------ + + def test_no_coarse_opt_keeps_single_stage_semantics(self): + """No coarse log → coarse_opt_* fields are null; opt_input_xyz + reflects the species' initial xyz (the single-stage opt's input).""" + spc = self._make_spc_mock() + spc.initial_xyz = {'symbols': ('C',), 'isotopes': (12,), + 'coords': ((0.0, 0.0, 0.0),)} + output_dict = {'CH4': {'convergence': True, + 'paths': {}, # no geo_coarse + 'job_types': {'opt': True}}} + result = _spc_to_dict(spc, output_dict, '/abs') + self.assertIsNone(result['coarse_opt_log']) + self.assertIsNone(result['coarse_opt_input_xyz']) + self.assertIsNone(result['coarse_opt_output_xyz']) + # Single-stage: opt_input_xyz comes from spc.initial_xyz. + self.assertIsNotNone(result['opt_input_xyz']) + self.assertEqual(result['opt_input_xyz'].split()[0], 'C') + + def test_coarse_opt_chains_geometries_to_fine(self): + """When coarse log parses cleanly, the geometry chain is: + spc.initial_xyz → coarse_opt_input_xyz → coarse_opt_output_xyz = + opt_input_xyz → xyz.""" + # Use the shipped Gaussian iC3H7 opt log — parse_geometry handles + # it via the per-ESS adapter, so we get a real xyz back. + coarse_log = os.path.join(ARC_TESTING_PATH, 'opt', 'iC3H7.out') + spc = self._make_spc_mock() + spc.initial_xyz = {'symbols': ('C',), 'isotopes': (12,), + 'coords': ((9.999, 9.999, 9.999),)} # distinctive + output_dict = {'CH4': {'convergence': True, + 'paths': {'geo_coarse': coarse_log, + 'geo': coarse_log}, # both point at same file for this test + 'job_types': {'opt': True}}} + result = _spc_to_dict(spc, output_dict, '/abs') + # Coarse fields populated. 
+ self.assertIsNotNone(result['coarse_opt_log']) + self.assertIsNotNone(result['coarse_opt_input_xyz']) + self.assertIsNotNone(result['coarse_opt_output_xyz']) + # Coarse input == species initial xyz. + self.assertIn('9.999', result['coarse_opt_input_xyz']) + # Coarse output != initial xyz (it was actually parsed from the log). + self.assertNotIn('9.999', result['coarse_opt_output_xyz']) + # opt_input_xyz now points at the coarse output, not initial xyz. + self.assertEqual(result['opt_input_xyz'], result['coarse_opt_output_xyz']) + + def test_coarse_opt_unparseable_geometry_falls_back_safely(self): + """If the coarse log exists but its geometry can't be parsed, we + emit no coarse_opt_output_xyz and fall back to single-stage + semantics for opt_input_xyz (= spc.initial_xyz). A bundle + downstream won't emit a structured opt_coarse calc in this case.""" + # An unparseable file (empty), but its existence triggers the + # coarse-opt-ran branch. + proj = tempfile.mkdtemp(prefix='arc-coarse-fallback-') + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + empty_log = os.path.join(proj, 'coarse.log') + open(empty_log, 'w').close() + spc = self._make_spc_mock() + spc.initial_xyz = {'symbols': ('C',), 'isotopes': (12,), + 'coords': ((0.0, 0.0, 0.0),)} + output_dict = {'CH4': {'convergence': True, + 'paths': {'geo_coarse': empty_log}, + 'job_types': {'opt': True}}} + result = _spc_to_dict(spc, output_dict, proj) + # coarse_opt_log path is recorded (it exists) but the parsed + # geometry is None, so the chain-aware fields stay null. + self.assertIsNotNone(result['coarse_opt_log']) + self.assertIsNone(result['coarse_opt_output_xyz']) + self.assertIsNone(result['coarse_opt_input_xyz']) + # Fallback: opt_input_xyz comes from initial_xyz, not from the + # missing coarse output. + self.assertIsNotNone(result['opt_input_xyz']) + class TestComputePointGroups(unittest.TestCase): """Tests for _compute_point_groups.""" @@ -1316,5 +1591,949 @@ def test_point_group_unknown_element(self): sys.path.remove(scripts_dir) +class TestBuildAppliedCorrectionsForSpecies(unittest.TestCase): + """Direct tests for `_build_applied_corrections_for_species`. + + Stubs the rmg_env script's per-label result so we don't depend on the + Arkane subprocess; the helper's job is purely shape-translation. 
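+
+    For orientation, an ``aec_total`` entry asserted below is shaped like
+    (values come from the stub blocks, not from a schema)::
+
+        {'application_role': 'aec_total',
+         'value': -0.0234, 'value_unit': 'hartree',
+         'scheme': {'kind': 'atom_energy', 'name': 'atom_energy'},
+         'components': [...]}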
+ """ + + def _lot(self): + return Level(method='wb97xd3', basis='def2tzvp', software='qchem') + + def _aec_block(self): + return { + 'value': -0.0234, + 'value_unit': 'hartree', + 'components': [ + {'component_kind': 'atom', 'key': 'C', 'multiplicity': 1, + 'parameter_value': -37.84993993, 'parameter_unit': 'hartree', + 'contribution_value': -0.015}, + {'component_kind': 'atom', 'key': 'H', 'multiplicity': 4, + 'parameter_value': -0.49991749, 'parameter_unit': 'hartree', + 'contribution_value': -0.008}, + ], + } + + def _pbac_block(self): + return { + 'value': -0.694, + 'value_unit': 'kcal_mol', + 'bac_type': 'p', + 'components': [ + {'component_kind': 'bond', 'key': 'C-H', 'multiplicity': 4, + 'parameter_value': -0.1735, 'parameter_unit': 'kcal_mol', + 'contribution_value': -0.694}, + ], + } + + def _mbac_block(self): + return { + 'value': -0.056, + 'value_unit': 'kcal_mol', + 'bac_type': 'm', + } + + def test_aec_total_emitted(self): + sc = {'CH4': {'aec': self._aec_block()}} + out = _build_applied_corrections_for_species('CH4', sc, self._lot(), 'p') + roles = [e['application_role'] for e in out] + self.assertIn('aec_total', roles) + aec = next(e for e in out if e['application_role'] == 'aec_total') + self.assertAlmostEqual(aec['value'], -0.0234) + self.assertEqual(aec['value_unit'], 'hartree') + self.assertEqual(aec['scheme']['kind'], 'atom_energy') + self.assertEqual(aec['scheme']['name'], 'atom_energy') + + def test_aec_components_sum_to_total(self): + # Use values that arithmetically sum exactly so the test + # asserts the producer doesn't drop or rescale rows. + block = { + 'value': -0.030, + 'value_unit': 'hartree', + 'components': [ + {'component_kind': 'atom', 'key': 'C', 'multiplicity': 1, + 'parameter_value': -37.85, 'parameter_unit': 'hartree', + 'contribution_value': -0.018}, + {'component_kind': 'atom', 'key': 'H', 'multiplicity': 4, + 'parameter_value': -0.5, 'parameter_unit': 'hartree', + 'contribution_value': -0.012}, + ], + } + sc = {'X': {'aec': block}} + out = _build_applied_corrections_for_species('X', sc, self._lot(), None) + aec = next(e for e in out if e['application_role'] == 'aec_total') + total = sum(c['contribution_value'] for c in aec['components']) + self.assertAlmostEqual(total, aec['value'], places=6) + + def test_pbac_total_and_components(self): + sc = {'CH4': {'aec': self._aec_block(), 'bac': self._pbac_block()}} + out = _build_applied_corrections_for_species('CH4', sc, self._lot(), 'p') + bac = next(e for e in out if e['application_role'] == 'bac_total') + self.assertEqual(bac['scheme']['kind'], 'bac_petersson') + self.assertEqual(bac['value_unit'], 'kcal_mol') + self.assertEqual(len(bac['components']), 1) + self.assertEqual(bac['components'][0]['key'], 'C-H') + + def test_mbac_total_only_no_components(self): + sc = {'CH4': {'aec': self._aec_block(), 'bac': self._mbac_block()}} + out = _build_applied_corrections_for_species('CH4', sc, self._lot(), 'm') + bac = next(e for e in out if e['application_role'] == 'bac_total') + self.assertEqual(bac['scheme']['kind'], 'bac_melius') + self.assertEqual(bac['components'], []) + + def test_pbac_omits_components_when_param_missing(self): + block = self._pbac_block() + block['components'][0]['parameter_value'] = None + sc = {'X': {'aec': self._aec_block(), 'bac': block}} + out = _build_applied_corrections_for_species('X', sc, self._lot(), 'p') + bac = next(e for e in out if e['application_role'] == 'bac_total') + # Components dropped entirely (partial decomposition would mislead). 
+ self.assertEqual(bac['components'], []) + + def test_units_are_explicit(self): + sc = {'X': {'aec': self._aec_block(), 'bac': self._pbac_block()}} + out = _build_applied_corrections_for_species('X', sc, self._lot(), 'p') + units = {e['application_role']: e['value_unit'] for e in out} + self.assertEqual(units['aec_total'], 'hartree') + self.assertEqual(units['bac_total'], 'kcal_mol') + + def test_missing_correction_omits_silently(self): + # AEC failed (no 'aec' key), BAC succeeded → only BAC emitted. + sc = {'X': {'bac': self._pbac_block()}} + out = _build_applied_corrections_for_species('X', sc, self._lot(), 'p') + roles = [e['application_role'] for e in out] + self.assertEqual(roles, ['bac_total']) + + def test_no_data_returns_empty_list(self): + out = _build_applied_corrections_for_species('X', {}, self._lot(), 'p') + self.assertEqual(out, []) + + def test_bac_type_none_omits_bac(self): + # Even if a BAC block is present, bac_type=None means no BAC role. + sc = {'X': {'aec': self._aec_block(), 'bac': self._pbac_block()}} + out = _build_applied_corrections_for_species('X', sc, self._lot(), None) + roles = [e['application_role'] for e in out] + self.assertEqual(roles, ['aec_total']) + + # ---- scheme parameter tables (atom_params / bond_params) ---- + + def test_aec_scheme_includes_atom_params_from_run_table(self): + # ARC's run-level atom_energy_corrections dict is the source of + # truth for AEC scheme parameters; without atom_params the + # downstream energy_correction_scheme_atom_param table never gets + # populated even though the applied row lands. Sorted-by-element + # for deterministic output.yml. + aec_table = {'C': -37.84706, 'H': -0.50066} + sc = {'X': {'aec': self._aec_block()}} + out = _build_applied_corrections_for_species( + 'X', sc, self._lot(), 'p', aec_table=aec_table, bac_table=None, + ) + aec = next(e for e in out if e['application_role'] == 'aec_total') + self.assertEqual( + aec['scheme']['atom_params'], + [{'element': 'C', 'value': -37.84706}, + {'element': 'H', 'value': -0.50066}], + ) + + def test_pbac_scheme_includes_bond_params_from_run_table(self): + bac_table = {'C-H': -0.17350, 'C=O': -2.63454} + sc = {'X': {'aec': self._aec_block(), 'bac': self._pbac_block()}} + out = _build_applied_corrections_for_species( + 'X', sc, self._lot(), 'p', aec_table=None, bac_table=bac_table, + ) + bac = next(e for e in out if e['application_role'] == 'bac_total') + self.assertEqual( + bac['scheme']['bond_params'], + [{'bond_key': 'C-H', 'value': -0.17350}, + {'bond_key': 'C=O', 'value': -2.63454}], + ) + + def test_mbac_scheme_omits_params(self): + # Per spec: Melius BAC parameters are atom-pair / length / neighbor / + # molecular and don't fit SchemeBondParamPayload's bond-key shape. + # The producer must NOT fabricate or coerce them — emit total only. 
+ bac_table = {'C-H': -0.17350} # would coerce, but we must not + sc = {'X': {'aec': self._aec_block(), 'bac': self._mbac_block()}} + out = _build_applied_corrections_for_species( + 'X', sc, self._lot(), 'm', aec_table=None, bac_table=bac_table, + ) + bac = next(e for e in out if e['application_role'] == 'bac_total') + self.assertEqual(bac['scheme']['kind'], 'bac_melius') + self.assertNotIn('bond_params', bac['scheme']) + self.assertNotIn('atom_params', bac['scheme']) + self.assertNotIn('component_params', bac['scheme']) + + def test_aec_scheme_omits_atom_params_when_table_missing(self): + # Backward compat: when aec_table isn't supplied (caller predates + # this fix, or output.yml was written without it), the scheme still + # has identity but no atom_params field — schema treats it as [] + # via the default factory. + sc = {'X': {'aec': self._aec_block()}} + out = _build_applied_corrections_for_species( + 'X', sc, self._lot(), 'p', aec_table=None, bac_table=None, + ) + aec = next(e for e in out if e['application_role'] == 'aec_total') + self.assertNotIn('atom_params', aec['scheme']) + self.assertNotIn('bond_params', aec['scheme']) + + def test_atom_params_sorted_for_determinism(self): + # Stable insertion order matters for the idempotency hash + # downstream consumers compute over the payload. + aec_table = {'O': -75.07, 'H': -0.5, 'C': -37.85} + sc = {'X': {'aec': self._aec_block()}} + out = _build_applied_corrections_for_species( + 'X', sc, self._lot(), None, aec_table=aec_table, bac_table=None, + ) + aec = next(e for e in out if e['application_role'] == 'aec_total') + elements = [p['element'] for p in aec['scheme']['atom_params']] + self.assertEqual(elements, ['C', 'H', 'O']) # sorted + + +class TestComputeSpeciesCorrections(unittest.TestCase): + """Tests for `_compute_species_corrections` orchestration (subprocess call).""" + + def setUp(self): + self.tmp_dir = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, self.tmp_dir, ignore_errors=True) + os.makedirs(os.path.join(self.tmp_dir, 'output'), exist_ok=True) + + def _spc(self, label='CH4'): + spc = MagicMock() + spc.label = label + spc.multiplicity = 1 + spc.bond_corrections = {'C-H': 4} + spc.final_xyz = {'symbols': ('C', 'H', 'H', 'H', 'H'), + 'isotopes': (12, 1, 1, 1, 1), + 'coords': ((0, 0, 0), (0.6, 0.6, 0.6), + (-0.6, -0.6, 0.6), (-0.6, 0.6, -0.6), + (0.6, -0.6, -0.6))} + spc.initial_xyz = None + return spc + + def test_returns_empty_when_lot_is_none(self): + out = _compute_species_corrections({'CH4': self._spc()}, None, 'p', self.tmp_dir) + self.assertEqual(out, {}) + + def _patch_lot_key(self, key="LevelOfTheory(method='wb97xd3',basis='def2tzvp',software='qchem')"): + return [ + patch('arc.output.get_qm_corrections_files', return_value=['/fake/data.py']), + patch('arc.output.find_best_across_files', return_value=key), + ] + + def test_returns_empty_when_no_species_have_xyz(self): + spc = self._spc() + spc.final_xyz = None + spc.initial_xyz = None + lot = Level(method='wb97xd3', basis='def2tzvp', software='qchem') + patches = self._patch_lot_key() + for p in patches: + p.start() + self.addCleanup(lambda: [p.stop() for p in patches]) + with patch('arc.output.execute_command') as mock_exec: + out = _compute_species_corrections({'CH4': spc}, lot, 'p', self.tmp_dir) + self.assertEqual(out, {}) + mock_exec.assert_not_called() + + def test_invokes_subprocess_with_batched_input(self): + lot = Level(method='wb97xd3', basis='def2tzvp', software='qchem') + patches = self._patch_lot_key() + for p in patches: + p.start() + 
self.addCleanup(lambda: [p.stop() for p in patches])
+        with patch('arc.output.execute_command', return_value=('', '')) as mock_exec, \
+                patch('arc.output.read_yaml_file', return_value={'species': [
+                    {'label': 'CH4',
+                     'aec': {'value': -0.02, 'value_unit': 'hartree', 'components': []},
+                     'bac': {'value': -0.7, 'value_unit': 'kcal_mol', 'components': []}}
+                ]}), \
+                patch('arc.output.save_yaml_file') as mock_save:
+            out = _compute_species_corrections(
+                {'CH4': self._spc()}, lot, 'p', self.tmp_dir,
+            )
+        # Subprocess was called once.
+        self.assertEqual(mock_exec.call_count, 1)
+        # Result keyed by label.
+        self.assertIn('CH4', out)
+        self.assertEqual(out['CH4']['aec']['value'], -0.02)
+        self.assertEqual(out['CH4']['bac']['value'], -0.7)
+        # Subprocess input batched all species.
+        save_call = mock_save.call_args
+        content = save_call[1].get('content') or save_call[0][1]
+        self.assertEqual(content['level_of_theory'],
+                         "LevelOfTheory(method='wb97xd3',basis='def2tzvp',software='qchem')")
+        self.assertEqual(content['bac_type'], 'p')
+        self.assertEqual(len(content['species']), 1)
+        self.assertEqual(content['species'][0]['label'], 'CH4')
+        self.assertEqual(content['species'][0]['atoms'], {'C': 1, 'H': 4})
+        self.assertEqual(content['species'][0]['bonds'], {'C-H': 4})
+        self.assertEqual(content['species'][0]['multiplicity'], 1)
+
+    def test_returns_empty_when_lot_key_not_in_database(self):
+        lot = Level(method='unknown', basis='unknown')
+        patches = [patch('arc.output.get_qm_corrections_files', return_value=['/fake/data.py']),
+                   patch('arc.output.find_best_across_files', return_value=None)]
+        for p in patches:
+            p.start()
+        self.addCleanup(lambda: [p.stop() for p in patches])
+        with patch('arc.output.execute_command') as mock_exec:
+            out = _compute_species_corrections({'CH4': self._spc()}, lot, 'p', self.tmp_dir)
+        self.assertEqual(out, {})
+        mock_exec.assert_not_called()
+
+    def test_subprocess_failure_returns_empty(self):
+        lot = Level(method='wb97xd3', basis='def2tzvp', software='qchem')
+        patches = self._patch_lot_key()
+        for p in patches:
+            p.start()
+        self.addCleanup(lambda: [p.stop() for p in patches])
+        with patch('arc.output.execute_command', side_effect=RuntimeError('boom')):
+            out = _compute_species_corrections(
+                {'CH4': self._spc()}, lot, 'p', self.tmp_dir,
+            )
+        self.assertEqual(out, {})
+
+
+class TestScanCalculations(unittest.TestCase):
+    """Tests for the rotor scan → ``additional_calculations`` plumbing.
+
+    Covers three layers:
+    - ``_build_scan_result_for_rotor`` shapes one rotor into a TCKDB-like
+      ``scan_result`` dict, returning ``None`` when the input is unusable.
+    - ``_build_scan_calculations`` aggregates across ``rotors_dict`` and
+      filters non-1D / failed / unparseable rotors.
+    - ``_get_torsions`` attaches ``source_scan_calculation_key`` only when
+      the corresponding scan log is on disk.
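+
+    A successful rotor yields an entry shaped like (abridged from the
+    assertions below)::
+
+        {'key': 'scan_rotor_0',
+         'type': 'scan',
+         'scan_result': {'dimension': 1, 'is_relaxed': True,
+                         'coordinates': [...], 'points': [...]}}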
+ """ + + SCAN_LOG = os.path.join(ARC_TESTING_PATH, 'rotor_scans', 'sBuOH.out') + + def _rotor(self, **overrides) -> dict: + """Build a rotor-dict with sensible defaults; override per-test.""" + rotor: dict = { + 'success': True, + 'scan': [1, 2, 3, 4], + 'pivots': [2, 3], + 'symmetry': 3, + 'type': 'HinderedRotor', + 'scan_path': self.SCAN_LOG, + 'dimensions': 1, + } + rotor.update(overrides) + return rotor + + def test_build_scan_result_happy_path(self): + """Real Gaussian scan log → fully populated scan_result dict.""" + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + self.assertEqual(result['dimension'], 1) + self.assertTrue(result['is_relaxed']) + # zero_energy_reference_hartree = min absolute energy on the curve. + self.assertIsInstance(result['zero_energy_reference_hartree'], float) + # One coordinate, dihedral, atoms 1–4, 1-based, with symmetry. + self.assertEqual(len(result['coordinates']), 1) + coord = result['coordinates'][0] + self.assertEqual(coord['coordinate_index'], 1) + self.assertEqual(coord['coordinate_kind'], 'dihedral') + self.assertEqual( + (coord['atom1_index'], coord['atom2_index'], + coord['atom3_index'], coord['atom4_index']), + (1, 2, 3, 4), + ) + self.assertEqual(coord['value_unit'], 'degree') + self.assertEqual(coord['symmetry_number'], 3) + self.assertEqual(coord['step_count'], len(result['points'])) + # Points carry index, energies, coordinate_values. Per-point + # geometries are now emitted under ``geometry.xyz_text`` so + # TCKDB can persist them into ``calc_scan_point.geometry_id``; + # geometries come straight from + # ``parse_1d_scan_full_result()['geometries']`` and are + # serialized in the TCKDB count-headered xyz convention. + self.assertGreater(len(result['points']), 0) + first = result['points'][0] + self.assertEqual(first['point_index'], 1) + self.assertEqual(first['coordinate_values'][0]['value_unit'], 'degree') + self.assertIn('relative_energy_kj_mol', first) + self.assertIn('electronic_energy_hartree', first) + self.assertNotIn('xyz', first) + # First point's relative energy ≈ 0 by zero-shift convention. 
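+        # (In this fixture the minimum-energy point is, numerically, the
+        # first scan point, leaving ~1.6e-05 kJ/mol of residue after the
+        # zero shift; hence almost-equal rather than an exact-zero check.)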
+        self.assertAlmostEqual(first['relative_energy_kj_mol'], 1.5753056e-05,
+                               places=6)
+
+    def test_build_scan_result_no_log(self):
+        """Empty scan_path → None, never an exception."""
+        rotor = self._rotor(scan_path='')
+        self.assertIsNone(_build_scan_result_for_rotor(rotor, '/tmp/project'))
+
+    def test_build_scan_result_missing_log(self):
+        """Path that doesn't resolve to a real file → None."""
+        rotor = self._rotor(scan_path='/nonexistent/does/not/exist.log')
+        self.assertIsNone(_build_scan_result_for_rotor(rotor, '/tmp/project'))
+
+    def test_build_scan_result_malformed_atom_indices(self):
+        """Non-quartet ``scan`` field → None (no fabricated atom list)."""
+        rotor = self._rotor(scan=[1, 2, 3])  # only 3 atoms
+        self.assertIsNone(_build_scan_result_for_rotor(rotor, '/tmp/project'))
+
+    def test_build_scan_result_parser_failure_returns_none(self):
+        """Exceptions in the scan-result parser surface as ``None`` (no crash)."""
+        with patch('arc.output.parse_1d_scan_full_result',
+                   side_effect=Exception('boom')):
+            result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project')
+        self.assertIsNone(result)
+
+    def test_build_scan_result_missing_relative_energies_returns_none(self):
+        """Parser returning empty energies → None, even with angles present."""
+        with patch('arc.output.parse_1d_scan_full_result',
+                   return_value={
+                       'angles_deg': [0.0, 90.0],
+                       'relative_energies_kj_mol': None,
+                       'absolute_energies_hartree': None,
+                       'zero_energy_reference_hartree': None,
+                       'geometries': None,
+                   }):
+            result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project')
+        self.assertIsNone(result)
+
+    def test_build_scan_calculations_emits_one_per_rotor(self):
+        """Two successful 1D rotors → two ``scan_rotor_<index>`` entries."""
+        spc = MagicMock()
+        spc.rotors_dict = {0: self._rotor(), 1: self._rotor()}
+        calcs = _build_scan_calculations(spc, '/tmp/project')
+        self.assertEqual(len(calcs), 2)
+        self.assertEqual(calcs[0]['key'], 'scan_rotor_0')
+        self.assertEqual(calcs[0]['type'], 'scan')
+        self.assertEqual(calcs[1]['key'], 'scan_rotor_1')
+        self.assertIn('scan_result', calcs[0])
+        self.assertEqual(calcs[0]['scan_result']['dimension'], 1)
+
+    def test_build_scan_calculations_skips_failed_rotor(self):
+        spc = MagicMock()
+        spc.rotors_dict = {
+            0: self._rotor(),               # ok
+            1: self._rotor(success=False),  # filtered
+        }
+        calcs = _build_scan_calculations(spc, '/tmp/project')
+        self.assertEqual([c['key'] for c in calcs], ['scan_rotor_0'])
+
+    def test_build_scan_calculations_skips_nd(self):
+        """ND rotors are deferred — only 1D scans are emitted today."""
+        spc = MagicMock()
+        spc.rotors_dict = {
+            0: self._rotor(),              # 1D, ok
+            1: self._rotor(dimensions=2),  # ND, skipped
+        }
+        calcs = _build_scan_calculations(spc, '/tmp/project')
+        self.assertEqual([c['key'] for c in calcs], ['scan_rotor_0'])
+
+    def test_build_scan_calculations_skips_unparseable(self):
+        """Unparseable scan (no log on disk) → no calc, no exception."""
+        spc = MagicMock()
+        spc.rotors_dict = {
+            0: self._rotor(scan_path=''),  # log missing → skipped
+            1: self._rotor(),              # ok
+        }
+        calcs = _build_scan_calculations(spc, '/tmp/project')
+        self.assertEqual([c['key'] for c in calcs], ['scan_rotor_1'])
+
+    def test_get_torsions_attaches_scan_key_when_log_present(self):
+        """``source_scan_calculation_key`` matches the scan calc key only when log resolves."""
+        spc = MagicMock()
+        spc.rotors_dict = {
+            0: {
+                'success': True,
+                'scan': [1, 2, 3, 4],
+                'pivots': [2, 3],
+                'symmetry': 3,
+                'type': 'HinderedRotor',
+                'scan_path': self.SCAN_LOG,
+                'dimensions': 1,
+            },
+            7: {  # intentional non-contiguous index — keys must use the dict key.
+                'success': True,
+                'scan': [5, 6, 7, 8],
+                'pivots': [6, 7],
+                'symmetry': 1,
+                'type': 'HinderedRotor',
+                'scan_path': '',
+                'dimensions': 1,
+            },
+        }
+        torsions = _get_torsions(spc, '/tmp/project')
+        self.assertEqual(len(torsions), 2)
+        self.assertEqual(torsions[0]['source_scan_calculation_key'], 'scan_rotor_0')
+        # Second rotor has no scan log on disk → no fabricated key.
+        self.assertIsNone(torsions[1]['source_scan_calculation_key'])
+
+    # ---- per-point scan geometries (TCKDB calc_scan_point.geometry_id) ----
+    #
+    # ARC's parser wrapper already returns aligned per-step xyz dicts.
+    # ``_build_scan_result_for_rotor`` previously dropped them; now it
+    # passes them through as ``points[i].geometry.xyz_text`` so TCKDB's
+    # bundle workflow can resolve and persist a geometry per scan point.
+
+    def _stub_parsed(self, *, n_points=3, geometries='aligned'):
+        """Build a parser-wrapper return value with controllable geometry alignment.
+
+        ``geometries`` is one of:
+        - ``'aligned'``   : list of length n_points, each a valid xyz dict.
+        - ``'mismatch'``  : list of length n_points + 1.
+        - ``'none'``      : ``None`` (parser returned no geometries).
+        - ``'malformed'`` : valid count, but one entry is malformed.
+        """
+        valid_xyz = {
+            'symbols': ('C', 'H'),
+            'isotopes': (12, 1),
+            'coords': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
+        }
+        if geometries == 'aligned':
+            geom_list = [valid_xyz for _ in range(n_points)]
+        elif geometries == 'mismatch':
+            geom_list = [valid_xyz for _ in range(n_points + 1)]
+        elif geometries == 'none':
+            geom_list = None
+        elif geometries == 'malformed':
+            geom_list = [valid_xyz for _ in range(n_points)]
+            geom_list[1] = {'symbols': (), 'isotopes': (), 'coords': ()}
+        else:
+            raise ValueError(geometries)
+        return {
+            'angles_deg': [i * 30.0 for i in range(n_points)],
+            'relative_energies_kj_mol': [0.0] * n_points,
+            'absolute_energies_hartree': [-100.0] * n_points,
+            'zero_energy_reference_hartree': -100.0,
+            'geometries': geom_list,
+        }
+
+    def test_scan_points_include_geometry_when_aligned(self):
+        """Aligned geometries → every point carries ``geometry.xyz_text``."""
+        with patch('arc.output.parse_1d_scan_full_result',
+                   return_value=self._stub_parsed(n_points=3)):
+            result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project')
+        self.assertIsNotNone(result)
+        self.assertEqual(len(result['points']), 3)
+        for point in result['points']:
+            self.assertIn('geometry', point)
+            self.assertIn('xyz_text', point['geometry'])
+            xyz_text = point['geometry']['xyz_text']
+            # TCKDB count-headered convention: "<n_atoms>\n\n<atom lines>".
+            lines = xyz_text.splitlines()
+            self.assertEqual(int(lines[0].strip()), 2,
+                             msg=f"first line must be atom count, got {lines[0]!r}")
+            # Body has the right atom count.
+ self.assertEqual(len(lines), 4) # count + comment + 2 atom rows + + def test_scan_point_geometry_uses_only_xyz_text_no_db_id(self): + """No ``geometry_id`` (or any DB id) anywhere under scan_result.""" + with patch('arc.output.parse_1d_scan_full_result', + return_value=self._stub_parsed(n_points=3)): + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + forbidden = {'geometry_id', 'existing_geometry_id', 'id'} + for point in result['points']: + geom = point.get('geometry') or {} + self.assertEqual(set(geom.keys()), {'xyz_text'}, + msg=f"geometry must carry only xyz_text, got {geom}") + for k in forbidden: + self.assertNotIn(k, point, msg=f"{k} leaked onto scan point") + self.assertNotIn(k, geom) + # Top-level scan_result also has no DB ids. + for k in forbidden: + self.assertNotIn(k, result) + + def test_scan_points_omit_geometry_when_geometries_missing(self): + """Parser returned ``geometries=None`` → no point carries a geometry, + no warning, scan_result still emitted with energies and angles.""" + with patch('arc.output.parse_1d_scan_full_result', + return_value=self._stub_parsed(n_points=3, geometries='none')): + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + for point in result['points']: + self.assertNotIn('geometry', point) + + def test_scan_points_omit_geometry_uniformly_on_length_mismatch(self): + """Length mismatch → drop geometries from ALL points (not partial) + and log a warning. The scan_result itself still uploads.""" + with patch('arc.output.parse_1d_scan_full_result', + return_value=self._stub_parsed(n_points=3, geometries='mismatch')): + with self.assertLogs('arc', level='WARNING') as cm: + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + for point in result['points']: + self.assertNotIn('geometry', point) + self.assertTrue(any('does not match scan-point count' in m for m in cm.output), + msg=f"expected mismatch warning, got: {cm.output}") + + # ---- requested scan-grid metadata (TCKDB calc_scan_coordinate fields) ---- + + def test_scan_coord_includes_step_size_from_gaussian_header(self): + """Real Gaussian scan log → step_size + resolution_degrees populated + from the parsed ModRedundant header (not from the completed-point + spacing). The fixture log has ``S N 8.0`` in its header.""" + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + coord = result['coordinates'][0] + self.assertIn('step_size', coord) + self.assertIn('resolution_degrees', coord) + # 1D dihedral torsion scan: resolution == step size. + self.assertEqual(coord['step_size'], coord['resolution_degrees']) + # ARC writes ``S 360/scan_res scan_res``; for the sBuOH fixture the + # requested step size is 8 degrees. + self.assertAlmostEqual(coord['step_size'], 8.0, places=6) + + def test_scan_coord_step_size_independent_from_completed_count(self): + """``step_count`` reflects the *completed* points, ``step_size`` the + *requested* grid — they're sourced separately and must not be + coupled (a partially-failed scan would otherwise emit a misleading + derived step_size). Spot-check both come from independent data.""" + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + coord = result['coordinates'][0] + # step_count is the point count we actually parsed; step_size is + # the requested grid spacing. Their product covers the requested + # range only when no points dropped. 
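+        # (Hypothetical failure mode for contrast: a scan that died after
+        # 30 of 46 points would still report the requested 8° step_size but
+        # a step_count of 30; deriving either field from the other would
+        # misreport the grid.)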
+ self.assertEqual(coord['step_count'], len(result['points'])) + self.assertGreater(coord['step_size'], 0.0) + + def test_scan_coord_omits_grid_metadata_for_non_gaussian(self): + """``parse_scan_args`` raising NotImplementedError (ORCA, etc.) → + step_size / resolution_degrees absent, no exception, scan_result + still produced.""" + with patch('arc.output.parse_scan_args', + side_effect=NotImplementedError('ORCA path')): + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + coord = result['coordinates'][0] + self.assertNotIn('step_size', coord) + self.assertNotIn('resolution_degrees', coord) + + def test_scan_coord_omits_grid_metadata_when_parser_raises(self): + """Generic parser failure (corrupt log, etc.) → grid fields absent, + no exception.""" + with patch('arc.output.parse_scan_args', + side_effect=RuntimeError('boom')): + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + coord = result['coordinates'][0] + self.assertNotIn('step_size', coord) + self.assertNotIn('resolution_degrees', coord) + + def test_scan_coord_omits_grid_metadata_when_step_size_zero(self): + """``parse_scan_args`` returns step_size=0 by default when the + ModRedundant block isn't matched — must be treated as 'unknown', + not as a literal 0-degree step (which would be nonsense and + violate the schema's intent).""" + stub = {'scan': [1, 2, 3, 4], 'freeze': [], 'step': 0, + 'step_size': 0, 'n_atom': 0} + with patch('arc.output.parse_scan_args', return_value=stub): + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + coord = result['coordinates'][0] + self.assertNotIn('step_size', coord) + self.assertNotIn('resolution_degrees', coord) + + def test_scan_coord_grid_metadata_does_not_affect_points(self): + """Independence: completed-point coordinate_values aren't touched + by the requested-grid plumbing.""" + with patch('arc.output.parse_scan_args', + return_value={'scan': [1, 2, 3, 4], 'freeze': [], + 'step': 36, 'step_size': 10.0, 'n_atom': 0}): + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + # Points still carry their actual coordinate_value list; step_size + # didn't propagate into per-point data. + self.assertGreater(len(result['points']), 0) + for point in result['points']: + self.assertIn('coordinate_values', point) + self.assertEqual(point['coordinate_values'][0]['value_unit'], 'degree') + + # ---- start_value / end_value (TCKDB requested-grid endpoints) ---- + # + # The dihedral is read from the input geometry the rotor scan was + # launched against. Gaussian's ModRedundant ``S`` syntax encodes + # ``end_value = start_value + step_size * (step_count - 1)``; + # we emit both values continuous (no [-180, 180] wrap) so a full + # rotation lands at start + 360, not back at start. + + @staticmethod + def _input_xyz_for_dihedral(start_dihedral_deg: float): + """Build a minimal 5-atom xyz whose 1-2-3-4 dihedral, as + measured by :func:`calculate_dihedral_angle` (the same helper + the production code uses), equals ``start_dihedral_deg`` in the + 0-360 convention. The internal rotation is offset by -270° to + compensate for the helper's right-hand-rule sign choice. 
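+
+        Sanity check (worked arithmetic, not asserted anywhere):
+        ``start_dihedral_deg=0`` places atom 4 at
+        ``(3.0, cos(-270°), sin(-270°)) = (3.0, 0.0, 1.0)``, cis to
+        atom 1's ``(0.0, 0.0, 1.0)`` offset, which reads as a 0° dihedral.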
+ """ + import math as _math + rad = _math.radians(start_dihedral_deg - 270.0) + return { + 'symbols': ('C', 'C', 'C', 'C', 'H'), + 'isotopes': (12, 12, 12, 12, 1), + 'coords': ( + (0.0, 0.0, 1.0), + (1.0, 0.0, 0.0), + (2.0, 0.0, 0.0), + (3.0, _math.cos(rad), _math.sin(rad)), + (4.0, 0.0, 0.0), + ), + } + + def test_scan_start_value_computed_from_input_geometry(self): + """Input geometry → ``start_value`` matches the dihedral on the + scan atom quartet.""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + xyz = self._input_xyz_for_dihedral(60.0) + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=xyz, + ) + coord = result['coordinates'][0] + self.assertIn('start_value', coord) + # The fixture's offset compensates for the helper's right-hand + # rule, so 60° in → 60° out. + self.assertAlmostEqual(coord['start_value'], 60.0, places=4) + + def test_scan_end_value_extends_continuously_from_start(self): + """``end_value = start_value + step_size * (step_count - 1)`` — + not wrapped, so a 46-point 8° scan from 60° lands at 60 + 360 = 420°, + NOT back at 60° and NOT mod-360'd to 60°.""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + xyz = self._input_xyz_for_dihedral(60.0) + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=xyz, + ) + coord = result['coordinates'][0] + # Real Gaussian fixture: step_size=8, len(points)=46 → +360 span. + expected_end = coord['start_value'] + coord['step_size'] * ( + coord['step_count'] - 1 + ) + self.assertAlmostEqual(coord['end_value'], expected_end, places=6) + # Not wrapped: a full-rotation scan exceeds 360°, never re-folds. + self.assertGreater(coord['end_value'], 360.0) + + def test_scan_end_value_is_not_wrapped_into_minus_180_180(self): + """Continuity contract: even when start is near 180°, the end + value must not flip sign by wrapping into [-180, 180].""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + xyz = self._input_xyz_for_dihedral(170.0) + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=xyz, + ) + coord = result['coordinates'][0] + # 170 + 360 = 530 — must NOT have folded to -190 or 170. + self.assertGreater(coord['end_value'], 360.0) + self.assertGreater(coord['end_value'] - coord['start_value'], + coord['step_size'] * 0.99) + + def test_scan_start_end_absent_when_input_geometry_missing(self): + """No ``input_xyz`` and no parser fallback → fields stay absent.""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + # Patch out both sources: input_xyz=None and parsed geometries=None. 
+ with patch('arc.output.parse_1d_scan_full_result') as p: + from arc.parser.parser import parse_1d_scan_full_result as real + parsed = real(self.SCAN_LOG) + parsed['geometries'] = None # kill the fallback + p.return_value = parsed + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=None, + ) + coord = result['coordinates'][0] + self.assertNotIn('start_value', coord) + self.assertNotIn('end_value', coord) + + def test_scan_start_end_absent_when_step_size_unknown(self): + """Without step_size we can't compute end_value, so we omit BOTH + rather than emit a half-populated range.""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + xyz = self._input_xyz_for_dihedral(60.0) + with patch('arc.output.parse_scan_args', + side_effect=NotImplementedError('non-Gaussian')): + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=xyz, + ) + coord = result['coordinates'][0] + self.assertNotIn('step_size', coord) # confirms the precondition + self.assertNotIn('start_value', coord) + self.assertNotIn('end_value', coord) + + def test_scan_start_end_absent_when_step_size_zero(self): + """``parse_scan_args`` returning step_size=0 → no end_value + possible, omit both.""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + xyz = self._input_xyz_for_dihedral(60.0) + stub = {'scan': [1, 2, 3, 4], 'freeze': [], 'step': 0, + 'step_size': 0, 'n_atom': 0} + with patch('arc.output.parse_scan_args', return_value=stub): + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=xyz, + ) + coord = result['coordinates'][0] + self.assertNotIn('start_value', coord) + self.assertNotIn('end_value', coord) + + def test_scan_start_end_absent_when_dihedral_calc_raises(self): + """A failed dihedral calculation logs a warning, omits start/end, + and does NOT abort the rest of scan_result emission.""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + xyz = self._input_xyz_for_dihedral(60.0) + with patch('arc.output.calculate_dihedral_angle', + side_effect=RuntimeError('atom missing')): + with self.assertLogs('arc', level='WARNING') as cm: + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=xyz, + ) + # scan_result is still emitted (energies + step_size + points all there). + self.assertIsNotNone(result) + coord = result['coordinates'][0] + self.assertNotIn('start_value', coord) + self.assertNotIn('end_value', coord) + self.assertIn('step_size', coord) # other grid fields untouched + self.assertGreater(len(result['points']), 0) + self.assertTrue(any('dihedral calculation failed' in m for m in cm.output), + msg=f"expected dihedral-failure warning, got: {cm.output}") + + def test_scan_point_coordinate_values_unchanged_by_start_end_addition(self): + """``points[i].coordinate_values`` must remain whatever + ``parse_1d_scan_full_result`` reported, regardless of start/end.""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + xyz = self._input_xyz_for_dihedral(60.0) + result_with = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=xyz, + ) + result_without = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=None, + ) + # Same number of points, same coordinate_values per point. 
+ self.assertEqual(len(result_with['points']), len(result_without['points'])) + for p_with, p_without in zip(result_with['points'], result_without['points']): + self.assertEqual(p_with['coordinate_values'], + p_without['coordinate_values']) + + def test_scan_falls_back_to_parsed_first_frame_when_input_xyz_missing(self): + """When ``input_xyz`` is None but the parser returned aligned + geometries, the first frame is a documented fallback for the + input dihedral (Gaussian ModRedundant freezes the scan dihedral + at the input value, so the first frame's dihedral IS the + requested start).""" + rotor = self._rotor(scan=[1, 2, 3, 4]) + result = _build_scan_result_for_rotor( + rotor, '/tmp/project', input_xyz=None, + ) + coord = result['coordinates'][0] + # Real Gaussian fixture has ``geometries`` populated, so the + # fallback resolves and start/end are emitted. + self.assertIn('start_value', coord) + self.assertIn('end_value', coord) + + def test_scan_points_omit_geometry_uniformly_on_serialization_failure(self): + """One unserializable xyz dict → drop geometries from ALL points, + warn once. Energies/angles still flow through.""" + with patch('arc.output.parse_1d_scan_full_result', + return_value=self._stub_parsed(n_points=3, geometries='malformed')): + with self.assertLogs('arc', level='WARNING') as cm: + result = _build_scan_result_for_rotor(self._rotor(), '/tmp/project') + self.assertIsNotNone(result) + # No partial coverage: nothing carries a geometry. + for point in result['points']: + self.assertNotIn('geometry', point) + self.assertTrue( + any('serialization failed' in m or 'empty text' in m for m in cm.output), + msg=f"expected serialization warning, got: {cm.output}", + ) + + +class TestScanConstraintDispatch(unittest.TestCase): + """Software-aware dispatch for rotor-scan constraint extraction. + + The scheduler stamps ``scan_software`` onto each rotor when a scan + job completes (``arc/scheduler.py``). ``_parse_scan_constraints`` + consumes that hint to call the right parser; everything else + degrades gracefully without failing payload generation. 
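+
+    Constraint entries use the shape of the sentinels below::
+
+        {'constraint_kind': 'bond', 'atoms': [1, 2], 'target_value': None}
+        {'constraint_kind': 'dihedral', 'atoms': [1, 2, 3, 4],
+         'target_value': 90.0}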
+ """ + + SCAN_LOG = os.path.join(ARC_TESTING_PATH, 'rotor_scans', 'sBuOH.out') + + def _rotor(self, **overrides) -> dict: + rotor: dict = { + 'success': True, + 'scan': [1, 2, 3, 4], + 'pivots': [2, 3], + 'symmetry': 1, + 'type': 'HinderedRotor', + 'scan_path': self.SCAN_LOG, + 'dimensions': 1, + 'scan_software': '', + } + rotor.update(overrides) + return rotor + + def test_gaussian_hint_routes_to_gaussian_parser(self): + from arc.output import _parse_scan_constraints + sentinel = [{'constraint_kind': 'bond', 'atoms': [1, 2], + 'target_value': None}] + with patch('arc.parser.adapters.gaussian.parse_gaussian_constraints', + return_value=sentinel) as gauss, \ + patch('arc.parser.adapters.orca.parse_orca_constraints') as orca: + result = _parse_scan_constraints( + self._rotor(scan_software='gaussian'), '/tmp/project', + ) + self.assertEqual(result, sentinel) + gauss.assert_called_once_with(self.SCAN_LOG) + orca.assert_not_called() + + def test_orca_hint_routes_to_orca_parser(self): + from arc.output import _parse_scan_constraints + sentinel = [{'constraint_kind': 'dihedral', + 'atoms': [1, 2, 3, 4], 'target_value': 90.0}] + with patch('arc.parser.adapters.orca.parse_orca_constraints', + return_value=sentinel) as orca, \ + patch('arc.parser.adapters.gaussian.parse_gaussian_constraints') as gauss: + result = _parse_scan_constraints( + self._rotor(scan_software='orca'), '/tmp/project', + ) + self.assertEqual(result, sentinel) + orca.assert_called_once_with(self.SCAN_LOG) + gauss.assert_not_called() + + def test_missing_software_falls_back_to_gaussian(self): + # Empty / missing ``scan_software`` preserves the historical + # behavior: try Gaussian (the only software with ModRedundant + # emission). Restart files written before this field landed + # therefore keep producing constraints rather than silently + # losing them. + from arc.output import _parse_scan_constraints + with patch('arc.parser.adapters.gaussian.parse_gaussian_constraints', + return_value=[]) as gauss: + rotor_no_field = self._rotor() + rotor_no_field.pop('scan_software', None) + _parse_scan_constraints(rotor_no_field, '/tmp/project') + _parse_scan_constraints(self._rotor(scan_software=''), '/tmp/project') + self.assertEqual(gauss.call_count, 2) + + def test_unknown_software_returns_empty_list_no_parser_call(self): + from arc.output import _parse_scan_constraints + with patch('arc.parser.adapters.gaussian.parse_gaussian_constraints') as gauss, \ + patch('arc.parser.adapters.orca.parse_orca_constraints') as orca: + result = _parse_scan_constraints( + self._rotor(scan_software='qchem'), '/tmp/project', + ) + self.assertEqual(result, []) + gauss.assert_not_called() + orca.assert_not_called() + + def test_parser_exception_degrades_to_empty_list(self): + from arc.output import _parse_scan_constraints + with patch('arc.parser.adapters.gaussian.parse_gaussian_constraints', + side_effect=RuntimeError('parser crashed')): + result = _parse_scan_constraints( + self._rotor(scan_software='gaussian'), '/tmp/project', + ) + self.assertEqual(result, []) + + def test_missing_scan_path_returns_empty_list(self): + # Defensive: never invoke a parser without a real path. 
+ from arc.output import _parse_scan_constraints + rotor = self._rotor(scan_path='', scan_software='gaussian') + with patch('arc.parser.adapters.gaussian.parse_gaussian_constraints') as gauss: + self.assertEqual(_parse_scan_constraints(rotor, '/tmp/project'), []) + gauss.assert_not_called() + + if __name__ == '__main__': unittest.main() diff --git a/arc/parser/adapter.py b/arc/parser/adapter.py index 4f91dfda3e..a09139acec 100644 --- a/arc/parser/adapter.py +++ b/arc/parser/adapter.py @@ -198,6 +198,22 @@ def parse_polarizability(self) -> float | None: """ pass + def parse_1d_scan_energies_hartree(self) -> tuple[list[float] | None, list[float] | None]: + """ + Parse the 1D torsion scan absolute electronic energies in Hartree. + + Default returns ``(None, None)`` so adapters that haven't implemented + Hartree-preserving extraction don't break the new + :func:`parse_1d_scan_full_result` wrapper. Callers must treat absence as + "absolute energies unavailable for this ESS" and not as an error. + + Returns: tuple[list[float] | None, list[float] | None] + The absolute electronic energy in Hartree and the dihedral scan angle + in degrees, with the same point-filtering applied as + :meth:`parse_1d_scan_energies`. + """ + return None, None + def parse_opt_steps(self) -> int | None: """ Parse the number of geometry optimization steps from an opt job output file. diff --git a/arc/parser/adapters/gaussian.py b/arc/parser/adapters/gaussian.py index aa2c223826..0e011918ab 100644 --- a/arc/parser/adapters/gaussian.py +++ b/arc/parser/adapters/gaussian.py @@ -5,10 +5,11 @@ from abc import ABC import numpy as np +import os import pandas as pd import re -from arc.common import SYMBOL_BY_NUMBER, is_same_pivot +from arc.common import SYMBOL_BY_NUMBER, is_same_pivot, get_logger from arc.constants import E_h_kJmol, bohr_to_angstrom from arc.species.converter import str_to_xyz, xyz_from_data from arc.parser.adapter import ESSAdapter @@ -16,6 +17,9 @@ from arc.parser.parser import _get_lines_from_file +logger = get_logger() + + class GaussianParser(ESSAdapter, ABC): """ A class for parsing Gaussian log files. @@ -322,19 +326,23 @@ def parse_zpe_correction(self) -> float | None: return zpe_hartree * E_h_kJmol return None - def parse_1d_scan_energies(self) -> tuple[list[float] | None, list[float] | None]: + def _parse_1d_scan_walk(self): """ - Parse the 1D torsion scan energies from an ESS log file. - - Returns: tuple[list[float] | None, list[float] | None] - The electronic energy in kJ/mol and the dihedral scan angle in degrees. + Walk the log once and return ``(vlist_hartree, angle_list, opt_freq, non_optimized)``, + or ``(None, None, _, _)`` on parse failure. ``vlist_hartree`` is the raw list of + absolute SCF energies (Hartree) for every point recorded by the optimizer + (including those flagged as non-optimized); ``angle_list`` is the matching list + of dihedral angles (degrees), already shifted to start at zero. The two filter + signals (``opt_freq``, ``non_optimized``) are applied by the caller so that the + legacy zero-against-full-min behavior of :meth:`parse_1d_scan_energies` is + preserved bit-for-bit. 
""" opt_freq = False rigid_scan = False energy = None - vlist = [] - non_optimized = [] - angle = [] + vlist: list[float] = [] + non_optimized: list[int] = [] + angle: list[float] = [] scan_pivot_atoms = self.load_scan_pivot_atoms() internal_coord = f"D({','.join(str(i) for i in scan_pivot_atoms)})" @@ -364,37 +372,75 @@ def parse_1d_scan_energies(self) -> tuple[list[float] | None, list[float] | None continue if not vlist: - return None, None + return None, None, opt_freq, non_optimized if rigid_scan: try: scan_angle_resolution_deg = self.load_scan_angle() except AttributeError: - return None, None - angle = [i * scan_angle_resolution_deg for i in range(len(vlist))] + return None, None, opt_freq, non_optimized + angle_list = [i * scan_angle_resolution_deg for i in range(len(vlist))] else: - angle = np.array(angle, float) - if len(angle) != len(vlist): - return None, None - angle -= angle[0] - angle[angle < 0] += 360.0 - if len(angle) > 1 and angle[-1] < 2 * (angle[1] - angle[0]): - angle[-1] += 360.0 - angle = angle.tolist() - - vlist = np.array(vlist, float) - vlist -= np.min(vlist) - vlist *= E_h_kJmol + angle_arr = np.array(angle, float) + if len(angle_arr) != len(vlist): + return None, None, opt_freq, non_optimized + angle_arr -= angle_arr[0] + angle_arr[angle_arr < 0] += 360.0 + if len(angle_arr) > 1 and angle_arr[-1] < 2 * (angle_arr[1] - angle_arr[0]): + angle_arr[-1] += 360.0 + angle_list = angle_arr.tolist() + + return vlist, angle_list, opt_freq, non_optimized + + def parse_1d_scan_energies(self) -> tuple[list[float] | None, list[float] | None]: + """ + Parse the 1D torsion scan energies from an ESS log file. + + Returns: tuple[list[float] | None, list[float] | None] + The electronic energy in kJ/mol and the dihedral scan angle in degrees. + """ + vlist, angle, opt_freq, non_optimized = self._parse_1d_scan_walk() + if vlist is None: + return None, None + + # Preserve legacy ordering: zero against the full vlist's min, then convert, + # then apply opt_freq trim, then drop non-optimized indices. + v = np.array(vlist, float) + v -= np.min(v) + v *= E_h_kJmol if opt_freq: - vlist = vlist[:-1] + v = v[:-1] angle = angle[:-1] if non_optimized: - vlist = np.delete(vlist, non_optimized) + v = np.delete(v, non_optimized) angle = np.delete(angle, non_optimized) - return vlist.tolist(), angle + return v.tolist(), angle + + def parse_1d_scan_energies_hartree(self) -> tuple[list[float] | None, list[float] | None]: + """ + Parse the 1D torsion scan absolute electronic energies in Hartree. + + Returns: tuple[list[float] | None, list[float] | None] + The absolute electronic energy in Hartree and the dihedral scan angle + in degrees, with the same point-filtering applied as + :meth:`parse_1d_scan_energies` (opt_freq tail dropped, non-optimized + indices removed). Returns ``(None, None)`` on parse failure. + """ + vlist, angle, opt_freq, non_optimized = self._parse_1d_scan_walk() + if vlist is None: + return None, None + if opt_freq: + vlist = vlist[:-1] + angle = angle[:-1] + if non_optimized: + drop = set(non_optimized) + keep = [i for i in range(len(vlist)) if i not in drop] + vlist = [vlist[i] for i in keep] + angle = [angle[i] for i in keep] + return vlist, angle def parse_1d_scan_coords(self) -> list[dict[str, tuple]] | None: """ @@ -491,6 +537,159 @@ def parse_irc_traj(self) -> list[dict[str, tuple]] | None: i += 1 return traj if traj else None + def parse_irc_path(self) -> list[dict] | None: + """ + Parse the IRC path with per-point structured data. 
+ + Walks the Gaussian log once and emits one record per converged + IRC point that carries a CURRENT STRUCTURE block. Records are in + file order — the TS seed (Point Number: 0) has no structure block + in Gaussian logs and is therefore not emitted; the caller is + expected to supply a TS reference energy separately. + + Returns: list[dict] | None + A list of point dicts. Keys (any may be ``None`` if absent + from the log): + + - ``point_number`` (int): Gaussian's per-branch index. + - ``direction`` (str | None): ``'forward'`` / ``'reverse'``, + taken from the ``Point Number N in FORWARD/REVERSE path + direction.`` announcement that precedes the converged + block. ``None`` if no announcement was seen yet. + - ``electronic_energy_hartree`` (float | None): the most + recent ``SCF Done`` energy preceding the converged block. + - ``max_gradient`` (float | None): max Cartesian force + (Hartrees/Bohr). + - ``rms_gradient`` (float | None): RMS Cartesian force. + - ``reaction_coordinate`` (float | None): ``NET REACTION + COORDINATE UP TO THIS POINT`` (sqrt(amu)*bohr in + Gaussian's mass-weighted convention). + - ``xyz`` (dict | None): the parsed Cartesian geometry, + in ARC's standard xyz dict shape. + """ + lines = _get_lines_from_file(self.log_file_path) + num_pat = r"[-+]?\d*\.?\d+(?:[EDed][-+]?\d+)?" + energy_re = re.compile(r"SCF Done:\s+E\([^)]*\)\s*=\s*(" + num_pat + r")") + forces_re = re.compile( + r"Cartesian Forces:\s+Max\s+(" + num_pat + r")\s+RMS\s+(" + num_pat + r")" + ) + dir_re = re.compile( + r"Point Number\s+\d+\s+in\s+(FORWARD|REVERSE)\s+path direction" + ) + point_re = re.compile(r"Point Number:\s+(\d+)\s+Path Number:\s+(\d+)") + rc_re = re.compile( + r"NET REACTION COORDINATE UP TO THIS POINT\s*=\s*(" + num_pat + r")" + ) + + def _to_float(text: str) -> float | None: + try: + return float(text.replace('D', 'E').replace('d', 'e')) + except (ValueError, TypeError): + return None + + points: list[dict] = [] + cur_dir: str | None = None + last_energy: float | None = None + last_max_grad: float | None = None + last_rms_grad: float | None = None + + i = 0 + n = len(lines) + while i < n: + line = lines[i] + m = energy_re.search(line) + if m: + last_energy = _to_float(m.group(1)) + i += 1 + continue + m = forces_re.search(line) + if m: + last_max_grad = _to_float(m.group(1)) + last_rms_grad = _to_float(m.group(2)) + i += 1 + continue + m = dir_re.search(line) + if m: + cur_dir = m.group(1).lower() + i += 1 + continue + m = point_re.search(line) + if m: + point_num = int(m.group(1)) + # Look for CURRENT STRUCTURE within a small window. Gaussian + # emits the converged-point block as + # Point Number: N Path Number: M + # CURRENT STRUCTURE + # Cartesian Coordinates (Ang): + # Point 0 (the TS seed) has no CURRENT STRUCTURE block — + # we skip it; the caller supplies a TS reference energy + # outside of this parser. + j = i + 1 + window_end = min(j + 6, n) + struct_start = None + while j < window_end: + if 'CURRENT STRUCTURE' in lines[j]: + struct_start = j + break + j += 1 + if struct_start is None: + i += 1 + continue + # Walk past two dashed boundary lines, then read coord + # rows (atom_index atomic_number x y z) until the closing + # dashed line. 
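+                # Illustrative block layout (header text and spacing vary
+                # by Gaussian revision; only the row shape matters here):
+                #     CURRENT STRUCTURE
+                #    ----------------------------------------------------
+                #     CENTER   ATOMIC            COORDINATES
+                #    ----------------------------------------------------
+                #      1         6       0.000000   0.000000   0.000000
+                #    ----------------------------------------------------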
+ k = struct_start + 1 + dash_count = 0 + while k < n and dash_count < 2: + if '----' in lines[k]: + dash_count += 1 + k += 1 + coords: list[list[float]] = [] + numbers: list[int] = [] + while k < n and '----' not in lines[k]: + parts = lines[k].split() + if len(parts) >= 5: + try: + atomic_num = int(parts[1]) + x, y, z = float(parts[2]), float(parts[3]), float(parts[4]) + except (ValueError, IndexError): + k += 1 + continue + coords.append([x, y, z]) + numbers.append(atomic_num) + k += 1 + xyz = ( + xyz_from_data(coords=np.array(coords), numbers=numbers) + if coords and numbers + else None + ) + # NET REACTION COORDINATE shows up within ~6 lines after + # the closing dashed boundary; cap the lookahead so we + # never spill into the next point's block. + rc: float | None = None + rc_end = min(k + 8, n) + p = k + while p < rc_end: + rc_match = rc_re.search(lines[p]) + if rc_match: + rc = _to_float(rc_match.group(1)) + break + p += 1 + points.append({ + "point_number": point_num, + "direction": cur_dir, + "electronic_energy_hartree": last_energy, + "max_gradient": last_max_grad, + "rms_gradient": last_rms_grad, + "reaction_coordinate": rc, + "xyz": xyz, + }) + i = p + 1 if p < rc_end else k + continue + i += 1 + + return points or None + def parse_scan_conformers(self) -> pd.DataFrame | None: """ Parse all internal coordinates of scan conformers into a DataFrame. @@ -981,6 +1180,240 @@ def search(x, y): return blks +_GAUSSIAN_LETTER_TO_TCKDB_KIND: dict[str, tuple[str, int]] = { + 'X': ('cartesian_atom', 1), + 'B': ('bond', 2), + 'A': ('angle', 3), + 'D': ('dihedral', 4), +} + +# Letters Gaussian ModRedundant uses for non-constraint coordinate types +# (linear bend, out-of-plane bookkeeping). Recognised so the caller logs +# at debug rather than warning. +_GAUSSIAN_NON_CONSTRAINT_LETTERS: frozenset[str] = frozenset({'L', 'O'}) + + +def _gaussian_letter_to_tckdb_kind(letter: str, n_atoms: int) -> str | None: + """Map a Gaussian ModRedundant coordinate letter to a TCKDB constraint kind. + + Returns None for ModRedundant coordinate types that TCKDB does not model + as calculation constraints (L/O), unknown letters, or any letter whose + arity does not match the atom-count in the parsed line. + """ + entry = _GAUSSIAN_LETTER_TO_TCKDB_KIND.get(letter) + if entry is None: + return None + kind, expected_n = entry + if n_atoms != expected_n: + return None + return kind + + +def parse_gaussian_constraints(file_path: str) -> list[dict]: + """Parse held-fixed coordinate constraints from a Gaussian input deck or log. + + Reads either a Gaussian input deck (the ``.gjf``-style file ARC writes + via ``arc/job/adapters/gaussian.py``) or a Gaussian log file's + ``ModRedundant input section has been read:`` block. Returns one record + per ``F`` (frozen) coordinate; ``S`` (scan) coordinates are deliberately + excluded — those belong in ``scan_result.coordinates[]``, not in + ``calculation.constraints[]``. + + Each record has the shape:: + + { + 'constraint_kind': 'cartesian_atom' | 'bond' | 'angle' | 'dihedral', + 'atoms': [int, ...], # 1-based atom indices + 'target_value': float | None, # None when no value parsed + } + + The function never raises on malformed input: unparseable lines are + skipped and logged at warning level. Returns ``[]`` when the file + doesn't exist or contains no recognised constraints. 
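+
+    Illustrative mapping (hypothetical deck lines)::
+
+        B 1 2 F             → {'constraint_kind': 'bond', 'atoms': [1, 2], 'target_value': None}
+        D 1 2 3 4 = 180.0 F → {'constraint_kind': 'dihedral', 'atoms': [1, 2, 3, 4], 'target_value': 180.0}
+        D 1 2 3 4 S 36 10.0 → skipped (scan coordinate, not a constraint)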
+ """ + try: + lines = _read_modredundant_block(file_path) + except (OSError, IOError) as exc: + logger.warning("parse_gaussian_constraints: cannot read %s: %s", + file_path, exc) + return [] + + constraints: list[dict] = [] + for raw in lines: + record = _parse_gaussian_constraint_line(raw) + if record is not None: + constraints.append(record) + return constraints + + +def _read_modredundant_block(file_path: str) -> list[str]: + """Return candidate ModRedundant lines from a Gaussian input deck OR log file. + + Heuristic: if the file contains + ``"The following ModRedundant input section has been read:"`` (a log), + return the lines between that marker and the next blank line / known + end-of-block sentinel. Otherwise treat the file as an input deck and + return every line that begins with a constraint-coordinate letter + (B/A/D/X/L/O) — Gaussian decks place ModRedundant lines after a blank + line at the bottom of the file, but we don't depend on the exact + layout. + """ + with open(file_path, 'r') as f: + all_lines = f.readlines() + + log_marker = 'The following ModRedundant input section has been read:' + for idx, line in enumerate(all_lines): + if log_marker in line: + block: list[str] = [] + for follow in all_lines[idx + 1:]: + stripped = follow.strip() + if not stripped: + break + # Gaussian's log echoes the block then prints either a + # blank line or a non-constraint line; the leading-letter + # filter below also catches the Isotopes/GradGrad sentinels. + first = stripped.split()[0].upper() + if first not in _GAUSSIAN_LETTER_TO_TCKDB_KIND \ + and first not in _GAUSSIAN_NON_CONSTRAINT_LETTERS: + break + block.append(stripped) + return block + + # No ModRedundant marker. Only run the deck-line heuristic on actual + # input decks — applying it to a log file scans every Berny optimizer + # diagnostic / banner line whose first token is a single letter and + # floods the user with false-positive warnings. + if os.path.splitext(file_path)[1].lower() not in ('.gjf', '.com'): + return [] + + # Input deck path: pick lines whose first token is a coordinate letter. + deck_lines: list[str] = [] + for line in all_lines: + stripped = line.strip() + if not stripped: + continue + first = stripped.split()[0].upper() + if first in _GAUSSIAN_LETTER_TO_TCKDB_KIND \ + or first in _GAUSSIAN_NON_CONSTRAINT_LETTERS: + deck_lines.append(stripped) + return deck_lines + + +def _parse_gaussian_constraint_line(line: str) -> dict | None: + """Parse one ModRedundant line into a constraint record, or None. + + Honours: + - Only ``F`` (frozen) coordinates are emitted as constraints; + ``S`` (scan) and ``B`` (build/define) action codes return None. + - Lines without an explicit action code default to ``F`` (Gaussian's + implicit-freeze convention; mirrors the existing ``_load_scan_specs`` + logic in this file). + - Optional target value preceding the action code is preserved + when present and parseable; absent or unparseable values yield + ``target_value=None``. 
+    """
+    tokens = line.split()
+    if not tokens:
+        return None
+    letter = tokens[0].upper()
+
+    if letter in _GAUSSIAN_NON_CONSTRAINT_LETTERS:
+        logger.debug("parse_gaussian_constraints: skipping non-constraint "
+                     "coordinate type %s in line: %s", letter, line)
+        return None
+
+    if letter not in _GAUSSIAN_LETTER_TO_TCKDB_KIND:
+        logger.warning("parse_gaussian_constraints: unknown ModRedundant "
+                       "letter %s in line: %s", letter, line)
+        return None
+
+    _kind_name, expected_n = _GAUSSIAN_LETTER_TO_TCKDB_KIND[letter]
+
+    # Atom indices are tokens[1 : 1 + expected_n] when the line is well-formed.
+    if len(tokens) < 1 + expected_n:
+        logger.warning("parse_gaussian_constraints: line has too few tokens "
+                       "for letter %s (expected %d atoms): %s",
+                       letter, expected_n, line)
+        return None
+
+    try:
+        atoms = [int(tok) for tok in tokens[1:1 + expected_n]]
+    except ValueError:
+        logger.warning("parse_gaussian_constraints: non-integer atom index "
+                       "in line: %s", line)
+        return None
+
+    kind = _gaussian_letter_to_tckdb_kind(letter, len(atoms))
+    if kind is None:
+        logger.warning("parse_gaussian_constraints: arity mismatch for "
+                       "letter %s with %d atoms in line: %s",
+                       letter, len(atoms), line)
+        return None
+
+    # Action code + target value are after the atoms. The tail shape is one of:
+    #   (empty)         → implicit F
+    #   F               → explicit F, no value
+    #   = <value> F     → '=' separated value
+    #   <value> F       → bare numeric value
+    #   S <n> <step>    → scan, skip
+    rest = tokens[1 + expected_n:]
+    action, target_value = _extract_modredundant_action(rest)
+
+    if action == 'S':
+        # Scan coordinate; not a held constraint.
+        return None
+    if action == 'B':
+        # Build/define-only; not a held constraint.
+        return None
+    if action != 'F':
+        # Anything we don't recognise (K/H/R/...): conservatively skip.
+        logger.debug("parse_gaussian_constraints: skipping line with action "
+                     "%r: %s", action, line)
+        return None
+
+    return {
+        'constraint_kind': kind,
+        'atoms': atoms,
+        'target_value': target_value,
+    }
+
+
+def _extract_modredundant_action(rest: list[str]) -> tuple[str, float | None]:
+    """Read the action code + optional target value from the tail of a ModRedundant line.
+
+    ``rest`` is the slice of tokens *after* the atom indices. Returns
+    ``(action, target_value)`` where ``action`` defaults to ``'F'`` when
+    the tail is empty (Gaussian's implicit-freeze convention).
+    """
+    target_value: float | None = None
+    action = 'F'
+    if not rest:
+        return action, target_value
+
+    # Strip leading '=' if present ("D 1 2 3 4 = 180.0 F" form).
+    cleaned: list[str] = []
+    for tok in rest:
+        if tok == '=':
+            continue
+        # Token like "=180.0" — split off the leading '='.
+        if tok.startswith('=') and len(tok) > 1:
+            cleaned.append(tok[1:])
+        else:
+            cleaned.append(tok)
+
+    # If a numeric token precedes the action code, that's the target value.
+    for tok in cleaned:
+        if tok.upper() in {'F', 'S', 'B', 'K', 'H', 'R', 'D', 'A'}:
+            action = tok.upper()
+            break
+        try:
+            target_value = float(tok)
+        except ValueError:
+            # Unparseable token before the action — leave value as-is.
+            continue
+    return action, target_value
+
+
 def parse_scan_args(file_path: str) -> dict:
     """
     Get the scan arguments, including which internal coordinates (IC) are being scanned, which are frozen,
diff --git a/arc/parser/adapters/orca.py b/arc/parser/adapters/orca.py
index 225d65ddc0..0e143d4f9c 100644
--- a/arc/parser/adapters/orca.py
+++ b/arc/parser/adapters/orca.py
@@ -8,7 +8,7 @@
 import pandas as pd
 import re
 
-from arc.common import SYMBOL_BY_NUMBER
+from arc.common import SYMBOL_BY_NUMBER, get_logger
 from arc.constants import E_h_kJmol, bohr_to_angstrom
 from arc.species.converter import str_to_xyz, xyz_from_data
 from arc.parser.adapter import ESSAdapter
@@ -16,6 +16,9 @@
 from arc.parser.parser import _get_lines_from_file
 
 
+logger = get_logger()
+
+
 class OrcaParser(ESSAdapter, ABC):
     """
     A class for parsing Orca log files.
@@ -362,4 +365,120 @@ def parse_ess_version(self) -> str | None:
         return None
 
 
+_ORCA_LETTER_TO_TCKDB_KIND: dict[str, tuple[str, int]] = {
+    'C': ('cartesian_atom', 1),
+    'B': ('bond', 2),
+    'A': ('angle', 3),
+    'D': ('dihedral', 4),
+}
+
+
+def parse_orca_constraints(file_path: str) -> list[dict]:
+    """Parse held-fixed constraints from an ORCA input deck (best-effort).
+
+    Recognises the standard ORCA ``%geom Constraints`` block::
+
+        %geom Constraints
+            { B 0 1 1.4 C }
+            { A 0 1 2 90.0 C }
+            { D 0 1 2 3 180.0 C }
+            { C 0 C }
+        end
+
+    Notes / known limitations:
+    - ORCA atom indices in the input deck are 0-based; this parser
+      converts them to TCKDB's 1-based convention at the boundary.
+    - ARC's ORCA adapter does not currently emit ``%geom Constraints``
+      blocks (only ``%geom Scan``). This parser is therefore mainly
+      defensive — it handles user-supplied decks and any future ARC
+      emission. Scan blocks are *not* parsed as constraints.
+    - Variants like ``optimize { B i j C }`` (single-coordinate form)
+      and ``Constraints`` blocks scattered across multiple ``%geom``
+      sections are recognised; everything else is ignored with a
+      debug log rather than failing the whole parse.
+
+    Returns ``[]`` on file read errors or when no recognised
+    ``Constraints`` block is found.
+    """
+    try:
+        with open(file_path, 'r') as f:
+            text = f.read()
+    except (OSError, IOError) as exc:
+        logger.warning("parse_orca_constraints: cannot read %s: %s",
+                       file_path, exc)
+        return []
+
+    constraints: list[dict] = []
+    # Find every Constraints block: from 'Constraints' up to the matching
+    # 'end' (case-insensitive). ORCA blocks are not nested.
+    pattern = re.compile(r'Constraints(.*?)end', re.IGNORECASE | re.DOTALL)
+    for match in pattern.finditer(text):
+        block = match.group(1)
+        for raw in block.splitlines():
+            record = _parse_orca_constraint_line(raw)
+            if record is not None:
+                constraints.append(record)
+    return constraints
+
+
+def _parse_orca_constraint_line(line: str) -> dict | None:
+    """Parse one ``{ B i j v C }``-style ORCA constraint line into a record.
+
+    ORCA constraint syntax inside ``%geom Constraints``::
+
+        { <letter> <atom indices> [<value>] C }
+
+    The trailing ``C`` flags the coordinate as constrained. ``<value>`` is
+    optional. Atom indices are converted from 0-based (ORCA) to 1-based
+    (TCKDB). Unparseable lines return None and are skipped (logged only at
+    debug level) so the rest of the block still parses.
+    """
+    stripped = line.strip()
+    if not stripped or stripped.startswith('#'):
+        return None
+    # Tolerate either '{' or no braces (rare hand-written variants).
+ inner = stripped.strip('{}').strip() + if not inner: + return None + tokens = inner.split() + if len(tokens) < 2: + return None + letter = tokens[0].upper() + entry = _ORCA_LETTER_TO_TCKDB_KIND.get(letter) + if entry is None: + logger.debug("parse_orca_constraints: skipping unrecognised letter " + "%s in line: %s", letter, line) + return None + kind, expected_n = entry + + if len(tokens) < 1 + expected_n: + logger.debug("parse_orca_constraints: too few atom tokens for letter " + "%s (need %d): %s", letter, expected_n, line) + return None + + try: + zero_based = [int(tok) for tok in tokens[1:1 + expected_n]] + except ValueError: + logger.debug("parse_orca_constraints: non-integer atom index in: %s", + line) + return None + atoms = [a + 1 for a in zero_based] + + target_value: float | None = None + rest = tokens[1 + expected_n:] + for tok in rest: + if tok.upper() == 'C': + break + try: + target_value = float(tok) + except ValueError: + continue + + return { + 'constraint_kind': kind, + 'atoms': atoms, + 'target_value': target_value, + } + + register_ess_adapter('orca', OrcaParser) diff --git a/arc/parser/constraints_test.py b/arc/parser/constraints_test.py new file mode 100644 index 0000000000..9ae9327584 --- /dev/null +++ b/arc/parser/constraints_test.py @@ -0,0 +1,247 @@ +"""Tests for the Gaussian + ORCA constraint parsers.""" + +import os +import tempfile +import unittest + +from arc.parser.adapters.gaussian import ( + _gaussian_letter_to_tckdb_kind, + parse_gaussian_constraints, +) +from arc.parser.adapters.orca import parse_orca_constraints + + +def _write(text: str) -> str: + """Write ``text`` to a NamedTemporaryFile and return the path. + + The caller is responsible for cleanup; tests use ``addCleanup``. + """ + fd, path = tempfile.mkstemp(suffix='.gjf') + os.close(fd) + with open(path, 'w') as f: + f.write(text) + return path + + +class TestGaussianLetterClassifier(unittest.TestCase): + """Direct tests of the letter → constraint_kind helper.""" + + def test_known_letters_with_correct_arity(self): + self.assertEqual(_gaussian_letter_to_tckdb_kind('X', 1), 'cartesian_atom') + self.assertEqual(_gaussian_letter_to_tckdb_kind('B', 2), 'bond') + self.assertEqual(_gaussian_letter_to_tckdb_kind('A', 3), 'angle') + self.assertEqual(_gaussian_letter_to_tckdb_kind('D', 4), 'dihedral') + + def test_letters_with_wrong_arity_return_none(self): + self.assertIsNone(_gaussian_letter_to_tckdb_kind('B', 3)) + self.assertIsNone(_gaussian_letter_to_tckdb_kind('A', 2)) + self.assertIsNone(_gaussian_letter_to_tckdb_kind('D', 3)) + self.assertIsNone(_gaussian_letter_to_tckdb_kind('X', 2)) + + def test_non_constraint_letters_return_none(self): + self.assertIsNone(_gaussian_letter_to_tckdb_kind('L', 4)) + self.assertIsNone(_gaussian_letter_to_tckdb_kind('O', 3)) + + def test_unknown_letter_returns_none(self): + self.assertIsNone(_gaussian_letter_to_tckdb_kind('Q', 2)) + self.assertIsNone(_gaussian_letter_to_tckdb_kind('', 2)) + + +class TestGaussianInputDeckConstraints(unittest.TestCase): + """Parsing held-fixed constraints from Gaussian input decks.""" + + def setUp(self): + self._paths: list[str] = [] + + def tearDown(self): + for p in self._paths: + try: + os.remove(p) + except OSError: + pass + + def _deck(self, body: str) -> str: + path = _write(body) + self._paths.append(path) + return path + + def test_bond_freeze(self): + path = self._deck("B 1 2 F\n") + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'bond', 'atoms': [1, 2], 'target_value': None}, + ]) 
+ + def test_angle_freeze(self): + path = self._deck("A 1 2 3 F\n") + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'angle', 'atoms': [1, 2, 3], 'target_value': None}, + ]) + + def test_dihedral_freeze(self): + path = self._deck("D 1 2 3 4 F\n") + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'dihedral', 'atoms': [1, 2, 3, 4], 'target_value': None}, + ]) + + def test_cartesian_atom_freeze(self): + path = self._deck("X 7 F\n") + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'cartesian_atom', 'atoms': [7], 'target_value': None}, + ]) + + def test_target_value_with_equals_sign(self): + path = self._deck("D 1 2 3 4 = 180.0 F\n") + result = parse_gaussian_constraints(path) + self.assertEqual(len(result), 1) + self.assertAlmostEqual(result[0]['target_value'], 180.0) + self.assertEqual(result[0]['constraint_kind'], 'dihedral') + + def test_target_value_bare(self): + path = self._deck("B 1 2 1.45 F\n") + result = parse_gaussian_constraints(path) + self.assertEqual(len(result), 1) + self.assertAlmostEqual(result[0]['target_value'], 1.45) + + def test_implicit_freeze_no_action_code(self): + # ARC's ``_load_scan_specs`` treats no-action-code lines as F. + path = self._deck("B 1 2\n") + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'bond', 'atoms': [1, 2], 'target_value': None}, + ]) + + def test_scan_line_is_not_a_constraint(self): + # An ``S`` line is the active scan coordinate — must NOT appear in + # the constraint list. It belongs in scan_result.coordinates[]. + path = self._deck("D 1 2 3 4 S 36 10.0\n") + self.assertEqual(parse_gaussian_constraints(path), []) + + def test_mixed_scan_and_freeze(self): + path = self._deck( + "D 1 2 3 4 S 36 10.0\n" + "B 5 6 F\n" + "A 7 8 9 F\n" + ) + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'bond', 'atoms': [5, 6], 'target_value': None}, + {'constraint_kind': 'angle', 'atoms': [7, 8, 9], 'target_value': None}, + ]) + + def test_linear_bend_letter_skipped(self): + path = self._deck( + "L 1 2 3 4 F\n" + "B 5 6 F\n" + ) + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'bond', 'atoms': [5, 6], 'target_value': None}, + ]) + + def test_malformed_line_skipped(self): + # Letter B but only one atom token; must be skipped, not raise. 
+ path = self._deck( + "B 1 F\n" + "A 1 2 3 F\n" + ) + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'angle', 'atoms': [1, 2, 3], 'target_value': None}, + ]) + + def test_unknown_letter_skipped(self): + path = self._deck( + "Q 1 2 F\n" + "B 3 4 F\n" + ) + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'bond', 'atoms': [3, 4], 'target_value': None}, + ]) + + def test_missing_file_returns_empty_list(self): + self.assertEqual(parse_gaussian_constraints('/nonexistent/path.gjf'), []) + + +class TestGaussianLogConstraints(unittest.TestCase): + """Parsing held-fixed constraints from Gaussian log files.""" + + def test_modredundant_block_in_log(self): + log_body = ( + " Some preamble\n" + " ...\n" + " The following ModRedundant input section has been read:\n" + " D 4 1 2 5 S 36 10.000\n" + " B 1 2 F\n" + " A 3 4 5 F\n" + "\n" + " Isotopes and Nuclear Properties:\n" + ) + fd, path = tempfile.mkstemp(suffix='.log') + os.close(fd) + self.addCleanup(os.remove, path) + with open(path, 'w') as f: + f.write(log_body) + + result = parse_gaussian_constraints(path) + self.assertEqual(result, [ + {'constraint_kind': 'bond', 'atoms': [1, 2], 'target_value': None}, + {'constraint_kind': 'angle', 'atoms': [3, 4, 5], 'target_value': None}, + ]) + + +class TestOrcaConstraints(unittest.TestCase): + """Defensive ORCA constraint parsing — atom indices converted to 1-based.""" + + def setUp(self): + self._paths: list[str] = [] + + def tearDown(self): + for p in self._paths: + try: + os.remove(p) + except OSError: + pass + + def _deck(self, body: str) -> str: + fd, path = tempfile.mkstemp(suffix='.in') + os.close(fd) + self._paths.append(path) + with open(path, 'w') as f: + f.write(body) + return path + + def test_full_constraints_block(self): + path = self._deck( + "! B3LYP def2-SVP Opt\n" + "%geom Constraints\n" + " { B 0 1 1.4 C }\n" + " { A 0 1 2 90.0 C }\n" + " { D 0 1 2 3 180.0 C }\n" + " { C 4 C }\n" + "end\n" + "* xyz 0 1\n" + ) + result = parse_orca_constraints(path) + # ORCA is 0-based; TCKDB is 1-based. + self.assertEqual(result, [ + {'constraint_kind': 'bond', 'atoms': [1, 2], 'target_value': 1.4}, + {'constraint_kind': 'angle', 'atoms': [1, 2, 3], 'target_value': 90.0}, + {'constraint_kind': 'dihedral', 'atoms': [1, 2, 3, 4], 'target_value': 180.0}, + {'constraint_kind': 'cartesian_atom', 'atoms': [5], 'target_value': None}, + ]) + + def test_no_constraints_block(self): + path = self._deck("! B3LYP def2-SVP Opt\n* xyz 0 1\n") + self.assertEqual(parse_orca_constraints(path), []) + + def test_missing_file(self): + self.assertEqual(parse_orca_constraints('/nonexistent/path.in'), []) + + +if __name__ == '__main__': + unittest.main() diff --git a/arc/parser/parser.py b/arc/parser/parser.py index 3aea29cbf6..ba061033ae 100644 --- a/arc/parser/parser.py +++ b/arc/parser/parser.py @@ -219,12 +219,29 @@ def parser(log_file_path: str, raise_error: bool = False) -> return_type: error_message='Could not parse 1d scan coords from {path}', ) +parse_1d_scan_energies_hartree = make_parser( + parse_method='parse_1d_scan_energies_hartree', + return_type=tuple[list[float] | None, list[float] | None], + error_message='Could not parse 1d scan absolute energies (Hartree) from {path}', +) + parse_irc_traj = make_parser( parse_method='parse_irc_traj', return_type=list[dict[str, tuple]] | None, error_message='Could not parse IRC trajectory from {path}', ) +# Rich IRC parser. 
Sibling of parse_irc_traj — emits per-point dicts with +# energy, reaction coordinate, gradients, direction, and xyz so consumers +# don't have to align fields across multiple narrow parses. ESS adapters +# that don't implement parse_irc_path get None back from make_parser +# (graceful fallback to the geometry-only parse_irc_traj at call sites). +parse_irc_path = make_parser( + parse_method='parse_irc_path', + return_type=list[dict] | None, + error_message='Could not parse rich IRC path from {path}', +) + parse_scan_conformers = make_parser( parse_method='parse_scan_conformers', return_type=pd.DataFrame | None, @@ -262,6 +279,69 @@ def parser(log_file_path: str, raise_error: bool = False) -> return_type: ) +def parse_1d_scan_full_result(log_file_path: str) -> dict: + """ + Parse a 1D rotor scan log into a single bundle of derived quantities suitable + for both ARC-internal use and TCKDB upload. + + The returned dict aggregates what individual narrow parsers expose so that + callers (in particular the ``output.yml`` writer) don't have to re-walk the + log file three times. Fields are populated independently — any of them may + be ``None`` if the underlying parser cannot extract that information for the + given ESS log; the wrapper never raises. + + Args: + log_file_path (str): Path to the ESS scan log. + + Returns: dict + ``{ + 'angles_deg': list[float] | None, + 'relative_energies_kj_mol': list[float] | None, + 'absolute_energies_hartree': list[float] | None, + 'zero_energy_reference_hartree': float | None, # min absolute energy + 'geometries': list[dict[str, tuple]] | None, + }`` + + ``angles_deg`` is taken from the relative-energies parse (the + established source of truth) when available, else from the absolute + parse. ``zero_energy_reference_hartree`` is the minimum of + ``absolute_energies_hartree`` when present, else ``None``. + """ + rel_energies, rel_angles = parse_1d_scan_energies(log_file_path=log_file_path) + abs_energies, abs_angles = parse_1d_scan_energies_hartree(log_file_path=log_file_path) + geometries = parse_1d_scan_coords(log_file_path=log_file_path) + + # Coerce numpy outputs (Gaussian's relative path returns lists; older code + # paths and ORCA may return ndarrays) to plain lists so YAML/JSON + # serialisers downstream don't choke on numpy scalars. + def _to_list(x): + if x is None: + return None + try: + return [float(v) for v in x] + except TypeError: + return None + + rel_energies = _to_list(rel_energies) + rel_angles = _to_list(rel_angles) + abs_energies = _to_list(abs_energies) + abs_angles = _to_list(abs_angles) + + angles = rel_angles if rel_angles is not None else abs_angles + + zero_ref = None + if abs_energies: + zero_ref = min(abs_energies) + + return { + 'angles_deg': angles, + 'relative_energies_kj_mol': rel_energies, + 'absolute_energies_hartree': abs_energies, + 'zero_energy_reference_hartree': zero_ref, + 'geometries': geometries, + } + + def parse_1d_scan_energies_from_specific_angle(log_file_path: str, initial_angle: float, ) -> tuple[list[float] | None, list[float] | None]: @@ -420,7 +500,11 @@ def parse_trajectory(path: str) -> list[dict[str, tuple]] | None: Entries are xyz's on the trajectory. """ ess_file, traj = False, None - if path.split('.')[-1] != 'xyz': + # GSM emits stringfile.xyz0000, stringfile.xyz0001, ... — the four-digit + # suffix is the GSM iteration index, not a different format. Treat any + # ``.xyz`` followed by optional digits as a plain XYZ trajectory and skip + # the ESS sniff (which would warn benignly on a non-ESS file). 
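+    # Illustrative matches: 'traj.xyz', 'stringfile.xyz0000', 'stringfile.xyz0042';
+    # non-matches such as 'scan.out' or 'input.log' still go through the ESS sniff.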
+ if not re.search(r'\.xyz\d*$', path): ess_file = bool(determine_ess(log_file_path=path, raise_error=False)) if ess_file: try: diff --git a/arc/parser/parser_test.py b/arc/parser/parser_test.py index e10496d842..871854fbaa 100644 --- a/arc/parser/parser_test.py +++ b/arc/parser/parser_test.py @@ -582,6 +582,50 @@ def test_parse_trajectory(self): self.assertIsInstance(trajectory[0], dict) self.assertEqual(len(trajectory[0]['symbols']), 3) + def test_parse_irc_path_gaussian_forward(self): + """parse_irc_path emits per-point energy, RC, grads, direction, and xyz.""" + path = os.path.join(ARC_TESTING_PATH, 'irc', 'rxn_1_irc_1.out') + points = parser.parse_irc_path(log_file_path=path) + self.assertIsNotNone(points) + # Gaussian's Point Number: 0 has no CURRENT STRUCTURE block — the + # rich parser correctly skips it; 50 stepped points remain. + self.assertEqual(len(points), 50) + first = points[0] + self.assertEqual(first['point_number'], 1) + self.assertEqual(first['direction'], 'forward') + self.assertAlmostEqual(first['electronic_energy_hartree'], -303.578211343) + self.assertAlmostEqual(first['reaction_coordinate'], 0.07236) + self.assertAlmostEqual(first['max_gradient'], 0.007275026) + self.assertAlmostEqual(first['rms_gradient'], 0.002502813) + self.assertEqual(len(first['xyz']['symbols']), 8) + # Reaction coordinate increases monotonically along a single + # Gaussian IRC branch (cumulative path length). + rcs = [p['reaction_coordinate'] for p in points] + self.assertEqual(rcs, sorted(rcs)) + # Every emitted point inherits the FORWARD label. + self.assertEqual( + sorted({p['direction'] for p in points}), + ['forward'], + ) + + def test_parse_irc_path_gaussian_reverse_direction(self): + """The REVERSE branch fixture flips per-point direction labels.""" + path = os.path.join(ARC_TESTING_PATH, 'irc', 'rxn_1_irc_2.out') + points = parser.parse_irc_path(log_file_path=path) + self.assertIsNotNone(points) + self.assertEqual( + sorted({p['direction'] for p in points}), + ['reverse'], + ) + + def test_parse_irc_path_failed_log_returns_none(self): + """A truncated/failed IRC log yields None — the upstream upload + path interprets this as "no rich data" and falls back to the + geometry-only parser without aborting the upload.""" + path = os.path.join(ARC_TESTING_PATH, 'irc', 'irc_failed.out') + points = parser.parse_irc_path(log_file_path=path) + self.assertIsNone(points) + def test_parse_1d_scan_coords(self): """Test parsing the optimized coordinates of a torsion scan at each optimization point""" path_1 = os.path.join(ARC_TESTING_PATH, 'rotor_scans', 'H2O2.out') diff --git a/arc/scheduler.py b/arc/scheduler.py index 84b660d7f3..7c25996327 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -67,6 +67,28 @@ logger = get_logger() + +# TS-guess adapter ``method`` strings → ``output[label]['paths']`` slot +# carrying the produced log-path. Distinct slots so consumers (notably +# the TCKDB adapter) can dispatch a method-aware path_search calc +# without inspecting the file. Match is case- and whitespace-insensitive. +# Geometry-only methods (heuristics, AutoTST, KinBot, GCN, user XYZ) +# don't appear here — they have no log artifact to file. 
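+# e.g. _ts_guess_paths_key('xTB-GSM') → 'gsm'; _ts_guess_paths_key('KinBot') → None.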
+_TS_GUESS_METHOD_TO_PATHS_KEY: dict[str, str] = { + 'orca_neb': 'neb', + 'xtb_gsm': 'gsm', + 'xtb-gsm': 'gsm', +} + + +def _ts_guess_paths_key(method: object) -> str | None: + """Return the ``output[label]['paths']`` slot for a TS-guess method, + or ``None`` for geometry-only / unknown methods.""" + if not isinstance(method, str): + return None + return _TS_GUESS_METHOD_TO_PATHS_KEY.get(method.strip().lower()) + + LOWEST_MAJOR_TS_FREQ, HIGHEST_MAJOR_TS_FREQ, default_job_settings, \ default_job_types, default_ts_adapters, max_ess_trsh, max_rotor_trsh, rotor_scan_resolution, servers_dict = \ settings['LOWEST_MAJOR_TS_FREQ'], settings['HIGHEST_MAJOR_TS_FREQ'], settings['default_job_settings'], \ @@ -295,6 +317,7 @@ def __init__(self, self.freq_scale_factor = freq_scale_factor self.ts_adapters = ts_adapters if ts_adapters is not None else default_ts_adapters self.ts_adapters = [ts_adapter.lower() for ts_adapter in self.ts_adapters] + self.ts_adapters = self._filter_unavailable_ts_adapters(self.ts_adapters) self.output = output or dict() self.output_multi_spc = dict() self.report_e_elect = report_e_elect @@ -526,6 +549,40 @@ def __init__(self, if not self.testing: self.schedule_jobs() + @staticmethod + def _filter_unavailable_ts_adapters(ts_adapters: list[str]) -> list[str]: + """Drop TS adapters whose backing software/conda env isn't installed. + + ARC's default ``ts_adapters`` list assumes every sister env (ts_gcn, + tst_env, ...) exists on every host. On dev machines that's rarely + true; the missing-env case used to surface 300 frames deep as + ``TypeError: argument should be a str ... not 'NoneType'`` from + ``Path(None)``. Filtering at scheduler init turns that into a clear + warning the user can act on. + """ + env_requirements = { + 'gcn': ('TS_GCN_PYTHON', 'ts_gcn'), + 'autotst': ('AUTOTST_PYTHON', 'tst_env'), + } + kept = [] + for adapter in ts_adapters: + requirement = env_requirements.get(adapter) + if requirement is None: + kept.append(adapter) + continue + setting_name, env_name = requirement + if settings.get(setting_name): + kept.append(adapter) + continue + logger.warning( + f"TS adapter '{adapter}' is configured but its backing software " + f"was not found ({setting_name} is unset; expected the '{env_name}' " + f"conda env). Skipping this adapter for the current run. To use it, " + f"either install the '{env_name}' env or remove '{adapter}' from " + f"your ts_adapters in input.yml / arc/settings/settings.py." + ) + return kept + def flush_pending_pipe_batches(self) -> None: """ Attempt to submit accumulated deferred pipe batches for SP, freq, IRC, and conf_sp. 
@@ -1144,6 +1201,7 @@ def end_job(self, job: JobAdapter, for rotors_dict in self.species_dict[label].rotors_dict.values(): if rotors_dict['pivots'] in [job.pivots, job.pivots[0]]: rotors_dict['scan_path'] = job.local_path_to_output_file + rotors_dict['scan_software'] = job.job_adapter try: job.remove_remote_files() except Exception as e: @@ -1304,8 +1362,10 @@ def run_ts_conformer_jobs(self, label: str): self.run_composite_job(label) self.species_dict[label].chosen_ts_method = self.species_dict[label].ts_guesses[0].method self.species_dict[label].successful_methods = [self.species_dict[label].ts_guesses[0].method] - if getattr(self.species_dict[label].ts_guesses[0], 'log_path', None): - self.output[label]['paths']['neb'] = self.species_dict[label].ts_guesses[0].log_path + tsg0 = self.species_dict[label].ts_guesses[0] + paths_key = _ts_guess_paths_key(tsg0.method) + if paths_key and getattr(tsg0, 'log_path', None): + self.output[label]['paths'][paths_key] = tsg0.log_path def run_opt_job(self, label: str, fine: bool = False): """ @@ -2342,8 +2402,9 @@ def determine_most_likely_ts_conformer(self, label: str): self.species_dict[label].initial_xyz = tsg.opt_xyz self.species_dict[label].final_xyz = None self.species_dict[label].ts_guesses_exhausted = False - if getattr(tsg, 'log_path', None): - self.output[label]['paths']['neb'] = tsg.log_path + paths_key = _ts_guess_paths_key(tsg.method) + if paths_key and getattr(tsg, 'log_path', None): + self.output[label]['paths'][paths_key] = tsg.log_path if tsg.success and tsg.energy is not None: # guess method and ts_level opt were both successful tsg.energy -= e_min im_freqs = f', imaginary frequencies {tsg.imaginary_freqs}' if tsg.imaginary_freqs is not None else '' @@ -2889,6 +2950,13 @@ def spawn_post_irc_jobs(self, job (JobAdapter): The IRC job object. """ self.output[label]['paths']['irc'].append(job.local_path_to_output_file) + # Track IRC direction in lockstep with the path list so downstream + # consumers (TCKDB computed-reaction upload) can label points + # forward/reverse without filename guesswork. setdefault handles + # restarts from older projects whose 'paths' dict predates this key. + self.output[label]['paths'].setdefault('irc_directions', list()).append( + getattr(job, 'irc_direction', None) + ) index = 1 if len(self.output[label]['paths']['irc']) == 2: index = 2 @@ -3059,6 +3127,7 @@ def check_scan_job(self, # Save the path and invalidation reason for debugging and tracking the file. # If ``success`` is None, it means that the job is being troubleshooted. self.species_dict[label].rotors_dict[job.rotor_index]['scan_path'] = job.local_path_to_output_file + self.species_dict[label].rotors_dict[job.rotor_index]['scan_software'] = job.job_adapter self.species_dict[label].rotors_dict[job.rotor_index]['invalidation_reason'] += invalidation_reason # If energies were obtained, draw the scan curve. 
@@ -3744,7 +3813,10 @@ def delete_all_species_jobs(self, label: str): logger.info(f'Deleted job {job_name}') job.delete() self.running_jobs[label] = list() - self.output[label]['paths'] = {key: '' if key != 'irc' else list() for key in self.output[label]['paths'].keys()} + self.output[label]['paths'] = { + key: list() if key in ('irc', 'irc_directions') else '' + for key in self.output[label]['paths'].keys() + } for job_type in self.output[label]['job_types']: # rotors and bde are initialised to True (see initialize_output_dict) because # species with no torsional modes / no BDE targets should not be blocked from @@ -3952,8 +4024,12 @@ def initialize_output_dict(self, label: str | None = None): if species.is_ts: if 'irc' not in self.output[species.label]['paths']: self.output[species.label]['paths']['irc'] = list() + if 'irc_directions' not in self.output[species.label]['paths']: + self.output[species.label]['paths']['irc_directions'] = list() if 'neb' not in self.output[species.label]['paths']: self.output[species.label]['paths']['neb'] = '' + if 'gsm' not in self.output[species.label]['paths']: + self.output[species.label]['paths']['gsm'] = '' if 'job_types' not in self.output[species.label]: self.output[species.label]['job_types'] = dict() for job_type in list(set(self.job_types.keys())) + ['opt', 'freq', 'sp', 'composite', 'onedmin']: diff --git a/arc/scheduler_test.py b/arc/scheduler_test.py index 1727a7d8ea..9588ddaade 100644 --- a/arc/scheduler_test.py +++ b/arc/scheduler_test.py @@ -1195,5 +1195,63 @@ def tearDownClass(cls): shutil.rmtree(project_directory, ignore_errors=True) +class TestTsGuessPathsKey(unittest.TestCase): + """Direct unit tests for ``_ts_guess_paths_key``. + + Guards the contract that the scheduler routes each TS-guess + adapter's log path into a method-specific slot under + ``output[label]['paths']`` (``neb`` / ``gsm``) — so the TCKDB + adapter can dispatch a method-aware ``path_search`` parent calc + without inspecting the file. Geometry-only methods (no log to + file) must return ``None``. + """ + + def setUp(self): + from arc.scheduler import _ts_guess_paths_key + self.resolve = _ts_guess_paths_key + + def test_orca_neb_routes_to_neb_slot(self): + self.assertEqual(self.resolve('orca_neb'), 'neb') + + def test_xtb_gsm_underscore_routes_to_gsm_slot(self): + self.assertEqual(self.resolve('xtb_gsm'), 'gsm') + + def test_xtb_gsm_dash_form_routes_to_gsm_slot(self): + # The xtb_gsm adapter sets ``tsg.method = 'xTB-GSM'`` (capital + # form with dash) on the produced TSGuess — see + # ``arc/job/adapters/ts/xtb_gsm.py:process_run``. The lookup + # must be case- and whitespace-insensitive so the scheduler + # routes both string forms to the same slot. 
+        self.assertEqual(self.resolve('xTB-GSM'), 'gsm')
+        self.assertEqual(self.resolve(' xtb-gsm '), 'gsm')
+        self.assertEqual(self.resolve('XTB_GSM'), 'gsm')
+
+    def test_geometry_only_methods_return_none(self):
+        for m in ('Heuristics', 'AutoTST', 'KinBot', 'GCN',
+                  'user guess 0', 'user guess 1'):
+            self.assertIsNone(self.resolve(m), msg=f'unexpected match: {m}')
+
+    def test_non_string_inputs_return_none(self):
+        self.assertIsNone(self.resolve(None))
+        self.assertIsNone(self.resolve(42))
+        self.assertIsNone(self.resolve({'method': 'xtb_gsm'}))
+
+
+class TestPathsTemplateInitialization(unittest.TestCase):
+    """``initialize_output_dict`` must seed both ``neb`` and ``gsm``
+    slots on TS species so the per-method routing in
+    ``run_ts_conformer_jobs`` / ``determine_most_likely_ts_conformer``
+    can write into pre-existing keys (and the post-restart reset path
+    in ``restart_species`` preserves them).
+    """
+
+    def test_ts_species_paths_template_includes_gsm(self):
+        # Light test: assert the source of truth at the literal call
+        # site; a full Scheduler-instance test is heavy and adds no
+        # signal beyond the static template check.
+        with open(os.path.join(ARC_PATH, 'arc', 'scheduler.py')) as f:
+            sched_src = f.read()
+        self.assertIn("self.output[species.label]['paths']['gsm'] = ''", sched_src)
+
+
 if __name__ == '__main__':
     unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
diff --git a/arc/scripts/get_species_corrections.py b/arc/scripts/get_species_corrections.py
new file mode 100644
index 0000000000..fd80595503
--- /dev/null
+++ b/arc/scripts/get_species_corrections.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+
+"""
+Compute per-species applied AEC (atom energy correction) and BAC (bond
+additivity correction) totals by delegating to Arkane's public correction
+functions. Run from the RMG conda environment so that Arkane is importable.
+
+Inputs and outputs are exchanged via YAML files so the caller (in arc_env)
+does not need to import Arkane.
+
+Usage::
+
+    python get_species_corrections.py input.yaml output.yaml
+
+Input YAML::
+
+    level_of_theory: "LevelOfTheory(method='wb97xd',basis='def2tzvp',software='gaussian')"
+    bac_type: p  # 'p', 'm', or null
+    species:
+      - label: CH4
+        atoms: {C: 1, H: 4}
+        bonds: {C-H: 4}
+        coords: [[0, 0, 0], [0.63, 0.63, 0.63], ...]
+        nums: [6, 1, 1, 1, 1]
+        multiplicity: 1
+
+Output YAML::
+
+    species:
+      - label: CH4
+        aec:
+          value: -0.0234  # in Hartree
+          value_unit: hartree
+          components:
+            - component_kind: atom
+              key: C
+              multiplicity: 1
+              parameter_value: -37.84706210301937
+              parameter_unit: hartree
+              contribution_value: -0.0153
+            - ...
+        bac:
+          value: -0.7  # in kcal/mol (both Petersson and Melius)
+          value_unit: kcal_mol
+          bac_type: p
+          components:  # only for bac_type == 'p'; absent for 'm'
+            - component_kind: bond
+              key: C-H
+              multiplicity: 4
+              parameter_value: -0.1735
+              parameter_unit: kcal_mol
+              contribution_value: -0.694
+
+Per-species failure (e.g. missing parameters for that level of theory) is
+reported by omitting the failing block (``aec``, ``bac``, or both) from
+that species' output; the species entry is still present.
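+
+For ``aec``, the component ``contribution_value`` entries sum to ``value``
+(up to float roundoff; the correction is linear in atom counts). For
+``bac`` with ``bac_type: p`` the same should hold whenever every bond key
+resolves to a parameter; unresolved keys carry ``contribution_value: null``.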
+"""
+
+import re
+import sys
+import traceback
+
+import rmgpy.constants as constants
+
+from arkane.encorr.corr import get_atom_correction, get_bac
+from arkane.encorr.data import atom_energies, pbac
+from arkane.modelchem import LevelOfTheory
+
+from common import read_yaml_file, save_yaml_file
+
+
+HARTREE_TO_J_MOL = constants.E_h * constants.Na  # ~2625499.64 J/mol per Hartree
+KCAL_TO_J_MOL = 4184.0
+
+
+def _lot_from_string(lot_str):
+    """Construct a LevelOfTheory from its repr string."""
+    kwargs = dict(re.findall(r"(\w+)\s*=\s*'([^']*)'", lot_str))
+    return LevelOfTheory(**kwargs)
+
+
+def _aec_for(lot, atoms):
+    """Total AEC in Hartree, plus atom components (key, multiplicity,
+    parameter_value [Hartree], contribution_value [Hartree]).
+
+    Per-element contribution is computed by calling Arkane on a singleton
+    ``{element: count}`` dict and converting J/mol → Hartree. The sum of
+    component contributions equals the total (modulo float roundoff)
+    because Arkane's atom-correction formula is linear in atom counts.
+    """
+    total_j_mol = get_atom_correction(lot, atoms)
+    total_hartree = total_j_mol / HARTREE_TO_J_MOL
+
+    # Look up the per-element parameter (Hartree) from Arkane's atom_energies
+    # table via the same fallback Arkane uses internally.
+    energy_level = getattr(lot, 'energy', lot)
+    params = atom_energies.get(energy_level) or atom_energies.get(energy_level.simple())
+
+    components = []
+    for symbol, count in atoms.items():
+        per_element_j_mol = get_atom_correction(lot, {symbol: count})
+        components.append({
+            'component_kind': 'atom',
+            'key': symbol,
+            'multiplicity': int(count),
+            'parameter_value': float(params[symbol]) if params and symbol in params else None,
+            'parameter_unit': 'hartree',
+            'contribution_value': per_element_j_mol / HARTREE_TO_J_MOL,
+        })
+    return {
+        'value': total_hartree,
+        'value_unit': 'hartree',
+        'components': components,
+    }
+
+
+def _pbac_for(lot, bonds, coords, nums, multiplicity):
+    """Total Petersson BAC in kcal/mol, plus bond components.
+
+    Per-bond contribution is ``count * pbac[bond_key]`` (kcal/mol). The
+    parameter dict is looked up via Arkane's level-of-theory fallbacks
+    (full → simple → energy → energy.simple). Bond keys absent from the
+    parameter table are reported with ``parameter_value: null`` so the
+    caller can decide whether to keep the components.
+    """
+    total_j_mol = get_bac(lot, bonds, coords, nums,
+                          bac_type='p', multiplicity=multiplicity)
+    total_kcal = total_j_mol / KCAL_TO_J_MOL
+
+    # Find the matching pbac parameter dict. Mirrors the fallback chain
+    # used by ``arkane.encorr.bac.BAC`` so component lookups match the
+    # parameters the total was computed with.
+    candidates = [lot, lot.simple()]
+    energy_level = getattr(lot, 'energy', None)
+    if energy_level is not None:
+        candidates.extend([energy_level, energy_level.simple()])
+    params = None
+    for cand in candidates:
+        if cand in pbac:
+            params = pbac[cand]
+            break
+
+    components = []
+    for bond_key, count in bonds.items():
+        param_value = None
+        if params:
+            if bond_key in params:
+                param_value = float(params[bond_key])
+            else:
+                # Arkane accepts reversed bond keys (e.g. 'H-C' for 'C-H').
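+                # e.g. 'C-H' → ['C', '-', 'H'] → reversed → 'H-C';
+                # multi-letter symbols survive intact: 'Cl-H' → 'H-Cl'.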
+ flipped = ''.join(re.findall(r'[a-zA-Z]+|[^a-zA-Z]+', bond_key)[::-1]) + if flipped in params: + param_value = float(params[flipped]) + components.append({ + 'component_kind': 'bond', + 'key': bond_key, + 'multiplicity': int(count), + 'parameter_value': param_value, + 'parameter_unit': 'kcal_mol', + 'contribution_value': (param_value * count) if param_value is not None else None, + }) + return { + 'value': total_kcal, + 'value_unit': 'kcal_mol', + 'bac_type': 'p', + 'components': components, + } + + +def _mbac_for(lot, bonds, coords, nums, multiplicity): + """Total Melius BAC in kcal/mol, no components. + + Melius BAC is a pairwise atom-pair function plus a multiplicity + correction; there is no clean per-bond decomposition, so components + are deliberately omitted. + """ + total_j_mol = get_bac(lot, bonds, coords, nums, + bac_type='m', multiplicity=multiplicity) + return { + 'value': total_j_mol / KCAL_TO_J_MOL, + 'value_unit': 'kcal_mol', + 'bac_type': 'm', + } + + +def _process_species(spc, lot, bac_type): + """Compute AEC and BAC for one species; failure of either branch is + isolated so a partial result is still emitted.""" + out = {'label': spc['label']} + + atoms = spc.get('atoms') or {} + if atoms: + try: + out['aec'] = _aec_for(lot, atoms) + except Exception: + out['aec_error'] = traceback.format_exc(limit=2).strip().splitlines()[-1] + + bonds = spc.get('bonds') or {} + coords = spc.get('coords') + nums = spc.get('nums') + multiplicity = spc.get('multiplicity', 1) + if bac_type in ('p', 'm') and coords and nums: + try: + if bac_type == 'p': + out['bac'] = _pbac_for(lot, bonds, coords, nums, multiplicity) + else: + out['bac'] = _mbac_for(lot, bonds, coords, nums, multiplicity) + except Exception: + out['bac_error'] = traceback.format_exc(limit=2).strip().splitlines()[-1] + + return out + + +def main(input_path, output_path): + """Compute per-species totals + components for the given level of theory.""" + params = read_yaml_file(input_path) or {} + + lot_str = params.get('level_of_theory') + bac_type = params.get('bac_type') + species = params.get('species') or [] + + result = {'species': []} + if not lot_str: + save_yaml_file(output_path, result) + return + + lot = _lot_from_string(lot_str) + for spc in species: + result['species'].append(_process_species(spc, lot, bac_type)) + + save_yaml_file(output_path, result) + + +if __name__ == '__main__': + if len(sys.argv) != 3: + print(f'Usage: {sys.argv[0]} input.yaml output.yaml', file=sys.stderr) + sys.exit(1) + main(sys.argv[1], sys.argv[2]) diff --git a/arc/scripts/save_arkane_thermo.py b/arc/scripts/save_arkane_thermo.py index f273eb28bd..a9f1c96350 100644 --- a/arc/scripts/save_arkane_thermo.py +++ b/arc/scripts/save_arkane_thermo.py @@ -34,19 +34,54 @@ def _extract_nasa(thermo_data): ) -def _extract_cp(thermo_data): - """Return a list of {temperature_k, cp_j_mol_k} dicts, or None.""" +def _extract_thermo_points(thermo_data): + """Return a list of per-temperature thermochemistry dicts, or None. + + Each entry carries the full set of TCKDB-shaped fields that + ``thermo_point`` accepts: + + ``temperature_k`` - the evaluation temperature in K + ``cp_j_mol_k`` - heat capacity (J/(mol*K)) + ``h_kj_mol`` - enthalpy (kJ/mol) + ``s_j_mol_k`` - entropy (J/(mol*K)) + ``g_kj_mol`` - Gibbs free energy (kJ/mol) + + RMG's NASA / ThermoData accessors return SI units (J/mol for energies, + J/(mol*K) for capacities/entropies); enthalpy and free energy are + converted to kJ/mol at the boundary because TCKDB persists them in + those units. 
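+
+    Example entry (values illustrative only)::
+
+        {'temperature_k': 298.15, 'cp_j_mol_k': 35.7, 'h_kj_mol': -74.5,
+         's_j_mol_k': 186.3, 'g_kj_mol': -130.0}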
+ + Any per-temperature evaluation that raises (e.g., the polynomial is + not valid at that T) is skipped silently — the goal is best-effort + enrichment, not failing the whole library extraction over one out- + of-range point. + """ try: tmin = thermo_data.Tmin.value_si tmax = thermo_data.Tmax.value_si - return [ - {'temperature_k': T, 'cp_j_mol_k': float(thermo_data.get_heat_capacity(T))} - for T in _CP_TEMPS - if tmin <= T <= tmax - ] except Exception: return None + points = [] + for T in _CP_TEMPS: + if not (tmin <= T <= tmax): + continue + try: + cp = float(thermo_data.get_heat_capacity(T)) + h_kj = float(thermo_data.get_enthalpy(T)) / 1000.0 + s = float(thermo_data.get_entropy(T)) + g_kj = float(thermo_data.get_free_energy(T)) / 1000.0 + except Exception: + continue + points.append({ + 'temperature_k': T, + 'cp_j_mol_k': cp, + 'h_kj_mol': h_kj, + 's_j_mol_k': s, + 'g_kj_mol': g_kj, + }) + return points or None + def main(): """ @@ -72,14 +107,14 @@ def main(): S298 = thermo_data.get_entropy(RT) data = str(thermo_data) nasa_low, nasa_high = _extract_nasa(thermo_data) - cp_data = _extract_cp(thermo_data) + thermo_points = _extract_thermo_points(thermo_data) result[entry.label] = { 'H298': H298, 'S298': S298, 'data': data, 'nasa_low': nasa_low, 'nasa_high': nasa_high, - 'cp_data': cp_data, + 'thermo_points': thermo_points, } if result: result_path = os.path.join(os.getcwd(), 'thermo.yaml') diff --git a/arc/scripts_test.py b/arc/scripts_test.py index 5aa2ab44a4..46347a3820 100644 --- a/arc/scripts_test.py +++ b/arc/scripts_test.py @@ -99,8 +99,8 @@ def test_nasa_polynomials_present(self): self.assertEqual(len(data[label]['nasa_low']['coeffs']), 7) self.assertEqual(len(data[label]['nasa_high']['coeffs']), 7) - def test_cp_data_present(self): - """Verify tabulated Cp data is extracted.""" + def test_thermo_points_present(self): + """Verify tabulated thermo points (Cp + H + S + G per T) are extracted.""" script = os.path.join(ARC_PATH, 'arc', 'scripts', 'save_arkane_thermo.py') subprocess.run( ['conda', 'run', '-n', 'rmg_env', 'python', script], @@ -109,12 +109,17 @@ def test_cp_data_present(self): data = read_yaml_file(os.path.join(self.tmp_dir, 'thermo.yaml')) for label in ['CHO', 'CH4', 'CH2O', 'CH3']: - self.assertIn('cp_data', data[label], f'Missing cp_data for {label}') - cp = data[label]['cp_data'] - self.assertIsInstance(cp, list) - self.assertGreater(len(cp), 0) - self.assertIn('temperature_k', cp[0]) - self.assertIn('cp_j_mol_k', cp[0]) + self.assertIn('thermo_points', data[label], + f'Missing thermo_points for {label}') + points = data[label]['thermo_points'] + self.assertIsInstance(points, list) + self.assertGreater(len(points), 0) + first = points[0] + for key in ('temperature_k', 'cp_j_mol_k', 'h_kj_mol', + 's_j_mol_k', 'g_kj_mol'): + self.assertIn(key, first, + f'{label} thermo_points[0] missing {key}') + self.assertIsInstance(first[key], (int, float)) if __name__ == '__main__': diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 1b066f07bc..2cfc0e3a7b 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -255,7 +255,7 @@ 'nnodes': 15, 'preopt': 'true', }, - 'level': 'wb97xd/def2tzvp', + 'level': 'wb97x-d3/def2tzvp', } valid_chars = "-_[]=.,%s%s" % (string.ascii_letters, string.digits) diff --git a/arc/settings/submit.py b/arc/settings/submit.py index bc1e23ec5b..d77dbed2d5 100644 --- a/arc/settings/submit.py +++ b/arc/settings/submit.py @@ -1,5 +1,22 @@ """ -Submit scripts and incore commands +Submit scripts and incore commands. 
+ +The ``submit_scripts`` dict below ships with example templates keyed by +``EXAMPLE_*`` names. Each example is a real working configuration from +a specific machine (MIT Supercloud Slurm at a particular path layout, +a generic PBS skeleton, etc.); the names make that explicit so they +aren't mistaken for generic placeholders. **None of these examples +match a typical user's environment out of the box** — paths to the ESS +binaries (``g16root``, ``orcadir``, etc.), scratch directory locations, +queue names, and even the cluster scheduler are all machine-specific. + +Users should override ``submit_scripts`` in ``~/.arc/submit.py`` with +templates whose dict keys match the server names declared in +``~/.arc/settings.py`` and whose directives match each server's +``cluster_soft``. Mismatches between template directives (``#SBATCH``, +``#PBS``, ``Universe``) and the server's declared cluster_soft are now +rejected at adapter init by ``_validate_submit_script_for_server`` +in ``arc/job/adapter.py``. """ @@ -105,7 +122,11 @@ } -# Submission scripts stored as a dictionary with server and software as primary and secondary keys +# Submission scripts stored as a dictionary with server and software as +# primary and secondary keys. The keys below use ``EXAMPLE_*`` names to +# make clear these are reference templates, not generic placeholders; +# each was authored for a specific machine and won't work elsewhere +# without edits. See the module docstring above. submit_scripts = { 'local': { # ARC passes a base-2-derived MiB integer into this sample PBS @@ -184,6 +205,11 @@ mkdir -p $WorkDir cd $WorkDir cp $SubmitDir/input.in . +# NEB needs reactant.xyz/product.xyz; restarts need .gbw; freq follow-ups need .hess. +# Globs are benign for plain ORCA jobs (match nothing). +cp $SubmitDir/*.xyz . 2>/dev/null || true +cp $SubmitDir/*.gbw . 2>/dev/null || true +cp $SubmitDir/*.hess . 2>/dev/null || true $orcadir/orca input.in > input.log cp input.log $SubmitDir/ @@ -438,6 +464,11 @@ cd $WorkDir cp "$SubmitDir/input.in" . +# NEB needs reactant.xyz/product.xyz; restarts need .gbw; freq follow-ups need .hess. +# Globs are benign for plain ORCA jobs (match nothing). +cp "$SubmitDir"/*.xyz . 2>/dev/null || true +cp "$SubmitDir"/*.gbw . 2>/dev/null || true +cp "$SubmitDir"/*.hess . 2>/dev/null || true ${{OrcaDir}}/orca input.in > input.log @@ -613,6 +644,11 @@ cd $WorkDir cp "$SubmitDir/input.in" . +# NEB needs reactant.xyz/product.xyz; restarts need .gbw; freq follow-ups need .hess. +# Globs are benign for plain ORCA jobs (match nothing). +cp "$SubmitDir"/*.xyz . 2>/dev/null || true +cp "$SubmitDir"/*.gbw . 2>/dev/null || true +cp "$SubmitDir"/*.hess . 2>/dev/null || true ${ORCA_DIR}/orca input.in > input.log cp * "$SubmitDir/" @@ -831,6 +867,11 @@ cd $WorkDir cp "$SubmitDir/input.in" . +# NEB needs reactant.xyz/product.xyz; restarts need .gbw; freq follow-ups need .hess. +# Globs are benign for plain ORCA jobs (match nothing). +cp "$SubmitDir"/*.xyz . 2>/dev/null || true +cp "$SubmitDir"/*.gbw . 2>/dev/null || true +cp "$SubmitDir"/*.hess . 2>/dev/null || true /opt/orca/orca input.in > input.log cp * "$SubmitDir/" @@ -880,6 +921,11 @@ mkdir -p $WorkDir cd $WorkDir cp $SubmitDir/input.in . +# NEB needs reactant.xyz/product.xyz; restarts need .gbw; freq follow-ups need .hess. +# Globs are benign for plain ORCA jobs (match nothing). +cp $SubmitDir/*.xyz . 2>/dev/null || true +cp $SubmitDir/*.gbw . 2>/dev/null || true +cp $SubmitDir/*.hess . 
2>/dev/null || true $orcadir/orca input.in > input.log cp input.log $SubmitDir/ @@ -889,7 +935,7 @@ """, }, - 'pbs_sample': { + 'EXAMPLE_pbs': { 'gaussian': """#!/bin/bash -l #PBS -q {queue} #PBS -l nodes=1:ppn={cpus} @@ -919,7 +965,7 @@ """, }, - 'server3': { + 'EXAMPLE_slurm_alongd_supercloud': { 'gaussian': """#!/bin/bash -l #SBATCH -p normal #SBATCH -J {name} @@ -993,6 +1039,11 @@ mkdir -p $WorkDir cd $WorkDir cp $SubmitDir/input.in . +# NEB needs reactant.xyz/product.xyz; restarts need .gbw; freq follow-ups need .hess. +# Globs are benign for plain ORCA jobs (match nothing). +cp $SubmitDir/*.xyz . 2>/dev/null || true +cp $SubmitDir/*.gbw . 2>/dev/null || true +cp $SubmitDir/*.hess . 2>/dev/null || true $orcadir/orca input.in > input.log cp input.log $SubmitDir/ @@ -1161,3 +1212,13 @@ """, }, } + + +# Back-compat aliases. The original keys (``'local'``, ``'pbs_sample'``, +# ``'server3'``) shipped with the repo for years and downstream forks +# may still reference them by those names. Aliasing keeps existing +# tests + user setups working while the ``EXAMPLE_*`` names are the +# documented, recommended way to refer to these example templates. +submit_scripts['local'] = submit_scripts['EXAMPLE_slurm_supercloud'] +submit_scripts['pbs_sample'] = submit_scripts['EXAMPLE_pbs'] +submit_scripts['server3'] = submit_scripts['EXAMPLE_slurm_alongd_supercloud'] diff --git a/arc/settings/submit_test.py b/arc/settings/submit_test.py index 08ca08ed01..33ec011ae0 100644 --- a/arc/settings/submit_test.py +++ b/arc/settings/submit_test.py @@ -16,9 +16,29 @@ class TestSubmit(unittest.TestCase): """ def test_servers(self): - """Test server keys in submit_scripts""" - for server in submit_scripts.keys(): - self.assertTrue(server in ['local', 'atlas', 'txe1', 'pbs_sample', 'server1', 'server2', 'server3']) + """Test that the repo-shipped server keys are present in submit_scripts. + + ``arc.imports`` merges ``~/.arc/submit.py`` overrides into the + runtime ``submit_scripts`` dict, so checking the full key set + against an allowlist would falsely fail on any developer + machine with a personalized override. Instead, assert that the + repo's canonical example keys exist (proving the file + loaded), without restricting what else the user added. + """ + repo_required = { + 'EXAMPLE_slurm_supercloud', + 'EXAMPLE_pbs', + 'EXAMPLE_slurm_alongd_supercloud', + } + # Back-compat aliases must also resolve so older tests + forks + # that reference the historical key names keep working. + backcompat_aliases = {'local', 'pbs_sample', 'server3'} + for required in repo_required | backcompat_aliases: + self.assertIn(required, submit_scripts) + # The aliases point at the same template objects, not copies. 
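+        # (assertIs checks identity, not equality: the alias assignment
+        # in submit.py binds both names to the same dict object, so an
+        # edit reaching submit_scripts['local'] is visible under the
+        # EXAMPLE_* name too.)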
+ self.assertIs(submit_scripts['local'], submit_scripts['EXAMPLE_slurm_supercloud']) + self.assertIs(submit_scripts['pbs_sample'], submit_scripts['EXAMPLE_pbs']) + self.assertIs(submit_scripts['server3'], submit_scripts['EXAMPLE_slurm_alongd_supercloud']) if __name__ == '__main__': diff --git a/arc/species/species.py b/arc/species/species.py index ec3a7cb2b0..c6de4010e9 100644 --- a/arc/species/species.py +++ b/arc/species/species.py @@ -1344,6 +1344,7 @@ def initialize_directed_rotors(self): 'trsh_counter': 0, 'trsh_methods': list(), 'scan_path': '', + 'scan_software': '', 'directed_scan_type': key, 'directed_scan': dict(), 'dimensions': 0, @@ -2524,7 +2525,7 @@ def __init__(self, comment='', nasa_low=None, nasa_high=None, - cp_data=None, + thermo_points=None, ): """ Args: @@ -2540,7 +2541,11 @@ def __init__(self, comment (str): Additional comments or description nasa_low (dict): Low-temperature NASA polynomial: {tmin_k, tmax_k, coeffs}. nasa_high (dict): High-temperature NASA polynomial: {tmin_k, tmax_k, coeffs}. - cp_data (list): Tabulated Cp: list of {temperature_k, cp_j_mol_k} dicts. + thermo_points (list): Tabulated per-temperature thermochemistry: + list of dicts with ``temperature_k``, ``cp_j_mol_k``, + ``h_kj_mol``, ``s_j_mol_k``, ``g_kj_mol``. Older field + name was ``cp_data`` (Cp-only); the field now carries + the full TCKDB ``thermo_point`` shape. """ self.H298 = H298 self.S298 = S298 @@ -2554,7 +2559,7 @@ def __init__(self, self.comment = comment self.nasa_low = nasa_low self.nasa_high = nasa_high - self.cp_data = cp_data + self.thermo_points = thermo_points def __repr__(self): """ @@ -2588,7 +2593,7 @@ def __reduce__(self): return (ThermoData, (self.H298, self.S298, self.Tdata, self.Cpdata, self.Cp0, self.CpInf, self.Tmin, self.Tmax, self.data, self.comment, - self.nasa_low, self.nasa_high, self.cp_data)) + self.nasa_low, self.nasa_high, self.thermo_points)) def update(self, data: dict): """ diff --git a/arc/statmech/arkane.py b/arc/statmech/arkane.py index bea8005850..eab1280150 100644 --- a/arc/statmech/arkane.py +++ b/arc/statmech/arkane.py @@ -37,6 +37,13 @@ MBAC_SECTION_END = "freq_dict =" FREQ_SECTION_START = "freq_dict = {" +# Tunneling method ARC uses for every reaction kinetics fit. Single source +# of truth: the Arkane input template renders this constant, and output.yml +# / the TCKDB adapter both read it back so downstream consumers know which +# correction was applied. 
+ARKANE_TUNNELING_METHOD = 'Eckart' + + main_input_template = """#!/usr/bin/env python # -*- coding: utf-8 -*- @@ -75,7 +82,7 @@ reactants=${rxn.reactants}, products=${rxn.products}, transitionState='${rxn.ts_species.label}', - tunneling='Eckart', + tunneling='${tunneling_method}', ) % endfor @@ -386,6 +393,7 @@ def render_arkane_input_template(self, t_min=self.T_min, t_max=self.T_max, t_count=self.T_count, + tunneling_method=ARKANE_TUNNELING_METHOD, ) def generate_species_files(self, @@ -503,7 +511,7 @@ def parse_arkane_thermo_output(self, statmech_dir: str) -> None: spc.thermo.data = content[lbl]['data'] spc.thermo.nasa_low = content[lbl].get('nasa_low') spc.thermo.nasa_high = content[lbl].get('nasa_high') - spc.thermo.cp_data = content[lbl].get('cp_data') + spc.thermo.thermo_points = content[lbl].get('thermo_points') line = ( f" {lbl:<{label_width}} " diff --git a/arc/tckdb/adapter.py b/arc/tckdb/adapter.py index 2777dea3fa..3e0ab621b3 100644 --- a/arc/tckdb/adapter.py +++ b/arc/tckdb/adapter.py @@ -24,8 +24,10 @@ import base64 import hashlib +import json import logging import os +import re from collections.abc import Mapping from dataclasses import dataclass, field from pathlib import Path @@ -33,13 +35,29 @@ from tckdb_client import TCKDBClient -from arc.tckdb.config import IMPLEMENTED_ARTIFACT_KINDS, TCKDBConfig +from arc.tckdb.config import ( + IMPLEMENTED_ARTIFACT_KINDS, + TCKDBConfig, + UPLOAD_MODE_COMPUTED_REACTION, + UPLOAD_MODE_COMPUTED_SPECIES, +) + +# Upload modes whose bundle payload already carries input/output_log +# artifacts inline under each calculation. Standalone artifact sidecars +# in these modes would (a) duplicate uploads and (b) lock in +# server-assigned calculation IDs that go stale on DB resets — see +# `submit_artifacts_for_calculation` for the gating logic. +_BUNDLE_MODES_WITH_INLINE_ARTIFACTS = frozenset({ + UPLOAD_MODE_COMPUTED_SPECIES, + UPLOAD_MODE_COMPUTED_REACTION, +}) from arc.tckdb.idempotency import ( ArtifactIdempotencyInputs, IdempotencyInputs, build_artifact_idempotency_key, build_idempotency_key, ) +from arc.tckdb.constraints import serialize_constraints from arc.tckdb.payload_writer import ( ArtifactSidecarMetadata, PayloadWriter, @@ -50,6 +68,29 @@ ) +def _serialize_calc_constraints(source) -> list[dict]: + """Translate a parser-shaped constraint list into the TCKDB payload shape. + + ``source`` is whatever ``arc/output.py`` attached to the record (a + list of parser dicts) or what the caller passed explicitly. Empty / + None / unrecognised input produces ``[]``. Wraps + ``arc.tckdb.constraints.serialize_constraints`` so per-calc parse + failures never bubble up into payload generation. + """ + if not source: + return [] + if not isinstance(source, (list, tuple)): + logger.warning("TCKDB constraints: expected list, got %s; emitting [].", + type(source).__name__) + return [] + try: + return serialize_constraints(source) + except Exception as exc: # defensive — serializer logs internally too + logger.warning("TCKDB constraints: serialization failed: %s; emitting [].", + exc) + return [] + + logger = logging.getLogger("arc") CONFORMER_UPLOAD_ENDPOINT = "/uploads/conformers" @@ -62,12 +103,20 @@ COMPUTED_SPECIES_ENDPOINT = "/uploads/computed-species" COMPUTED_SPECIES_KIND = "computed_species" +# Computed-reaction bundle endpoint. Like computed-species but adds +# reactant/product species blocks, an inline transition_state, and a +# modified-Arrhenius kinetics block with producer-declared +# source_calculations. 
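+# Sketch of the request body this module assembles (top-level keys per
+# _build_computed_reaction_payload; labels hypothetical):
+#   {"species": [...], "reactant_keys": ["r0_CHO"],
+#    "product_keys": ["p0_CO", "p1_H2"], "transition_state": {...},
+#    "kinetics": [{...}], "reaction_family": "...",
+#    "workflow_tool_release": {"name": "ARC", ...}}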
+COMPUTED_REACTION_ENDPOINT = "/uploads/computed-reaction" +COMPUTED_REACTION_KIND = "computed_reaction" + # Local calculation-key namespace within a computed-species bundle. # These keys are referenced from `depends_on.parent_calculation_key` and # `thermo.source_calculations[].calculation_key`. They have no relation # to TCKDB-assigned calculation_ids — the bundle endpoint mints those # server-side and returns them in the response. _CALC_KEY_OPT = "opt" +_CALC_KEY_OPT_COARSE = "opt_coarse" _CALC_KEY_FREQ = "freq" _CALC_KEY_SP = "sp" @@ -77,19 +126,38 @@ # `arc/output.py::_spc_to_dict`. _LOG_FIELD_BY_CALC_KEY = { _CALC_KEY_OPT: "opt_log", + _CALC_KEY_OPT_COARSE: "coarse_opt_log", _CALC_KEY_FREQ: "freq_log", _CALC_KEY_SP: "sp_log", + # ts_guess role is method-dispatched at lookup time (see + # ``_resolve_log_field``) because the producing TS-guess adapter + # determines which record field carries the log path + # (``neb_log`` for orca_neb, ``gsm_log`` for xtb_gsm). The gate in + # ``_build_ts_block`` is what decides whether the calc is emitted + # at all. } # Same shape, for input-deck paths. `arc/output.py` emits these per-job # (with per-job software → per-job filename), only when the deck file # actually exists on disk; null when the deck wasn't kept (archived runs). +# (No coarse_opt_input field today — coarse jobs share the engine's +# input filename with the fine opt; the deck is keyed off the log dir.) _INPUT_FIELD_BY_CALC_KEY = { _CALC_KEY_OPT: "opt_input", _CALC_KEY_FREQ: "freq_input", _CALC_KEY_SP: "sp_input", } +# Held-fixed coordinate constraints emitted onto the species record by +# ``arc/output.py::_spc_to_dict`` (one list per ESS-job calc). Scan calcs +# carry their constraints inline on the ``additional_calculations`` entry +# rather than via species_record, so they're not in this map. +_CONSTRAINTS_FIELD_BY_CALC_KEY = { + _CALC_KEY_OPT: "opt_constraints", + _CALC_KEY_FREQ: "freq_constraints", + _CALC_KEY_SP: "sp_constraints", +} + # (artifact_kind, record-field-map) pairs that the inline-artifact # helper iterates per calc. Keeping the mapping data-driven so adding # checkpoints (or any future kind) is one tuple, not a code branch. @@ -98,6 +166,152 @@ ("input", _INPUT_FIELD_BY_CALC_KEY), ) +# TS-side calc role; reaction bundles can also carry an irc on the TS, +# but ARC may not always have one parsed cleanly, so it's optional. +_CALC_KEY_IRC = "irc" + +# TS guess parent calculation (only emitted when the chosen guess was +# itself a real path-search calculation — orca_neb or xtb_gsm). +# Geometry-only guesses (heuristics, AutoTST/KinBot wrappers, GCN, +# user-supplied XYZ) stay as ``calculation_input_geometry`` and never +# produce a calc node here, per the calculation_dependency contract: +# parent must be a real calc. +_CALC_KEY_TS_GUESS = "ts_guess" + +# ARC's TSGuess.method strings → TCKDB ``path_search_result.method`` +# enum value. Lookup is case- and whitespace-insensitive at the call +# site (producer strings are stable identifiers but historically have +# casing variations like ``orca_neb`` vs ``ORCA_NEB`` and module-name +# vs class-name forms ``xtb_gsm`` vs ``xTB-GSM``). Geometry-only +# methods (heuristics, AutoTST, KinBot, GCN, user XYZ) are deliberately +# absent — they have no path-search log to anchor a parent calc. +_TS_GUESS_PATH_SEARCH_METHODS: dict[str, str] = { + "orca_neb": "neb", + "xtb_gsm": "gsm", + "xtb-gsm": "gsm", +} + +# Per-method record-field name carrying the path-search log path. 
+# ``neb_log`` is populated by ``arc/output.py`` from ``paths['neb']`` +# (set by the scheduler from ``tsg.log_path`` when orca_neb wins). +# ``gsm_log`` is reserved for the GSM equivalent — xtb_gsm does not +# currently set ``tsg.log_path``, so the field is unpopulated today. +# When the producer wires it up, the gate below activates without any +# adapter changes. +_TS_GUESS_LOG_FIELD_BY_METHOD: dict[str, str] = { + "neb": "neb_log", + "gsm": "gsm_log", +} + + +def _resolve_ts_guess_path_search(method: object) -> str | None: + """Return the TCKDB ``path_search_result.method`` enum value for an + ARC TSGuess.method string, or ``None`` for geometry-only / unknown + methods. + + Conservative: only matches strings explicitly listed in + ``_TS_GUESS_PATH_SEARCH_METHODS`` (case- and whitespace-insensitive). + Non-string and unknown values return ``None`` — the caller falls + back to geometry-only TS provenance (no parent calc emitted). + """ + if not isinstance(method, str): + return None + return _TS_GUESS_PATH_SEARCH_METHODS.get(method.strip().lower()) + + +def _resolve_log_field(role: str, record: Mapping[str, Any]) -> str | None: + """Resolve the species-record field carrying the output-log path for ``role``. + + Ordinary roles dispatch through ``_LOG_FIELD_BY_CALC_KEY``. The + ``ts_guess`` role is method-dispatched: the field name depends on + which TS-guess adapter produced the chosen guess (orca_neb → + ``neb_log``, xtb_gsm → ``gsm_log``). Returns ``None`` when the role + has no log mapping or the TS-guess method is geometry-only. + """ + if role == _CALC_KEY_TS_GUESS: + method_enum = _resolve_ts_guess_path_search(record.get("chosen_ts_method")) + if method_enum is None: + return None + return _TS_GUESS_LOG_FIELD_BY_METHOD.get(method_enum) + return _LOG_FIELD_BY_CALC_KEY.get(role) + +# Bundle-local calc role for rotor scans. Per-rotor keys are minted as +# ``f"{_CALC_KEY_SCAN}_rotor_{i}"`` by ``arc/output.py``; the adapter +# accepts whatever key the species record provides without re-minting, +# so the source-of-truth for naming stays on the producer side. +_CALC_KEY_SCAN = "scan" + +# ARC kinetics-unit string → TCKDB enum string. Source of truth for the +# enum values is TCKDB's ``ArrheniusAUnits`` and ``ActivationEnergyUnits`` +# (backend/app/db/models/common.py). The ARC side normalizes whitespace +# and quotes before lookup so cosmetic variations don't miss. +_ARC_TO_TCKDB_A_UNITS: dict[str, str] = { + "s^-1": "per_s", + "1/s": "per_s", + "cm^3/(mol*s)": "cm3_mol_s", + "cm^3/(molecule*s)": "cm3_molecule_s", + "m^3/(mol*s)": "m3_mol_s", + "cm^6/(mol^2*s)": "cm6_mol2_s", + "cm^6/(molecule^2*s)": "cm6_molecule2_s", + "m^6/(mol^2*s)": "m6_mol2_s", +} + +_ARC_TO_TCKDB_EA_UNITS: dict[str, str] = { + "j/mol": "j_mol", + "kj/mol": "kj_mol", + "cal/mol": "cal_mol", + "kcal/mol": "kcal_mol", +} + + +def _normalize_unit_key(text: str | None) -> str | None: + """Lowercase + strip whitespace so ``" kJ/mol "`` matches ``"kj/mol"``. + + The ARC unit strings come from RMG and aren't perfectly consistent in + case/whitespace; the TCKDB enums are. This is the boundary helper. + """ + if text is None: + return None + s = str(text).strip().lower() + return s or None + + +def arc_to_tckdb_a_units(arc_units: str | None) -> str | None: + """Map an ARC ``A_units`` string to a TCKDB ``ArrheniusAUnits`` enum value. + + Returns ``None`` for null/empty input or unrecognized strings — the + caller decides whether to omit the field or raise. 
Unrecognized + units log a debug line so a producer with a typo can spot it + without spamming warnings on every reaction. + """ + key = _normalize_unit_key(arc_units) + if key is None: + return None + enum_value = _ARC_TO_TCKDB_A_UNITS.get(key) + if enum_value is None: + logger.debug( + "TCKDB kinetics: unrecognized A_units %r; field will be omitted.", + arc_units, + ) + return enum_value + + +def arc_to_tckdb_ea_units(arc_units: str | None) -> str | None: + """Map an ARC ``Ea_units`` (or ``dEa_units``) string to a TCKDB ``ActivationEnergyUnits`` enum value. + + Same null-or-unknown → ``None`` policy as :func:`arc_to_tckdb_a_units`. + """ + key = _normalize_unit_key(arc_units) + if key is None: + return None + enum_value = _ARC_TO_TCKDB_EA_UNITS.get(key) + if enum_value is None: + logger.debug( + "TCKDB kinetics: unrecognized Ea_units %r; field will be omitted.", + arc_units, + ) + return enum_value + @dataclass(frozen=True) class UploadOutcome: @@ -254,6 +468,24 @@ def submit_artifacts_for_calculation( species_label = species_record.get("label") or "unlabeled" artifact_cfg = self._config.artifacts + # Defense-in-depth: in bundle modes the bundle payload already + # carries input/output_log artifacts inline under each calc, so a + # standalone POST to /calculations/{id}/artifacts is redundant + # AND fragile — the integer ``calculation_id`` baked into the + # sidecar goes stale on DB resets, producing 404 cascades on + # replay. The sweep already gates this at the call site; the + # adapter-level skip protects future callers from regressing + # the invariant. + if ( + self._config.upload_mode in _BUNDLE_MODES_WITH_INLINE_ARTIFACTS + and kind in IMPLEMENTED_ARTIFACT_KINDS + ): + return _skip( + calculation_id, kind, + f"upload_mode={self._config.upload_mode!r} carries kind={kind!r} " + "inline in the bundle; standalone artifact upload suppressed", + ) + if not artifact_cfg.upload: return _skip(calculation_id, kind, "artifacts.upload is False") if kind not in artifact_cfg.kinds: @@ -422,6 +654,15 @@ def _build_computed_species_payload( "conformers": [conformer_block], } + applied_corrections = _build_applied_energy_corrections( + species_record.get("applied_energy_corrections") or [], + source_calculation_key=( + _CALC_KEY_SP if _CALC_KEY_SP in included_keys else None + ), + ) + if applied_corrections: + bundle["applied_energy_corrections"] = applied_corrections + thermo_block = _build_thermo_block( species_record.get("thermo"), included_calc_keys=included_keys, @@ -432,15 +673,31 @@ def _build_computed_species_payload( # Workflow-tool release at bundle level (in addition to per-calc): # mirrors what the conformer adapter records and lets the server # tag the species_entry with the producer. - arc_version = output_doc.get("arc_version") - arc_git_commit = output_doc.get("arc_git_commit") - if arc_version or arc_git_commit: - wt: dict[str, Any] = {"name": "ARC"} - if arc_version: - wt["version"] = str(arc_version) - if arc_git_commit: - wt["git_commit"] = str(arc_git_commit) - bundle["workflow_tool_release"] = wt + arc_wt = _arc_workflow_tool_release(output_doc) + if arc_wt is not None: + bundle["workflow_tool_release"] = arc_wt + + # Statmech block: carries frequency-scale-factor provenance, + # plus richer base statmech metadata (external_symmetry, + # is_linear, rigid_rotor_kind, statmech_treatment, point_group) + # and slim torsion summaries when ARC populated them in the + # species record's ``statmech`` subdict (written by + # ``arc/output.py::_statmech_to_dict``). 
The container is omitted + # entirely when nothing useful resolves — no empty containers. + # Computed-species emits unscoped role keys (``opt`` / ``freq`` + # / ``sp``) directly as the bundle-local calc keys. + species_calc_keys_by_role: dict[str, str] = { + role: role for role in (_CALC_KEY_OPT, _CALC_KEY_FREQ, _CALC_KEY_SP) + if role in included_keys + } + statmech_block = _build_statmech_block_for_species( + output_doc=output_doc, + species_record=species_record, + calc_keys_by_role=species_calc_keys_by_role, + workflow_tool_release=arc_wt, + ) + if statmech_block is not None: + bundle["statmech"] = statmech_block return bundle @@ -455,7 +712,39 @@ def _build_conformer_block( The keys list is what's actually present in the bundle's calc namespace, used to drive thermo's source_calculations links. + + The conformer's reference xyz (the optimized geometry) is + normalized once and threaded through to ``_build_calc_in_bundle`` + so freq + sp can declare it as their explicit input geometry — + ARC's invariant guarantees they ran on this geometry. Server + auto-fill would also reach the same answer for freq/sp, but + being explicit makes the bundle self-describing. """ + # Required: the bundle's ConformerInBundle.geometry, also reused + # as the explicit input_geometries entry on freq + sp. + conformer_xyz_text = _require_xyz_text(species_record) + + # Coarse-opt provenance. Emitted only when ``coarse_opt_log`` + # exists AND ``coarse_opt_output_xyz`` is populated — the + # output xyz is what chains the fine opt to the coarse opt + # (it's both opt_coarse's output and the fine opt's declared + # input). When the coarse log existed but its geometry didn't + # parse cleanly, ``arc/output.py`` deliberately leaves + # ``coarse_opt_output_xyz`` null so we fall back to single-stage + # bundle shape rather than emit a half-described opt_coarse. + opt_coarse_calc = self._build_opt_coarse_calc( + output_doc=output_doc, species_record=species_record, + ) + + # Fine opt's depends_on: gain an ``optimized_from → opt_coarse`` + # edge when coarse was emitted. Otherwise no upstream calc. + fine_opt_depends_on: list[Mapping[str, Any]] | None = None + if opt_coarse_calc is not None: + fine_opt_depends_on = [ + {"parent_calculation_key": _CALC_KEY_OPT_COARSE, + "role": "optimized_from"} + ] + primary_calc = self._build_calc_in_bundle( output_doc=output_doc, species_record=species_record, @@ -465,12 +754,20 @@ def _build_conformer_block( ess_job_key="opt", result_field="opt_result", result_payload=_opt_result_payload(species_record), - depends_on=None, + depends_on=fine_opt_depends_on, tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, ) included: list[str] = [_CALC_KEY_OPT] additional: list[dict[str, Any]] = [] + # Coarse opt is an *additional* calc, not primary — fine opt is + # the geometry of record. Order matters for thermo-source-link + # determinism: coarse goes first so the bundle reads + # opt → opt_coarse → freq → sp in additional_calculations. 
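+        # Resulting conformer block shape (labels hypothetical):
+        #   primary_calculation: opt
+        #   additional_calculations: [opt_coarse?, freq?, sp?, scan_rotor_*...]
+        # with depends_on edges opt → opt_coarse (optimized_from),
+        # freq → opt (freq_on), sp → opt (single_point_on),
+        # scans → opt (scan_parent); '?' marks optional entries.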
+ if opt_coarse_calc is not None: + additional.append(opt_coarse_calc) + included.append(_CALC_KEY_OPT_COARSE) freq_result = _freq_result_payload(species_record) if freq_result is not None: @@ -486,6 +783,7 @@ def _build_conformer_block( result_payload=freq_result, depends_on=[{"parent_calculation_key": _CALC_KEY_OPT, "role": "freq_on"}], tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, )) included.append(_CALC_KEY_FREQ) except ValueError as exc: @@ -510,6 +808,7 @@ def _build_conformer_block( tckdb_origin=( _reused_origin("opt") if _sp_is_reused_from_opt(output_doc) else None ), + conformer_xyz_text=conformer_xyz_text, )) included.append(_CALC_KEY_SP) except ValueError as exc: @@ -518,9 +817,53 @@ def _build_conformer_block( species_record.get("label"), exc, ) + # Rotor scans. ``arc/output.py`` populates the species record's + # ``additional_calculations`` list with one entry per successful + # 1D rotor whose log was parseable. Each entry already carries a + # bundle-local ``key`` (``scan_rotor_``) and a TCKDB-shaped + # ``scan_result`` dict; the adapter only needs to layer level / + # software / workflow_tool_release on top, with opt-level + # fallback because rotors_dict has no per-scan level field. The + # ``depends_on`` edge points back to opt — the scan is a series + # of constrained reoptimizations from that geometry. + for scan_entry in (species_record.get("additional_calculations") or []): + if not isinstance(scan_entry, Mapping): + continue + if scan_entry.get("type") != _CALC_KEY_SCAN: + continue + scan_key = scan_entry.get("key") + scan_result = scan_entry.get("scan_result") + if not isinstance(scan_key, str) or not scan_key: + continue + if not isinstance(scan_result, Mapping): + continue + try: + additional.append(self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=scan_key, + calc_type=_CALC_KEY_SCAN, + level_kind="opt", + ess_job_key="opt", + result_field="scan_result", + result_payload=scan_result, + depends_on=[{"parent_calculation_key": _CALC_KEY_OPT, + "role": "scan_parent"}], + tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, + calc_role=_CALC_KEY_SCAN, + source_constraints=scan_entry.get("constraints"), + )) + included.append(scan_key) + except ValueError as exc: + logger.warning( + "TCKDB computed-species: scan calculation %s skipped for label=%s: %s", + scan_key, species_record.get("label"), exc, + ) + block: dict[str, Any] = { "key": conformer_key, - "geometry": {"xyz_text": _require_xyz_text(species_record)}, + "geometry": {"xyz_text": conformer_xyz_text}, "primary_calculation": primary_calc, "additional_calculations": additional, } @@ -542,12 +885,37 @@ def _build_calc_in_bundle( result_payload: Mapping[str, Any] | None, depends_on: list[Mapping[str, Any]] | None, tckdb_origin: Mapping[str, Any] | None, + conformer_xyz_text: str | None = None, + calc_role: str | None = None, + source_constraints: list | None = None, ) -> dict[str, Any]: """Build one CalculationInBundle dict. + ``calc_key`` is the bundle-local identity (e.g. ``"opt"`` for a + single-species bundle, ``"r0_opt"`` for a reaction bundle). + ``calc_role`` is the chemistry role used for policy lookups + (input/output geometry rules, log-field map). When ``calc_role`` + is ``None`` it defaults to ``calc_key`` — that's the + single-species case where the two are identical and the existing + callers don't have to change. 
+ Reuses :meth:`_calculation_payload` for the level/software/result plumbing, then layers on the bundle-specific fields: ``key``, - ``depends_on``, and inline ``artifacts``. + ``depends_on``, ``input_geometries``, and inline ``artifacts``. + + Input-geometry policy (matches TCKDB v0 backend): + - ``opt``: emit ``input_geometries`` only when ARC has the + actual pre-opt xyz on the record (``opt_input_xyz``). Never + fall back to the conformer's optimized geometry — that's + opt's *output*, not its input. If absent, the field is + omitted; backend has no auto-fill for opt. + - ``freq`` / ``sp``: emit ``input_geometries`` set to the + conformer's optimized geometry (passed in via + ``conformer_xyz_text``). ARC's invariant guarantees these + ran on that geometry. Backend would auto-fill the same value + if we omitted, but explicit is better — keeps the bundle + self-describing and removes ambiguity for any consumer that + doesn't replicate the auto-fill rule. """ level = _resolve_level(output_doc, level_kind) calc = self._calculation_payload( @@ -562,29 +930,225 @@ def _build_calc_in_bundle( calc["key"] = calc_key if depends_on: calc["depends_on"] = [dict(d) for d in depends_on] - artifacts = self._inline_artifacts_for_calc(species_record, calc_key=calc_key) + role = calc_role if calc_role is not None else calc_key + input_geometries = self._input_geometries_for_calc( + calc_role=role, + species_record=species_record, + conformer_xyz_text=conformer_xyz_text, + ) + if input_geometries: + calc["input_geometries"] = input_geometries + output_geometries = self._output_geometries_for_calc( + calc_role=role, + species_record=species_record, + conformer_xyz_text=conformer_xyz_text, + ) + if output_geometries: + calc["output_geometries"] = output_geometries + artifacts = self._inline_artifacts_for_calc(species_record, calc_role=role) # Schema defaults `artifacts: []`. Emit explicitly only when we have # bytes to send (or when artifact upload is enabled and we want to # signal "no log available" with an empty list); omit otherwise. if artifacts: calc["artifacts"] = artifacts + + # Held-fixed coordinate constraints. ``source_constraints`` wins + # when caller supplies it explicitly (scan calcs pull from the + # ``additional_calculations`` entry). Otherwise we pick up the + # per-job list ``arc/output.py`` populated on the species record. + constraints_source = source_constraints + if constraints_source is None: + field = _CONSTRAINTS_FIELD_BY_CALC_KEY.get(role) + if field: + constraints_source = species_record.get(field) + constraint_payload = _serialize_calc_constraints(constraints_source) + if constraint_payload: + calc["constraints"] = constraint_payload return calc + @staticmethod + def _output_geometries_for_calc( + *, + calc_role: str, + species_record: Mapping[str, Any], + conformer_xyz_text: str | None, + ) -> list[dict[str, Any]]: + """Compute the ``output_geometries`` list for one calc, per-kind policy. + + Each entry is shaped ``{"geometry": {"xyz_text": ...}, "role": + "final"}`` per TCKDB's ``OutputGeometryEntry``. Returns ``[]`` + when nothing should be emitted; the caller drops empty lists. + + Policy mirrors what the calc actually produced: + + - ``opt`` (fine): produced the conformer's geometry of record. + Emit ``conformer_xyz_text`` with ``role=final``. 
If absent + (e.g., bundle-build path that skipped requiring xyz), the + backend still has its single-stage fallback that links opt + to the conformer geometry — but that fallback is server- + side and shouldn't be relied on once we're declaring outputs + explicitly elsewhere. + - ``opt_coarse``: produced ``coarse_opt_output_xyz``. Emit it + with ``role=final``. + - ``freq`` / ``sp``: don't move atoms; we don't surface a + standalone "freq output geometry" or "sp output geometry" + today. Backend's freq/sp fallback now creates zero output + rows for these (per the new contract), so omitting is correct. + """ + if calc_role == _CALC_KEY_OPT: + if not conformer_xyz_text: + return [] + return [{"geometry": {"xyz_text": conformer_xyz_text}, "role": "final"}] + if calc_role == _CALC_KEY_OPT_COARSE: + coarse_out = species_record.get("coarse_opt_output_xyz") + if not coarse_out: + return [] + normalized = _normalize_xyz_text(coarse_out, species_record.get("label")) + if not normalized: + return [] + return [{"geometry": {"xyz_text": normalized}, "role": "final"}] + # freq / sp / irc / others: no output_geometries today. + return [] + + @staticmethod + def _input_geometries_for_calc( + *, + calc_role: str, + species_record: Mapping[str, Any], + conformer_xyz_text: str | None, + ) -> list[dict[str, Any]]: + """Compute the ``input_geometries`` list for one calc, per-kind policy. + + Returns ``[]`` when nothing should be emitted. The caller checks + truthiness — empty lists are dropped from the payload so we don't + send a zero-length array where None is more accurate. + """ + if calc_role == _CALC_KEY_OPT: + opt_input_xyz = species_record.get("opt_input_xyz") + if not opt_input_xyz: + return [] + normalized = _normalize_xyz_text(opt_input_xyz, species_record.get("label")) + if not normalized: + return [] + return [{"xyz_text": normalized}] + if calc_role == _CALC_KEY_OPT_COARSE: + # Coarse opt's input is the species' truly-initial xyz — + # ``coarse_opt_input_xyz`` from arc/output.py. The caller + # only invokes this when the coarse stage actually ran, so + # an absent value here is a real bug; skip rather than + # fabricate (the parent ``_build_opt_coarse_calc`` short- + # circuits before reaching us if either coarse field is + # missing). + coarse_in = species_record.get("coarse_opt_input_xyz") + if not coarse_in: + return [] + normalized = _normalize_xyz_text(coarse_in, species_record.get("label")) + return [{"xyz_text": normalized}] if normalized else [] + if calc_role in (_CALC_KEY_FREQ, _CALC_KEY_SP, _CALC_KEY_IRC): + # ARC invariant: freq, sp, and (TS) irc all run on the + # conformer's optimized xyz. Surface it explicitly rather + # than relying on backend auto-fill — keeps the bundle + # self-describing. + if not conformer_xyz_text: + return [] + return [{"xyz_text": conformer_xyz_text}] + return [] + + def _build_opt_coarse_calc( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + calc_key: str = _CALC_KEY_OPT_COARSE, + ) -> dict[str, Any] | None: + """Build the coarse opt's CalculationInBundle dict, or return None. + + ``calc_key`` is the bundle-local identity. Computed-species uses + the bare role (``"opt_coarse"``); computed-reaction passes a + species-namespaced variant (``"r0_opt_coarse"``, ``"p1_opt_coarse"``) + so it stays globally unique across the bundle. 
The chemistry
+        role passed to ``_build_calc_in_bundle`` stays
+        ``_CALC_KEY_OPT_COARSE`` either way — it's what drives the
+        ``coarse_opt_log`` / ``coarse_opt_input_xyz`` lookups on the
+        species record.
+
+        Returns ``None`` when:
+        - ``coarse_opt_log`` is absent (no coarse stage ran)
+        - ``coarse_opt_output_xyz`` is null (coarse log existed but its
+          final geometry couldn't be parsed; modeling the calc without
+          its output geometry would create a half-described provenance
+          row, so we drop the whole opt_coarse rather than mislead)
+
+        Note on output geometry: ``_output_geometries_for_calc`` now
+        declares ``coarse_opt_output_xyz`` explicitly as opt_coarse's
+        ``role=final`` output, so the server-side fallback (anchoring
+        every calc's ``CalculationOutputGeometry`` to the conformer,
+        i.e. the FINE opt's, geometry) no longer applies to it. The
+        input chain (``calculation_input_geometry``) comes from
+        ``coarse_opt_input_xyz``; that chain is what was empty before
+        this change.
+        """
+        if not species_record.get("coarse_opt_log"):
+            return None
+        if not species_record.get("coarse_opt_output_xyz"):
+            logger.debug(
+                "TCKDB: coarse_opt_log present but coarse_opt_output_xyz "
+                "is null for label=%s — falling back to single-stage opt "
+                "bundle.",
+                species_record.get("label"),
+            )
+            return None
+        result_payload = _coarse_opt_result_payload(species_record)
+        # Reuse the standard calc-in-bundle builder. opt_coarse takes
+        # the same level/software as the fine opt (ARC runs both
+        # stages at the configured opt level — the difference is
+        # convergence criterion, not method/basis). No depends_on
+        # (it's the chain head). No tckdb_origin (it's a real ESS run,
+        # not a reuse of another calc's result).
+        try:
+            return self._build_calc_in_bundle(
+                output_doc=output_doc,
+                species_record=species_record,
+                calc_key=calc_key,
+                calc_role=_CALC_KEY_OPT_COARSE,
+                calc_type="opt",
+                level_kind="opt",
+                ess_job_key="opt",
+                result_field="opt_result",
+                result_payload=result_payload,
+                depends_on=None,
+                tckdb_origin=None,
+                conformer_xyz_text=None,  # opt_coarse's input is its own xyz, not the conformer
+            )
+        except ValueError as exc:
+            logger.warning(
+                "TCKDB: opt_coarse calculation skipped for label=%s "
+                "(calc_key=%s): %s",
+                species_record.get("label"), calc_key, exc,
+            )
+            return None
+
     def _inline_artifacts_for_calc(
         self,
         species_record: Mapping[str, Any],
         *,
-        calc_key: str,
+        calc_role: str,
     ) -> list[dict[str, Any]]:
         """Return the inline artifact list for one calc within a bundle.
 
+        ``calc_role`` is the chemistry role (``opt``/``freq``/``sp``/
+        ``opt_coarse``/``irc``) used to look up the matching record
+        field name (``opt_log`` etc.). The bundle-local key — e.g.
+        ``r0_opt`` for a reaction — is irrelevant here; ARC stores the
+        log paths on the species record under role-keyed names.
+
         Iterates ``_INLINE_ARTIFACT_SOURCES`` (currently ``output_log``
         and ``input``) and emits one ArtifactIn dict per kind whose
         record path resolves to a real file on disk. Each kind is
         independently gated on ``config.artifacts.kinds``, so a user can
         opt into logs but not decks (or vice versa).
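+        For example (a hypothetical config), ``kinds={"output_log"}``
+        emits each calc's log but never its input deck, even when the
+        deck path resolves on disk.
+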
- Skip rules per (calc_key, kind): + Skip rules per (calc_role, kind): - artifacts globally disabled → entire list = [] - kind not in config.artifacts.kinds → that kind only - record-field path missing / null → that kind only @@ -598,11 +1162,18 @@ def _inline_artifacts_for_calc( for kind, field_map in _INLINE_ARTIFACT_SOURCES: if kind not in artifact_cfg.kinds: continue + # ``output_log`` for ``ts_guess`` is method-dispatched + # (NEB → ``neb_log``, GSM → ``gsm_log``); other (kind, role) + # combinations look up in the static map. + if kind == "output_log": + record_field = _resolve_log_field(calc_role, species_record) + else: + record_field = field_map.get(calc_role) artifact = self._read_inline_artifact( species_record, - calc_key=calc_key, + calc_role=calc_role, kind=kind, - record_field=field_map.get(calc_key), + record_field=record_field, ) if artifact is not None: artifacts.append(artifact) @@ -612,7 +1183,7 @@ def _read_inline_artifact( self, species_record: Mapping[str, Any], *, - calc_key: str, + calc_role: str, kind: str, record_field: str | None, ) -> dict[str, Any] | None: @@ -631,16 +1202,16 @@ def _read_inline_artifact( resolved = self._resolve_local_path(path_value) if resolved is None or not resolved.is_file(): logger.debug( - "TCKDB computed-species: %s %s artifact missing on disk for %s (path=%s)", - calc_key, kind, species_record.get("label"), path_value, + "TCKDB bundle: %s %s artifact missing on disk for %s (path=%s)", + calc_role, kind, species_record.get("label"), path_value, ) return None size_bytes = resolved.stat().st_size max_bytes = self._config.artifacts.max_size_mb * 1024 * 1024 if size_bytes > max_bytes: logger.warning( - "TCKDB computed-species: %s %s %s skipped (%s bytes > %s MB cap)", - calc_key, kind, resolved.name, size_bytes, + "TCKDB bundle: %s %s %s skipped (%s bytes > %s MB cap)", + calc_role, kind, resolved.name, size_bytes, self._config.artifacts.max_size_mb, ) return None @@ -665,100 +1236,923 @@ def _resolve_local_path(self, file_path: str | Path) -> Path | None: return Path(self._project_directory) / path return path.resolve() + def _parse_irc_trajectories( + self, + ts_record: Mapping[str, Any], + ) -> list[dict[str, Any]] | None: + """Parse each IRC log into a trajectory dict for payload assembly. + + Returns one dict per parsed log: + ``{ + "direction": "forward"|"reverse"|None, + "rich_points": [, ...] | None, + "geom_points": [, ...] | None, + }`` + + Each log is first attempted with the rich parser + (:func:`arc.parser.parser.parse_irc_path`) which carries energies, + gradients, reaction coordinates, and per-point direction labels. + On rich-parser failure (or for ESS backends that haven't + implemented it), the geometry-only :func:`parse_irc_traj` runs + as a fallback so the upload retains the geometry-only IRC payload + behavior described in the task spec. + + Direction-resolution order (used when a per-point direction + isn't present in the rich data): + 1. Rich parser's per-point ``direction`` (Gaussian's + ``FORWARD/REVERSE path direction.`` announcement). + 2. ``ts_record['irc_log_directions'][i]`` — the authoritative + value the scheduler captured from ``job.irc_direction`` and + ``arc/output.py`` paired with ``irc_logs``. Production ARC + IRC log filenames are just ``output.log`` inside an + ``irc_`` folder, so filename detection alone + always returns None on real runs. + 3. 
``_detect_irc_direction(filename)`` — back-compat fallback + for output.yml files written before the paired-list + tracking, and for test fixtures that hand-craft directional + filenames. + 4. ``None`` — last resort. The trajectory is still emitted; + per-point direction is omitted (the schema allows nullable + ``IRCDirection`` on each point). + + Returns ``None`` when no logs resolve, no logs exist on disk, or + every parse attempt failed. The caller uses ``None`` as the + signal to omit ``irc_result`` (partial-data fallback per spec). + """ + log_paths = ts_record.get("irc_logs") or [] + if not log_paths: + return None + # Lazy import: parser imports drag in heavyweight ESS adapters; + # the adapter module is loaded even when IRC isn't in play. + from arc.parser.parser import parse_irc_path, parse_irc_traj + + log_directions = list(ts_record.get("irc_log_directions") or []) + trajectories: list[dict[str, Any]] = [] + for i, log_path in enumerate(log_paths): + resolved = self._resolve_local_path(log_path) + if resolved is None or not resolved.is_file(): + logger.debug( + "TCKDB computed-reaction: IRC log path not found on disk: %r", + log_path, + ) + continue + # Resolve the trajectory-level direction once: scheduler- + # tracked first, filename heuristic second, None last. This + # is used as a per-point fallback when the rich parser + # doesn't carry FORWARD/REVERSE labels (currently only + # Gaussian does). + direction = log_directions[i] if i < len(log_directions) else None + if direction not in (_IRC_DIRECTION_FORWARD, _IRC_DIRECTION_REVERSE): + direction = _detect_irc_direction(str(log_path)) + rich_points: list[dict[str, Any]] | None = None + try: + rich_points = parse_irc_path(log_file_path=str(resolved)) + except Exception as exc: + logger.debug( + "TCKDB computed-reaction: parse_irc_path failed for %s: %s", + resolved, exc, + ) + geom_points: list[dict[str, Any]] | None = None + if not rich_points: + # Geometry-only fallback so non-Gaussian logs (and + # malformed Gaussian logs) keep producing a usable IRC + # payload, matching the pre-rich-parser behavior. + try: + geom_points = parse_irc_traj(log_file_path=str(resolved)) + except Exception as exc: + logger.debug( + "TCKDB computed-reaction: parse_irc_traj failed for %s: %s", + resolved, exc, + ) + continue + if not geom_points: + continue + trajectories.append({ + "direction": direction, + "rich_points": rich_points, + "geom_points": geom_points, + }) + return trajectories or None + # ------------------------------------------------------------------ - # Payload construction + # Computed-reaction bundle path (POST /uploads/computed-reaction) # ------------------------------------------------------------------ - @staticmethod - def _species_entry_payload(record: Mapping[str, Any]) -> dict[str, Any]: - smiles = record.get("smiles") - if not smiles: - raise ValueError( - f"output.yml record for label={record.get('label')!r} has no smiles; " - "TCKDB upload requires a SMILES on the species_entry." 
- ) - is_ts = bool(record.get("is_ts")) - return { - "molecule_kind": "molecule", - "smiles": str(smiles), - "charge": int(record.get("charge", 0) or 0), - "multiplicity": int(record.get("multiplicity", 1) or 1), - "species_entry_kind": "transition_state" if is_ts else "minimum", - } - - def _build_payload( + def submit_computed_reaction_from_output( self, *, output_doc: Mapping[str, Any], - species_record: Mapping[str, Any], - ) -> dict[str, Any]: - species_entry = self._species_entry_payload(species_record) - geometry_payload = {"xyz_text": _require_xyz_text(species_record)} - primary, additional = self._build_calculations(output_doc, species_record) + reaction_record: Mapping[str, Any], + ) -> UploadOutcome | None: + """Build, write, and (if configured) upload one computed-reaction bundle. + + Walks the reaction's ``reactant_labels``/``product_labels`` against + ``output_doc['species']`` (and ``output_doc['transition_states']`` + for ``ts_label``) and assembles a self-contained + ``ComputedReactionUploadRequest`` covering species, TS, and + modified-Arrhenius kinetics — all cross-referenced by local + string keys, no DB ids. + + Returns ``None`` if the adapter is disabled. Build failures (e.g. + missing reactant species, missing opt level) raise; the caller is + responsible for wrapping the per-reaction call in a try/except so + one bad reaction doesn't take down the rest of the run. + """ + if not self._config.enabled: + return None - payload: dict[str, Any] = { - "species_entry": species_entry, - "geometry": geometry_payload, - "calculation": primary, - "scientific_origin": "computed", - } - if additional: - payload["additional_calculations"] = additional - label = species_record.get("label") - if label: - payload["label"] = str(label)[:64] - return payload + reaction_label = reaction_record.get("label") or "unlabeled" + project_label = self._config.project_label or output_doc.get("project") - @classmethod - def _build_calculations( - cls, + payload = self._build_computed_reaction_payload( + output_doc=output_doc, + reaction_record=reaction_record, + ) + + idempotency_inputs = IdempotencyInputs.from_payload( + project_label=project_label, + species_label=reaction_label, + # The reaction has no conformer concept at the bundle level — + # just one fit per upload — so the conformer slot in the key + # carries the TS label (or "noTS"). This keeps the key shape + # consistent with the species path while still uniquely + # identifying the reaction within a project. + conformer_label=str(reaction_record.get("ts_label") or "noTS"), + payload_kind=COMPUTED_REACTION_KIND, + payload=payload, + ) + idempotency_key = build_idempotency_key(idempotency_inputs) + + written = self._writer.write( + label=reaction_label, + payload=payload, + endpoint=COMPUTED_REACTION_ENDPOINT, + idempotency_key=idempotency_key, + payload_kind=COMPUTED_REACTION_KIND, + base_url=self._config.base_url, + subdir=PayloadWriter.COMPUTED_REACTION_SUBDIR, + ) + logger.info( + "TCKDB computed-reaction payload written: %s (key=%s)", + written.payload_path, + idempotency_key, + ) + + if not self._config.upload: + return self._finalize_skipped(written) + + return self._upload(written, payload, endpoint=COMPUTED_REACTION_ENDPOINT) + + def _build_computed_reaction_payload( + self, + *, output_doc: Mapping[str, Any], - record: Mapping[str, Any], - ) -> tuple[dict[str, Any], list[dict[str, Any]]]: - """Return (primary opt calculation, [freq, sp] additional calculations). 
+ reaction_record: Mapping[str, Any], + ) -> dict[str, Any]: + """Compose one ``ComputedReactionUploadRequest`` dict. - Additional calculations are skipped (with a warning) when their - result fields are absent or malformed, or when no level of theory - is available. Skipping an optional calc never fails the upload. + Resolves reactant/product/TS records from ``output_doc``, + delegates per-actor block construction (species + TS) to the + shared per-actor helpers, and stitches in a single + modified-Arrhenius kinetics fit when ARC produced one. """ - primary = cls._calculation_payload( - output_doc, - record, - calc_type="opt", - level=_resolve_level(output_doc, "opt"), - ess_job_key="opt", - result_field="opt_result", - result_payload=_opt_result_payload(record), - ) + species_index = _index_species(output_doc) + ts_index = _index_transition_states(output_doc) - additional: list[dict[str, Any]] = [] - freq_result = _freq_result_payload(record) - if freq_result is not None: - freq_level = _resolve_level(output_doc, "freq") - try: - additional.append( - cls._calculation_payload( - output_doc, - record, - calc_type="freq", - level=freq_level, - ess_job_key="freq", - result_field="freq_result", - result_payload=freq_result, - ) + reactant_labels = list(reaction_record.get("reactant_labels") or []) + product_labels = list(reaction_record.get("product_labels") or []) + if not reactant_labels: + raise ValueError( + f"reaction label={reaction_record.get('label')!r} has no reactant_labels." + ) + if not product_labels: + raise ValueError( + f"reaction label={reaction_record.get('label')!r} has no product_labels." + ) + + # Build reactant/product species blocks under namespaced keys. + species_blocks: list[dict[str, Any]] = [] + reactant_keys: list[str] = [] + product_keys: list[str] = [] + # Per-actor calc-role → bundle-key map; consumed by the kinetics + # builder so source_calculations references resolve to the same + # local keys the species blocks declared. + actor_calc_keys: dict[str, dict[str, str]] = {} + + for i, label in enumerate(reactant_labels): + actor_key = _local_key_for_actor("r", i, label) + calc_prefix = _calc_prefix_for_actor("r", i) + record = species_index.get(label) + if record is None: + raise ValueError( + f"reaction {reaction_record.get('label')!r}: reactant " + f"label {label!r} not found in output_doc.species." ) - except ValueError as exc: - logger.warning( - "TCKDB freq additional calculation skipped for label=%s: %s", - record.get("label"), exc, + block, calc_keys = self._build_reaction_species_block( + output_doc=output_doc, + species_record=record, + actor_key=actor_key, + calc_prefix=calc_prefix, + ) + species_blocks.append(block) + reactant_keys.append(actor_key) + actor_calc_keys[actor_key] = calc_keys + + for j, label in enumerate(product_labels): + actor_key = _local_key_for_actor("p", j, label) + calc_prefix = _calc_prefix_for_actor("p", j) + record = species_index.get(label) + if record is None: + raise ValueError( + f"reaction {reaction_record.get('label')!r}: product " + f"label {label!r} not found in output_doc.species." + ) + block, calc_keys = self._build_reaction_species_block( + output_doc=output_doc, + species_record=record, + actor_key=actor_key, + calc_prefix=calc_prefix, + ) + species_blocks.append(block) + product_keys.append(actor_key) + actor_calc_keys[actor_key] = calc_keys + + # TS block (inline). Optional — a reaction with no TS still + # carries kinetics but server-side it's a thinner record. 
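+        # (When no TS resolves, ts_calc_keys stays empty; the kinetics
+        # builder below then has only reactant/product calc keys to
+        # cite in source_calculations.)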
+ ts_label = reaction_record.get("ts_label") + ts_block: dict[str, Any] | None = None + ts_calc_keys: dict[str, str] = {} + if ts_label: + ts_record = ts_index.get(ts_label) + if ts_record is None: + raise ValueError( + f"reaction {reaction_record.get('label')!r}: ts_label " + f"{ts_label!r} not found in output_doc.transition_states." ) + ts_block, ts_calc_keys = self._build_ts_block( + output_doc=output_doc, + ts_record=ts_record, + ts_label=ts_label, + reaction_multiplicity=reaction_record.get("multiplicity"), + unmapped_smiles=_ts_unmapped_smiles_handle( + ts_record=ts_record, + reaction_record=reaction_record, + species_index=species_index, + ), + ) - sp_result = _sp_result_payload(record) - if sp_result is not None: - sp_level = _resolve_level(output_doc, "sp") - sp_origin = _reused_origin("opt") if _sp_is_reused_from_opt(output_doc) else None - try: - additional.append( + # Kinetics. ARC produces at most one fit per reaction today. + kinetics_payload = reaction_record.get("kinetics") + kinetics_blocks: list[dict[str, Any]] = [] + if isinstance(kinetics_payload, Mapping): + kinetics_block = _build_kinetics_block( + kinetics_record=kinetics_payload, + reactant_keys=reactant_keys, + product_keys=product_keys, + actor_calc_keys=actor_calc_keys, + ts_calc_keys=ts_calc_keys, + ) + if kinetics_block is not None: + kinetics_blocks.append(kinetics_block) + + bundle: dict[str, Any] = { + "species": species_blocks, + "reactant_keys": reactant_keys, + "product_keys": product_keys, + } + if ts_block is not None: + bundle["transition_state"] = ts_block + if kinetics_blocks: + bundle["kinetics"] = kinetics_blocks + + # Final pass: flatten every calc's wrapped result into the + # network_pdep flat fields the computed-reaction endpoint + # expects. Keeping this as a single end-of-build walker means + # any new calc-emit site automatically gets the right shape. + _flatten_all_reaction_calcs(bundle) + + family = reaction_record.get("family") + if family: + bundle["reaction_family"] = str(family) + # The server validates against a canonical-family list and + # demands a source_note when the supplied name is unknown. + # We don't have that list at the producer, so always tag + # the source — a no-op for canonical names, a safety net + # for non-canonical ones. + bundle["reaction_family_source_note"] = "ARC-reported family" + + arc_version = output_doc.get("arc_version") + arc_git_commit = output_doc.get("arc_git_commit") + if arc_version or arc_git_commit: + wt: dict[str, Any] = {"name": "ARC"} + if arc_version: + wt["version"] = str(arc_version) + if arc_git_commit: + wt["git_commit"] = str(arc_git_commit) + bundle["workflow_tool_release"] = wt + + return bundle + + def _build_reaction_species_block( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + actor_key: str, + calc_prefix: str, + ) -> tuple[dict[str, Any], dict[str, str]]: + """Build one ``BundleSpeciesIn`` dict + a calc-role → bundle-key map. + + ``actor_key`` (e.g. ``"r0_CHO"``) becomes the species block's + ``key`` and the geometry/conformer key tails. ``calc_prefix`` + (e.g. ``"r0"``) is the namespace for calculation keys — + deliberately shorter than ``actor_key`` so source_calculations + references stay compact. + + The returned map (e.g. ``{"opt": "r0_opt", "freq": "r0_freq", + "sp": "r0_sp"}``) is what the kinetics builder uses to wire + ``source_calculations`` back to the freshly-minted local keys. + It only contains the roles whose calculation actually made it + into the bundle. 
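+
+        Hypothetical example for the first reactant, label ``"CHO"``:
+        ``actor_key="r0_CHO"`` and ``calc_prefix="r0"``, so the block
+        emits conformer key ``r0_CHO_conf0``, geometry key
+        ``r0_CHO_geom``, and calc keys ``r0_opt`` / ``r0_freq`` /
+        ``r0_sp`` (plus ``r0_opt_coarse`` and ``r0_scan_rotor_*`` when
+        present).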
+ """ + conformer_xyz_text = _require_xyz_text(species_record) + opt_key = f"{calc_prefix}_{_CALC_KEY_OPT}" + opt_coarse_key = f"{calc_prefix}_{_CALC_KEY_OPT_COARSE}" + freq_key = f"{calc_prefix}_{_CALC_KEY_FREQ}" + sp_key = f"{calc_prefix}_{_CALC_KEY_SP}" + geom_key = f"{actor_key}_geom" + conf_key = f"{actor_key}_conf0" + + # Coarse-opt provenance, parallel to the computed-species path. + # Same gate (both ``coarse_opt_log`` and ``coarse_opt_output_xyz`` + # must be present) — when only one resolves, we fall back to + # single-stage rather than emit a half-described opt_coarse. + # Namespacing the calc key with ``calc_prefix`` keeps it globally + # unique across reactant/product/TS, which the bundle's + # ``validate_unique_keys`` requires. + opt_coarse_calc = self._build_opt_coarse_calc( + output_doc=output_doc, + species_record=species_record, + calc_key=opt_coarse_key, + ) + fine_opt_depends_on: list[Mapping[str, Any]] | None = None + if opt_coarse_calc is not None: + fine_opt_depends_on = [ + {"parent_calculation_key": opt_coarse_key, + "role": "optimized_from"} + ] + + primary_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=opt_key, + calc_role=_CALC_KEY_OPT, + calc_type="opt", + level_kind="opt", + ess_job_key="opt", + result_field="opt_result", + result_payload=_opt_result_payload(species_record), + depends_on=fine_opt_depends_on, + tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, + ) + + calc_keys: dict[str, str] = {_CALC_KEY_OPT: opt_key} + additional: list[dict[str, Any]] = [] + # opt_coarse is type=opt, so the schema validator at + # computed_reaction_upload.py:446 exempts it from needing + # geometry_key — leave it bare. + if opt_coarse_calc is not None: + additional.append(opt_coarse_calc) + calc_keys[_CALC_KEY_OPT_COARSE] = opt_coarse_key + + freq_result = _freq_result_payload(species_record) + if freq_result is not None: + try: + freq_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=freq_key, + calc_role=_CALC_KEY_FREQ, + calc_type="freq", + level_kind="freq", + ess_job_key="freq", + result_field="freq_result", + result_payload=freq_result, + depends_on=[{"parent_calculation_key": opt_key, "role": "freq_on"}], + tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, + ) + # Server requires non-opt species calcs to reference a + # conformer geometry by key (BundleSpeciesIn validator + # validate_calc_geometry_keys). + freq_calc["geometry_key"] = geom_key + additional.append(freq_calc) + calc_keys[_CALC_KEY_FREQ] = freq_key + except ValueError as exc: + logger.warning( + "TCKDB computed-reaction: %s freq calculation skipped: %s", + actor_key, exc, + ) + + sp_result = _sp_result_payload(species_record) + if sp_result is not None: + try: + sp_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=sp_key, + calc_role=_CALC_KEY_SP, + calc_type="sp", + level_kind="sp", + ess_job_key="sp", + result_field="sp_result", + result_payload=sp_result, + depends_on=[{"parent_calculation_key": opt_key, "role": "single_point_on"}], + tckdb_origin=( + _reused_origin("opt") if _sp_is_reused_from_opt(output_doc) else None + ), + conformer_xyz_text=conformer_xyz_text, + ) + sp_calc["geometry_key"] = geom_key + additional.append(sp_calc) + calc_keys[_CALC_KEY_SP] = sp_key + except ValueError as exc: + logger.warning( + "TCKDB computed-reaction: %s sp calculation skipped: %s", + actor_key, exc, + ) + + # Rotor scans. 
Mirrors ``_build_conformer_block``'s scan loop, + # with two reaction-path-specific deltas: + # + # - Calc keys are namespaced (``r0_scan_rotor_0``, etc.). The + # bundle schema's ``validate_unique_keys`` requires *globally* + # unique calc keys across all species + the TS, so two + # reactants both reporting ``scan_rotor_0`` would collide. + # The torsion's ``source_scan_calculation_key`` gets rewritten + # in lockstep via ``scan_key_renames`` so the reference still + # resolves. + # - ``geometry_key`` is set: non-opt species calcs in reaction + # bundles must point at the conformer geometry (same reason + # freq/sp set it above). + scan_key_renames: dict[str, str] = {} + for scan_entry in (species_record.get("additional_calculations") or []): + if not isinstance(scan_entry, Mapping): + continue + if scan_entry.get("type") != _CALC_KEY_SCAN: + continue + original_scan_key = scan_entry.get("key") + scan_result = scan_entry.get("scan_result") + if not isinstance(original_scan_key, str) or not original_scan_key: + continue + if not isinstance(scan_result, Mapping): + continue + namespaced_scan_key = f"{calc_prefix}_{original_scan_key}" + try: + scan_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=species_record, + calc_key=namespaced_scan_key, + calc_role=_CALC_KEY_SCAN, + calc_type=_CALC_KEY_SCAN, + level_kind="opt", + ess_job_key="opt", + result_field="scan_result", + result_payload=scan_result, + depends_on=[{"parent_calculation_key": opt_key, + "role": "scan_parent"}], + tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, + source_constraints=scan_entry.get("constraints"), + ) + scan_calc["geometry_key"] = geom_key + additional.append(scan_calc) + scan_key_renames[original_scan_key] = namespaced_scan_key + except ValueError as exc: + logger.warning( + "TCKDB computed-reaction: %s scan calculation %s skipped: %s", + actor_key, original_scan_key, exc, + ) + + species_block: dict[str, Any] = { + "key": actor_key, + "species_entry": self._species_entry_payload(species_record), + "conformers": [ + { + "key": conf_key, + "geometry": {"key": geom_key, "xyz_text": conformer_xyz_text}, + "calculation": primary_calc, + } + ], + "calculations": additional, + } + thermo_block = _build_thermo_block( + species_record.get("thermo"), + included_calc_keys=[ + # Pass the bundle-local keys so source_calculations links + # match the calc keys actually emitted in this species + # block. _build_thermo_block expects role names — we + # pre-translate to keys via the role→key map below. + # Order kept deterministic: opt, freq, sp. + calc_keys[role] + for role in (_CALC_KEY_OPT, _CALC_KEY_FREQ, _CALC_KEY_SP) + if role in calc_keys + ], + ) + if thermo_block is not None: + species_block["thermo"] = thermo_block + + # Per-species AEC/BAC corrections target this species's resolved + # species_entry. Anchor each correction to this species's own SP + # calc (e.g. r0_sp / p1_sp); never to ts_sp or to a sibling + # species's SP key — the server enforces ownership in + # ``_persist_species_applied_corrections`` and would 422 on a + # cross-species reference. 
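The ownership rule in the comment above reduces to a prefix check on the anchor key. A hedged sketch with a hypothetical `assert_owned_anchor` helper (not part of this patch) that fails producer-side instead of waiting for the server's 422:

def assert_owned_anchor(calc_prefix: str, source_calculation_key: str) -> None:
    # Corrections may only anchor to a calc owned by the same actor.
    if not source_calculation_key.startswith(f"{calc_prefix}_"):
        raise ValueError(
            f"{source_calculation_key!r} is not owned by actor {calc_prefix!r}"
        )

assert_owned_anchor("r0", "r0_sp")    # fine: this species's own SP calc
# assert_owned_anchor("r0", "ts_sp")  # would raise: cross-owner anchor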
+ applied_corrections = _build_applied_energy_corrections( + species_record.get("applied_energy_corrections") or [], + source_calculation_key=calc_keys.get(_CALC_KEY_SP), + ) + if applied_corrections: + species_block["applied_energy_corrections"] = applied_corrections + + # Statmech block: carries frequency-scale-factor provenance plus + # the schema-supported base statmech fields for this reactant/ + # product (external_symmetry, is_linear, rigid_rotor_kind, + # statmech_treatment, point_group, slim torsions, and + # source_calculations referencing this species's own opt/freq/ + # sp calcs). ``BundleStatmechIn`` accepts the same field set as + # ``StatmechInBundle``, so the shared builder produces the same + # shape; the only mode-specific input is the calc-key namespace + # — we pass the *species-scoped* ``calc_keys`` (e.g. + # ``{"opt": "r0_opt", ...}``) so the server-side ownership check + # sees only this species's own calculations. Sibling species + # and the TS use disjoint namespaces (``r1_*``/``p0_*``/``ts_*``) + # and therefore can't leak in here. The helper returns ``None`` + # (and we skip the whole statmech block) when nothing useful + # resolves, keeping payloads backward-compatible with FSF-less + # runs. + species_statmech = _build_statmech_block_for_species( + output_doc=output_doc, + species_record=species_record, + calc_keys_by_role=calc_keys, + workflow_tool_release=_arc_workflow_tool_release(output_doc), + scan_key_renames=scan_key_renames or None, + ) + if species_statmech is not None: + species_block["statmech"] = species_statmech + + return species_block, calc_keys + + def _build_ts_block( + self, + *, + output_doc: Mapping[str, Any], + ts_record: Mapping[str, Any], + ts_label: str, + reaction_multiplicity: int | None, + unmapped_smiles: str | None = None, + ) -> tuple[dict[str, Any], dict[str, str]]: + """Build one ``BundleTransitionStateIn`` dict + a calc-role → key map. + + ARC stores TS records in ``output_doc['transition_states']`` with + the same shape as species records (the TS is just a stationary + point with ``is_ts: true``); the multiplicity falls back to the + reaction's multiplicity when the TS record doesn't carry one. + + ``unmapped_smiles`` is a deterministic textual handle for the + TS, computed by :func:`_ts_unmapped_smiles_handle` at the + caller. ``None`` is the common case (TS has no Lewis structure + and no upstream textual identifier was derivable); the field is + omitted from the payload, leaving the server to store NULL. + The producer never fabricates a normal-molecule SMILES from + the TS geometry — the ``mol`` field stays absent regardless. + + IRC provenance: when ARC has ``irc_logs`` populated, emit a + ``ts_irc`` calc with ``depends_on(role=irc_start)`` pointing at + ``ts_opt`` — IRC is seeded from the optimized TS saddle, so + ``ts_opt`` is its primary geometry-producing parent (the TS freq + validates the saddle but isn't the geometry source). When the + IRC log files parse cleanly, attach a structured ``irc_result`` + with forward/reverse points and producer-declared + ``output_geometries`` (``irc_forward``/``irc_reverse`` endpoints). + Forward/reverse are ESS path-direction labels — the producer + does NOT infer reactant/product side from them. + """ + conformer_xyz_text = _require_xyz_text(ts_record) + ts_opt_key = f"ts_{_CALC_KEY_OPT}" + # ``_CALC_KEY_TS_GUESS`` already starts with ``ts_``; using it + # bare keeps the bundle key as ``ts_guess`` (don't double-prefix). 
+ ts_guess_key = _CALC_KEY_TS_GUESS + ts_freq_key = f"ts_{_CALC_KEY_FREQ}" + ts_sp_key = f"ts_{_CALC_KEY_SP}" + ts_irc_key = f"ts_{_CALC_KEY_IRC}" + ts_geom_key = "ts_geom" + + # Path-search ts_guess provenance. Emit a parent calculation + # only when the chosen TS guess is itself a real path-search + # calculation (orca_neb → method=neb, xtb_gsm → method=gsm) AND + # the producer recorded the corresponding log path + # (``neb_log`` / ``gsm_log``). Heuristics / AutoTST / KinBot / + # GCN / user-supplied guesses stay geometry-only on the + # ts_opt: ``calculation_dependency`` requires a real parent + # calculation, never a geometry. The artifact path picks up + # the log via ``_resolve_log_field`` when artifact upload is + # enabled. + ts_guess_calc: dict[str, Any] | None = None + ts_guess_method = _resolve_ts_guess_path_search( + ts_record.get("chosen_ts_method"), + ) + ts_guess_log_field = ( + _TS_GUESS_LOG_FIELD_BY_METHOD.get(ts_guess_method) + if ts_guess_method else None + ) + if ( + ts_guess_method + and ts_guess_log_field + and ts_record.get(ts_guess_log_field) + ): + try: + ts_guess_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=ts_record, + calc_key=ts_guess_key, + calc_role=_CALC_KEY_TS_GUESS, + calc_type="path_search", + # No ts_guess_level in output_doc today; fall back + # to opt_level (consistent with how irc resolves). + level_kind="opt", + ess_job_key="opt", + result_field="path_search_result", + result_payload={"method": ts_guess_method}, + depends_on=None, + tckdb_origin=None, + conformer_xyz_text=None, + ) + except ValueError as exc: + logger.warning( + "TCKDB computed-reaction: ts_guess (%s) calc skipped: %s", + ts_guess_method, exc, + ) + + ts_opt_depends_on: list[Mapping[str, Any]] | None = None + if ts_guess_calc is not None: + ts_opt_depends_on = [ + {"parent_calculation_key": ts_guess_key, + "role": "optimized_from"} + ] + + primary_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=ts_record, + calc_key=ts_opt_key, + calc_role=_CALC_KEY_OPT, + calc_type="opt", + level_kind="opt", + ess_job_key="opt", + result_field="opt_result", + result_payload=_opt_result_payload(ts_record), + depends_on=ts_opt_depends_on, + tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, + ) + + calc_keys: dict[str, str] = {_CALC_KEY_OPT: ts_opt_key} + additional: list[dict[str, Any]] = [] + # Order: ts_guess (parent) first, then freq/sp/irc descend + # from ts_opt. ``BundleTransitionStateIn`` doesn't enforce a + # geometry_key on additional calcs (unlike the species-side + # validator), so type=path_search stays bare. 
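The two-condition gate above can be read as a small pure function. The method and log-field maps below are illustrative stand-ins for `_resolve_ts_guess_path_search` and `_TS_GUESS_LOG_FIELD_BY_METHOD` (record fields follow the output.yml naming used in this patch):

LOG_FIELD_BY_METHOD = {"neb": "neb_log", "gsm": "gsm_log"}  # illustrative

def should_emit_ts_guess(ts_record: dict) -> str | None:
    """Return the path-search method when a ts_guess calc should be emitted."""
    method = {"orca_neb": "neb", "xtb_gsm": "gsm"}.get(ts_record.get("chosen_ts_method"))
    log_field = LOG_FIELD_BY_METHOD.get(method) if method else None
    if method and log_field and ts_record.get(log_field):
        return method
    return None  # heuristic / user-supplied guesses stay geometry-only

assert should_emit_ts_guess({"chosen_ts_method": "orca_neb", "neb_log": "neb.out"}) == "neb"
assert should_emit_ts_guess({"chosen_ts_method": "autotst"}) is None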
+ if ts_guess_calc is not None: + additional.append(ts_guess_calc) + calc_keys[_CALC_KEY_TS_GUESS] = ts_guess_key + + freq_result = _freq_result_payload(ts_record) + if freq_result is not None: + try: + additional.append(self._build_calc_in_bundle( + output_doc=output_doc, + species_record=ts_record, + calc_key=ts_freq_key, + calc_role=_CALC_KEY_FREQ, + calc_type="freq", + level_kind="freq", + ess_job_key="freq", + result_field="freq_result", + result_payload=freq_result, + depends_on=[{"parent_calculation_key": ts_opt_key, "role": "freq_on"}], + tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, + )) + calc_keys[_CALC_KEY_FREQ] = ts_freq_key + except ValueError as exc: + logger.warning("TCKDB computed-reaction: ts freq calc skipped: %s", exc) + + sp_result = _sp_result_payload(ts_record) + if sp_result is not None: + try: + additional.append(self._build_calc_in_bundle( + output_doc=output_doc, + species_record=ts_record, + calc_key=ts_sp_key, + calc_role=_CALC_KEY_SP, + calc_type="sp", + level_kind="sp", + ess_job_key="sp", + result_field="sp_result", + result_payload=sp_result, + depends_on=[{"parent_calculation_key": ts_opt_key, "role": "single_point_on"}], + tckdb_origin=( + _reused_origin("opt") if _sp_is_reused_from_opt(output_doc) else None + ), + conformer_xyz_text=conformer_xyz_text, + )) + calc_keys[_CALC_KEY_SP] = ts_sp_key + except ValueError as exc: + logger.warning("TCKDB computed-reaction: ts sp calc skipped: %s", exc) + + # IRC: emit a calc when ARC has irc_logs populated. Always carry + # depends_on(role=irc_start) → ts_opt, because IRC is seeded from + # the optimized TS saddle. When the logs parse cleanly, attach a + # structured irc_result with forward/reverse points and producer- + # declared output_geometries for the irc_forward/irc_reverse + # endpoints. When parsing fails we still emit the type=irc calc + # (so kinetics.source_calculations(role=irc) and the dependency + # edge can stand on their own) but omit irc_result rather than + # fabricating incomplete structured data. + if ts_record.get("irc_logs"): + try: + irc_calc = self._build_calc_in_bundle( + output_doc=output_doc, + species_record=ts_record, + calc_key=ts_irc_key, + calc_role=_CALC_KEY_IRC, + calc_type="irc", + level_kind="opt", # ARC runs IRC at the opt level + ess_job_key="opt", + result_field=None, + result_payload=None, + depends_on=[{"parent_calculation_key": ts_opt_key, "role": "irc_start"}], + tckdb_origin=None, + conformer_xyz_text=conformer_xyz_text, + ) + # IRC output geometries are derived by the server from + # ``irc_result.points``: ``_persist_irc_result`` writes a + # ``calculation_output_geometry`` row (role=irc_forward / + # irc_reverse) for every forward/reverse point. Emitting + # explicit ``output_geometries`` here would double-claim + # those geometries and the + # ``attach_calculation_output_geometries`` uniqueness check + # would 422. We therefore attach only ``irc_result`` and + # let the server own the output-geometry links. 
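A sketch of the emit-even-when-unparsed policy the comment above describes. The payload field names here (`calculation_type`, `depends_on`) are assumptions for illustration, not the exact upload-schema names:

def build_irc_calc(parsed: dict | None) -> dict:
    calc = {
        "key": "ts_irc",
        "calculation_type": "irc",
        "depends_on": [{"parent_calculation_key": "ts_opt", "role": "irc_start"}],
    }
    if parsed is not None:           # attach a result only for clean parses
        calc["irc_result"] = parsed  # never fabricate partial structured data
    return calc

assert "irc_result" not in build_irc_calc(None)   # dependency edge still stands
assert "irc_result" in build_irc_calc({"points": []})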
+ irc_parsed = self._parse_irc_trajectories(ts_record) + if irc_parsed is not None: + zero_ref = _resolve_irc_zero_energy_reference( + output_doc=output_doc, + ts_record=ts_record, + ) + irc_result = _build_irc_result_payload( + irc_parsed, + zero_energy_reference_hartree=zero_ref, + ) + if irc_result is not None: + irc_calc["irc_result"] = irc_result + additional.append(irc_calc) + calc_keys[_CALC_KEY_IRC] = ts_irc_key + except ValueError as exc: + logger.debug("TCKDB computed-reaction: ts irc calc skipped: %s", exc) + + # TS multiplicity: the TS record's own field is authoritative; + # fall back to the reaction's multiplicity when missing (some + # ARC runs only carry it on the reaction). + ts_mult = ts_record.get("multiplicity") + if ts_mult is None: + ts_mult = reaction_multiplicity + if ts_mult is None: + raise ValueError( + f"transition state {ts_label!r} has no multiplicity and the " + "reaction record has none either; cannot build TS block." + ) + + ts_block: dict[str, Any] = { + "charge": int(ts_record.get("charge", 0) or 0), + "multiplicity": int(ts_mult), + "geometry": {"key": ts_geom_key, "xyz_text": conformer_xyz_text}, + "calculation": primary_calc, + "calculations": additional, + "label": str(ts_label)[:64], + } + if unmapped_smiles: + ts_block["unmapped_smiles"] = str(unmapped_smiles) + + # TS-side AEC/BAC corrections target the TS entry directly via + # ``target_transition_state_entry_id`` server-side — never via the + # reaction entry. Anchor to ts_sp specifically; species-side SP + # keys (r0_sp / p1_sp) belong to other species and a cross-owner + # reference would 422. + applied_corrections = _build_applied_energy_corrections( + ts_record.get("applied_energy_corrections") or [], + source_calculation_key=calc_keys.get(_CALC_KEY_SP), + ) + if applied_corrections: + ts_block["applied_energy_corrections"] = applied_corrections + + return ts_block, calc_keys + + # ------------------------------------------------------------------ + # Payload construction + # ------------------------------------------------------------------ + + @staticmethod + def _species_entry_payload(record: Mapping[str, Any]) -> dict[str, Any]: + smiles = record.get("smiles") + if not smiles: + raise ValueError( + f"output.yml record for label={record.get('label')!r} has no smiles; " + "TCKDB upload requires a SMILES on the species_entry." 
+ ) + is_ts = bool(record.get("is_ts")) + return { + "molecule_kind": "molecule", + "smiles": str(smiles), + "charge": int(record.get("charge", 0) or 0), + "multiplicity": int(record.get("multiplicity", 1) or 1), + "species_entry_kind": "transition_state" if is_ts else "minimum", + } + + def _build_payload( + self, + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any], + ) -> dict[str, Any]: + species_entry = self._species_entry_payload(species_record) + geometry_payload = {"xyz_text": _require_xyz_text(species_record)} + primary, additional = self._build_calculations(output_doc, species_record) + + payload: dict[str, Any] = { + "species_entry": species_entry, + "geometry": geometry_payload, + "calculation": primary, + "scientific_origin": "computed", + } + if additional: + payload["additional_calculations"] = additional + label = species_record.get("label") + if label: + payload["label"] = str(label)[:64] + return payload + + @classmethod + def _build_calculations( + cls, + output_doc: Mapping[str, Any], + record: Mapping[str, Any], + ) -> tuple[dict[str, Any], list[dict[str, Any]]]: + """Return (primary opt calculation, [freq, sp] additional calculations). + + Additional calculations are skipped (with a warning) when their + result fields are absent or malformed, or when no level of theory + is available. Skipping an optional calc never fails the upload. + """ + primary = cls._calculation_payload( + output_doc, + record, + calc_type="opt", + level=_resolve_level(output_doc, "opt"), + ess_job_key="opt", + result_field="opt_result", + result_payload=_opt_result_payload(record), + ) + + additional: list[dict[str, Any]] = [] + freq_result = _freq_result_payload(record) + if freq_result is not None: + freq_level = _resolve_level(output_doc, "freq") + try: + additional.append( + cls._calculation_payload( + output_doc, + record, + calc_type="freq", + level=freq_level, + ess_job_key="freq", + result_field="freq_result", + result_payload=freq_result, + ) + ) + except ValueError as exc: + logger.warning( + "TCKDB freq additional calculation skipped for label=%s: %s", + record.get("label"), exc, + ) + + sp_result = _sp_result_payload(record) + if sp_result is not None: + sp_level = _resolve_level(output_doc, "sp") + sp_origin = _reused_origin("opt") if _sp_is_reused_from_opt(output_doc) else None + try: + additional.append( cls._calculation_payload( output_doc, record, @@ -808,10 +2202,14 @@ def _calculation_payload( "cannot identify the ESS for TCKDB." ) - level_of_theory: dict[str, Any] = {"method": str(method)} - basis = level.get("basis") - if basis: - level_of_theory["basis"] = str(basis) + level_of_theory = _arc_level_to_tckdb_lot(level) + if level_of_theory is None: + # method was already validated above; this is defensive against + # a future change to _arc_level_to_tckdb_lot that drops the row. + raise ValueError( + f"could not project level of theory for {calc_type} onto " + "TCKDB LevelOfTheoryRef shape." + ) software_release: dict[str, Any] = {"name": str(software_name)} ess_versions = record.get("ess_versions") @@ -880,7 +2278,8 @@ def _upload( api_key = self._config.resolve_api_key() if not api_key: msg = ( - f"TCKDB API key env var '{self._config.api_key_env}' is not set; " + f"TCKDB API key not configured (tried: " + f"{self._config.describe_api_key_sources()}); " "skipping network call and recording sidecar as failed." 
) return self._record_failure(written, msg, raised=ValueError(msg)) @@ -939,14 +2338,40 @@ def _record_failure( ) -> UploadOutcome: sc = written.sidecar sc.status = "failed" - sc.last_error = message + # Pull structured HTTP details off the exception when present + # (TCKDBClient attaches ``status_code``, ``response_json``, and + # ``response_text`` on its HTTP-error subclasses). FastAPI 422 + # bodies otherwise collapse to "HTTP 422" via the exception's + # default message, hiding the field-level rejection reason. + # Non-HTTP failures (timeouts, connection errors, etc.) lack + # these attrs and fall back to the original ``message`` — + # preserving the prior log shape for those paths. + status_code = getattr(raised, "status_code", None) + response_json = getattr(raised, "response_json", None) + response_text = getattr(raised, "response_text", None) + if response_json is not None: + sc.response_body = _summarize_response_body(response_json) + elif response_text: + sc.response_body = _summarize_response_body(response_text) + if status_code is not None: + sc.response_status_code = status_code + detail_for_message: Any = ( + response_json + if response_json is not None + else (response_text or message) + ) + sc.last_error = ( + f"HTTP {status_code}: {detail_for_message}" + if status_code is not None + else message + ) self._writer.update_sidecar(written.sidecar_path, sc) logger.warning( "TCKDB upload failed (strict=%s): %s key=%s err=%s", self._config.strict, written.payload_path, sc.idempotency_key, - message, + sc.last_error, ) if self._config.strict: raise raised @@ -955,7 +2380,7 @@ def _record_failure( payload_path=written.payload_path, sidecar_path=written.sidecar_path, idempotency_key=sc.idempotency_key, - error=message, + error=sc.last_error, ) def _make_client(self, api_key: str): @@ -978,7 +2403,8 @@ def _upload_artifact( api_key = self._config.resolve_api_key() if not api_key: msg = ( - f"TCKDB API key env var '{self._config.api_key_env}' is not set; " + f"TCKDB API key not configured (tried: " + f"{self._config.describe_api_key_sources()}); " "skipping artifact network call." ) return self._record_artifact_failure(written, msg, ValueError(msg)) @@ -1134,6 +2560,13 @@ def _resolve_level( def _opt_result_payload(record: Mapping[str, Any]) -> dict[str, Any] | None: out: dict[str, Any] = {} + # opt_converged is emitted by arc/output.py::_spc_to_dict (line 541) and + # accepted by TCKDB's OptResultPayload as ``converged: bool | None``. + # Without this mapping the calc_opt_result.converged column lands NULL + # even for known-converged species. Coerce to bool defensively — ARC + # writes True/False but the schema is strict about the type. + if record.get("opt_converged") is not None: + out["converged"] = bool(record["opt_converged"]) if record.get("opt_n_steps") is not None: out["n_steps"] = record["opt_n_steps"] if record.get("opt_final_energy_hartree") is not None: @@ -1141,6 +2574,26 @@ def _opt_result_payload(record: Mapping[str, Any]) -> dict[str, Any] | None: return out or None +def _coarse_opt_result_payload(record: Mapping[str, Any]) -> dict[str, Any] | None: + """Like :func:`_opt_result_payload` but reads the ``coarse_opt_*`` fields. + + The coarse opt is treated as ``converged=True`` whenever it ran to + completion — output.yml only records ``coarse_opt_log`` after a + successful coarse run (see ``_spc_to_dict`` line 555: ``if converged + and coarse_path``). 
A separate ``coarse_opt_converged`` field would + be slightly more honest, but ARC doesn't currently emit one and the + convention "the coarse log only exists if it converged" holds. + """ + out: dict[str, Any] = {} + # The presence of the coarse log indicates a successful coarse stage. + out["converged"] = True + if record.get("coarse_opt_n_steps") is not None: + out["n_steps"] = record["coarse_opt_n_steps"] + if record.get("coarse_opt_final_energy_hartree") is not None: + out["final_energy_hartree"] = record["coarse_opt_final_energy_hartree"] + return out + + _FREQ_FIELD_SPECS = ( # (record_key, payload_key, coerce) ("freq_n_imag", "n_imag", int), @@ -1184,15 +2637,189 @@ def _sp_result_payload(record: Mapping[str, Any]) -> dict[str, Any] | None: energy = record.get(record_key) if energy is None: return None - try: - return {"electronic_energy_hartree": float(energy)} - except (TypeError, ValueError) as exc: - logger.warning( - "TCKDB sp additional calculation skipped for label=%s: " - "malformed %s=%r (%s)", - record.get("label"), record_key, energy, exc, - ) + try: + return {"electronic_energy_hartree": float(energy)} + except (TypeError, ValueError) as exc: + logger.warning( + "TCKDB sp additional calculation skipped for label=%s: " + "malformed %s=%r (%s)", + record.get("label"), record_key, energy, exc, + ) + return None + + +_APPLIED_CORRECTION_COMPONENT_FIELDS = ( + "component_kind", "key", "multiplicity", "parameter_value", "contribution_value", +) + +# TCKDB's LevelOfTheoryRef primary-key set — used by _level_keys_match +# for conservative LoT equality. Note the names here are TCKDB's +# (post-translation), not ARC's (output.yml writes `auxiliary_basis` / +# `cabs`); _level_keys_match compares projected dicts. +_TCKDB_LOT_REF_FIELDS = ("method", "basis", "aux_basis", "cabs_basis") + +# Field-name translation from ARC's Level (output.yml shape) to TCKDB's +# LevelOfTheoryRef. ARC's Level.as_dict() emits `auxiliary_basis` / `cabs` / +# `solvation_method`; TCKDB's LoT uses `aux_basis` / `cabs_basis` / +# `solvent_model`. `software` / `software_version` belong on +# `software_release`, not the LoT, and are intentionally dropped here. +# `method_type` / `year` / `solvation_scheme_level` / `compatible_ess` have +# no TCKDB LoT counterpart and are also dropped. +_ARC_TO_TCKDB_LOT_FIELDS = { + "method": "method", + "basis": "basis", + "auxiliary_basis": "aux_basis", + "cabs": "cabs_basis", + "dispersion": "dispersion", + "solvent": "solvent", + "solvation_method": "solvent_model", +} + + +def _arc_args_to_keywords(args: Any) -> str | None: + """Flatten ARC's nested ``args`` dict to TCKDB's flat ``keywords`` string. + + ARC stores ESS/runtime options as a nested mapping, commonly with + categories such as ``keyword`` and ``block``. TCKDB stores the projected + level-of-theory options in a single deterministic string that participates + in ``lot_hash`` deduplication. + + The serialization is intentionally category-prefixed and sorted so that + equivalent dictionaries produce identical strings regardless of insertion + order. 
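The determinism claim above is easy to exercise in isolation. A simplified re-implementation of the category-prefixed, sorted serialization (not the patch's exact code):

import json

def flatten(args: dict) -> str | None:
    parts = []
    for category in sorted(args):
        entries = args[category]
        if not isinstance(entries, dict) or not entries:
            continue
        for key in sorted(entries):
            if entries[key] is None:
                continue
            parts.append(
                f"{category}:{key}="
                + json.dumps(entries[key], sort_keys=True, separators=(",", ":"))
            )
    return "; ".join(parts) or None

a = {"keyword": {"scf": "tight", "int": "grid=ultrafine"}, "block": {}}
b = {"block": {}, "keyword": {"int": "grid=ultrafine", "scf": "tight"}}
assert flatten(a) == flatten(b) == 'keyword:int="grid=ultrafine"; keyword:scf="tight"'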
+ """ + if not isinstance(args, Mapping): + return None + + parts: list[str] = [] + + for category in sorted(args): + entries = args.get(category) + if not isinstance(entries, Mapping) or not entries: + continue + + for key in sorted(entries): + value = entries[key] + if value is None: + continue + + value_text = json.dumps( + value, + sort_keys=True, + separators=(",", ":"), + ensure_ascii=False, + ) + parts.append(f"{category}:{key}={value_text}") + + if not parts: + return None + + return "; ".join(parts) + + +def _arc_level_to_tckdb_lot(level: Mapping[str, Any] | None) -> dict[str, Any] | None: + """Project ARC's per-job level dict (output.yml shape) onto TCKDB's + ``LevelOfTheoryRef`` shape, applying field-name translation and + flattening ``args`` into ``keywords``. + + Returns ``None`` if ``level`` is missing or has no ``method`` — + callers decide whether to error or skip. + """ + if not isinstance(level, Mapping): + return None + if not level.get("method"): return None + out: dict[str, Any] = {} + for src, dst in _ARC_TO_TCKDB_LOT_FIELDS.items(): + v = level.get(src) + if v: + out[dst] = str(v) + keywords = _arc_args_to_keywords(level.get("args")) + if keywords: + out["keywords"] = keywords + return out + + +def _scheme_level_of_theory(scheme: Mapping[str, Any]) -> dict[str, Any] | None: + """Project ARC's per-species scheme.level_of_theory dict onto TCKDB's + ``LevelOfTheoryRef`` shape. The scheme dict comes from the same + ``_level_to_dict(arkane_level_of_theory)`` producer as opt/freq/sp + levels, so the same field-name translation applies.""" + return _arc_level_to_tckdb_lot(scheme.get("level_of_theory")) + + +def _build_applied_energy_corrections( + applied_records: Any, + *, + source_calculation_key: str | None = None, +) -> list[dict[str, Any]]: + """Translate ``output.yml`` per-species ``applied_energy_corrections`` + into the TCKDB ``AppliedEnergyCorrectionUploadPayload`` shape. + + The output.yml shape is already very close to the upload schema (same + ``application_role`` / ``value`` / ``value_unit`` / ``scheme`` fields), + so this is mostly a passthrough plus three small adaptations: + + 1. Drop output.yml-only fields from components (e.g. ``parameter_unit``) + — TCKDB's ``AppliedCorrectionComponentPayload`` rejects unknowns. + 2. Drop component rows whose ``parameter_value`` is null — the upload + schema requires a real number, and the producer marks them null + precisely when reconstruction wasn't reliable. + 3. Attach ``source_calculation_key`` when the caller has resolved it + to a real SP key in this bundle. AEC and BAC are corrections to the + electronic-energy reference, so SP is the right anchor; the field + is omitted when no key is supplied rather than guessed. + + The caller resolves the SP key against the relevant namespace — + bundle-global (``"sp"``) for computed-species, scoped (``"r0_sp"`` / + ``"p0_sp"`` / ``"ts_sp"``) for computed-reaction. The helper does + not know about modes; it just takes the resolved key (or ``None``) + and stamps it on every emitted entry. 
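A worked input-to-output example of the three adaptations above, calling the patch's own helper (assumes the module is importable; field values are invented):

record = {
    "application_role": "aec",
    "value": 1.23,
    "value_unit": "kJ/mol",
    "scheme": {"name": "demo-scheme", "level_of_theory": {"method": "wb97xd"}},
    "components": [
        {"component_kind": "atom", "key": "H", "multiplicity": 2,
         "parameter_value": 0.5, "contribution_value": 1.0,
         "parameter_unit": "hartree"},                         # field is dropped
        {"component_kind": "atom", "key": "C",
         "parameter_value": None, "contribution_value": 2.0},  # row is dropped
    ],
}
out = _build_applied_energy_corrections([record], source_calculation_key="r0_sp")
assert out[0]["source_calculation_key"] == "r0_sp"
assert len(out[0]["components"]) == 1
assert "parameter_unit" not in out[0]["components"][0]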
+ """ + if not isinstance(applied_records, list): + return [] + + out: list[dict[str, Any]] = [] + for rec in applied_records: + if not isinstance(rec, Mapping): + continue + if rec.get("value") is None or rec.get("scheme") is None: + continue + + components_in = rec.get("components") or [] + components_out: list[dict[str, Any]] = [] + for c in components_in: + if not isinstance(c, Mapping): + continue + if c.get("parameter_value") is None or c.get("contribution_value") is None: + continue + components_out.append({ + k: c[k] for k in _APPLIED_CORRECTION_COMPONENT_FIELDS if k in c + }) + + scheme_in = rec["scheme"] + scheme_out = {k: scheme_in[k] for k in scheme_in if k != "level_of_theory"} + lot_ref = _scheme_level_of_theory(scheme_in) + if lot_ref is not None: + scheme_out["level_of_theory"] = lot_ref + + payload: dict[str, Any] = { + "application_role": rec["application_role"], + "value": float(rec["value"]), + "value_unit": rec["value_unit"], + "scheme": scheme_out, + "components": components_out, + } + if source_calculation_key is not None: + payload["source_calculation_key"] = source_calculation_key + + note = rec.get("note") + if note is not None: + payload["note"] = note + + out.append(payload) + + return out def _build_thermo_block( @@ -1217,7 +2844,7 @@ def _build_thermo_block( nasa_low.tmin_k → nasa.t_low nasa_low.tmax_k → nasa.t_mid (cross-checked vs nasa_high.tmin_k) nasa_high.tmax_k → nasa.t_high - cp_data → points (per-point validation; bad points dropped) + thermo_points → points (per-point validation; bad points dropped) ``source_calculations`` are populated from ``included_calc_keys``: each of ``opt`` / ``freq`` / ``sp`` that actually made it into the @@ -1255,20 +2882,24 @@ def _build_thermo_block( if nasa is not None: block["nasa"] = nasa - points = _build_thermo_points(thermo_record.get("cp_data")) + points = _build_thermo_points(thermo_record.get("thermo_points") or thermo_record.get("cp_data")) if points: block["points"] = points - # ThermoCalculationRole accepts opt/freq/sp/composite/imported. We - # link freq and sp when present (they're the calcs that physically - # produced the thermo); we also link opt if it's the only calc in - # the bundle (unusual, but covers thermo-from-opt-only edge cases). + # Thermo provenance: link every calculation that actually contributed + # to the thermo result, not just the "physically produced" ones. + # For ARC's standard opt/freq/sp pipeline that means: + # opt — geometry that freq + sp were run on + # freq — vibrational modes / ZPE / thermal corrections + # sp — electronic-energy reference for H298 + # The links should be self-sufficient; we don't want consumers to have + # to traverse `calculation_dependency` to recover the opt calc. + # Order is fixed (opt, freq, sp) so payloads are deterministic — same + # inputs hash to the same idempotency key across runs. sources: list[dict[str, str]] = [] - for key in (_CALC_KEY_FREQ, _CALC_KEY_SP): + for key in (_CALC_KEY_OPT, _CALC_KEY_FREQ, _CALC_KEY_SP): if key in included_calc_keys: sources.append({"calculation_key": key, "role": key}) - if not sources and _CALC_KEY_OPT in included_calc_keys: - sources.append({"calculation_key": _CALC_KEY_OPT, "role": _CALC_KEY_OPT}) if sources: block["source_calculations"] = sources @@ -1344,44 +2975,46 @@ def _build_nasa_block( return block -def _build_thermo_points(cp_data: Any) -> list[dict[str, Any]]: - """Map ARC's ``cp_data`` list to ``ThermoPointCreate`` dicts. 
+def _build_thermo_points(thermo_points: Any) -> list[dict[str, Any]]: + """Map ARC's ``thermo_points`` list to ``ThermoPointCreate`` dicts. - Each entry must carry ``temperature_k``; other fields are optional. - Malformed individual points (missing/non-numeric temperature, or a - Cp value that won't coerce) are dropped with a warning so a single - bad row doesn't take out the whole thermo upload. + Each entry must carry ``temperature_k``; ``cp_j_mol_k``, ``h_kj_mol``, + ``s_j_mol_k``, and ``g_kj_mol`` are optional and forwarded when present + and numeric. Malformed individual points (missing/non-numeric + temperature, or a per-quantity value that won't coerce) are dropped + with a warning so a single bad row doesn't take out the whole + thermo upload. """ - if not isinstance(cp_data, list): + if not isinstance(thermo_points, list): return [] seen_temps: set[float] = set() points: list[dict[str, Any]] = [] - for i, raw in enumerate(cp_data): + for i, raw in enumerate(thermo_points): if not isinstance(raw, Mapping): - logger.warning("TCKDB thermo: cp_data[%d] skipped — not a mapping.", i) + logger.warning("TCKDB thermo: thermo_points[%d] skipped — not a mapping.", i) continue t = raw.get("temperature_k") if t is None: - logger.warning("TCKDB thermo: cp_data[%d] skipped — missing temperature_k.", i) + logger.warning("TCKDB thermo: thermo_points[%d] skipped — missing temperature_k.", i) continue try: t_f = float(t) except (TypeError, ValueError) as exc: logger.warning( - "TCKDB thermo: cp_data[%d] skipped — non-numeric temperature_k=%r (%s).", + "TCKDB thermo: thermo_points[%d] skipped — non-numeric temperature_k=%r (%s).", i, t, exc, ) continue if t_f <= 0: logger.warning( - "TCKDB thermo: cp_data[%d] skipped — temperature_k must be > 0 (got %s).", + "TCKDB thermo: thermo_points[%d] skipped — temperature_k must be > 0 (got %s).", i, t_f, ) continue if t_f in seen_temps: # Server enforces uniqueness by temperature_k; skip duplicates here. logger.warning( - "TCKDB thermo: cp_data[%d] skipped — duplicate temperature_k=%s.", + "TCKDB thermo: thermo_points[%d] skipped — duplicate temperature_k=%s.", i, t_f, ) continue @@ -1400,22 +3033,1218 @@ def _build_thermo_points(cp_data: Any) -> list[dict[str, Any]]: point[dst_key] = float(v) except (TypeError, ValueError) as exc: logger.warning( - "TCKDB thermo: cp_data[%d].%s dropped — non-numeric %r (%s).", + "TCKDB thermo: thermo_points[%d].%s dropped — non-numeric %r (%s).", i, src_key, v, exc, ) points.append(point) return points +_KEY_PART_RE = re.compile(r"[^A-Za-z0-9]+") + + +def _safe_key_part(label: str | None) -> str: + """Sanitize a label for use as part of a bundle local key. + + Bundle keys ride into ``GeometryIn.key`` / ``CalculationIn.key`` + where the schema only requires ``min_length=1``, but downstream + consumers (and the idempotency-key sanitizer) prefer + ``[A-Za-z0-9._:-]``. We keep alphanumerics, drop everything else, + cap to 32 chars, and fall back to ``"x"`` for an all-junk label so + we never produce an empty key segment. + """ + if not label: + return "x" + cleaned = _KEY_PART_RE.sub("", str(label))[:32] + return cleaned or "x" + + +def _local_key_for_actor(prefix: str, index: int, label: str | None) -> str: + """Build a deterministic per-actor *species* key like ``"r0_CHO"`` / ``"p1_CH3"``. + + The numeric ``index`` is what guarantees uniqueness across actors + that happen to share a chemical label (e.g. H + H ⇌ H2 has two + ``r*_H`` slots). 
The sanitized label tail is purely informational — + a human reading the JSON should be able to tell ``r0_CHO`` from + ``r1_CH4`` without cross-referencing. Calc keys derive from + :func:`_calc_prefix_for_actor` instead, which omits the label so + calc keys stay short (e.g. ``r0_opt`` not ``r0_CHO_opt``). + """ + return f"{prefix}{index}_{_safe_key_part(label)}" + + +def _calc_prefix_for_actor(prefix: str, index: int) -> str: + """Build the per-actor calc-key prefix (e.g. ``"r0"`` / ``"p1"``). + + The chemical label is intentionally omitted: calculation keys ride + into ``kinetics.source_calculations`` and are referenced verbatim, + so shorter is better as long as uniqueness is preserved (the + role-letter + index combo guarantees it). + """ + return f"{prefix}{index}" + + +def _index_species(output_doc: Mapping[str, Any]) -> dict[str, Mapping[str, Any]]: + """Build a label → species record map from ``output_doc['species']``. + + Later occurrences win on collision — that matches the implicit + contract of ARC's output.yml (one record per label) but lets the + builder fail loudly when a record is genuinely missing rather than + silently picking a stale duplicate. + """ + out: dict[str, Mapping[str, Any]] = {} + for record in output_doc.get("species") or []: + if not isinstance(record, Mapping): + continue + label = record.get("label") or record.get("original_label") + if label: + out[str(label)] = record + return out + + +def _index_transition_states( + output_doc: Mapping[str, Any], +) -> dict[str, Mapping[str, Any]]: + """Build a ts-label → record map from ``output_doc['transition_states']``.""" + out: dict[str, Mapping[str, Any]] = {} + for record in output_doc.get("transition_states") or []: + if not isinstance(record, Mapping): + continue + label = record.get("label") or record.get("original_label") + if label: + out[str(label)] = record + return out + + +def _arc_workflow_tool_release( + output_doc: Mapping[str, Any], +) -> dict[str, Any] | None: + """Build the ARC ``WorkflowToolReleaseRef``-shaped dict, or ``None``. + + Single source of truth for the ARC release identity used in the + bundle. Returns ``None`` when neither version nor git commit is + available (rare — usually at least one is set). + """ + arc_version = output_doc.get("arc_version") + arc_git_commit = output_doc.get("arc_git_commit") + if not (arc_version or arc_git_commit): + return None + wt: dict[str, Any] = {"name": "ARC"} + if arc_version: + wt["version"] = str(arc_version) + if arc_git_commit: + wt["git_commit"] = str(arc_git_commit) + return wt + + +# StatmechCalculationRole values that ARC's three-stage opt/freq/sp +# workflow can declare. Order is fixed (opt → freq → sp) so the +# emitted source_calculations list is byte-stable across runs of the +# same content. ``opt_coarse`` is intentionally excluded — it's an +# intermediate optimization stage, not a statmech input (the schema's +# StatmechCalculationRole enum has no ``opt_coarse`` value, and the +# task spec is explicit that opt_coarse is not a statmech source). +_STATMECH_CALC_ROLES: tuple[tuple[str, str], ...] = ( + (_CALC_KEY_OPT, "opt"), + (_CALC_KEY_FREQ, "freq"), + (_CALC_KEY_SP, "sp"), +) + + +def _build_freq_scale_factor_ref( + output_doc: Mapping[str, Any], + *, + workflow_tool_release: Mapping[str, Any] | None, +) -> dict[str, Any] | None: + """Build a ``FreqScaleFactorRef``-shaped dict, or ``None``. + + Maps ARC run-level frequency-scale metadata to the unified TCKDB + reference shape. 
Returns ``None`` when ARC didn't apply a scale + factor for this run, or when the level of theory's ``method`` + isn't available — ``LevelOfTheoryRef.method`` is required by the + schema, and emitting a ref with a missing method would fail + server-side validation. + + Source-attribution policy (from + ``FreqScaleFactorRef`` docstring + ARC's audit): + + * ``value`` ← ``output_doc['freq_scale_factor']`` + * ``level_of_theory`` ← ``freq_level`` (preferred — actual freq LOT), + falling back to ``arkane_level_of_theory`` when freq_level is + absent (common single-LOT runs). + * ``software`` ← ``freq_level.software`` (or fallback chain to opt). + * ``scale_kind`` ← ``"fundamental"``. ARC doesn't distinguish + ZPE/enthalpy/entropy/Cp scale factors today; ``fundamental`` is + the ``FrequencyScaleKind`` default. + * ``note`` ← ``freq_scale_factor_source`` when present (it's a bare + citation/URL string, not structured literature). + * ``source_literature`` ← always ``None``. The schema explicitly + forbids synthesizing literature rows from raw citation strings. + * ``workflow_tool_release`` ← ARC release **only when ARC's + curated data file was the proximate source** (i.e., the source + string is non-null). When the user supplied the factor directly, + omit ``workflow_tool_release`` — claiming ARC's release would + fork the dedupe identity tuple ``(level, software, scale_kind, + value, source_literature, workflow_tool_release)`` and create + duplicate registry rows. + """ + value = output_doc.get("freq_scale_factor") + if value is None: + return None + try: + value_f = float(value) + except (TypeError, ValueError): + logger.warning( + "TCKDB statmech: malformed freq_scale_factor=%r; omitting FSF ref.", + value, + ) + return None + if value_f <= 0: + logger.warning( + "TCKDB statmech: freq_scale_factor=%r is non-positive; omitting " + "FSF ref (server requires gt 0).", value, + ) + return None + + # Prefer freq_level, fall back to arkane_level_of_theory. Don't + # fall further back to opt_level — opt and freq levels are + # genuinely independent in some workflows, and an FSF mislabeled + # against opt_level would dedupe incorrectly. + level_source = output_doc.get("freq_level") or output_doc.get("arkane_level_of_theory") + level_of_theory = _arc_level_to_tckdb_lot(level_source) + if level_of_theory is None: + logger.debug( + "TCKDB statmech: freq/arkane level missing or has no 'method'; " + "cannot build FreqScaleFactorRef.level_of_theory.", + ) + return None + + ref: dict[str, Any] = { + "level_of_theory": level_of_theory, + "scale_kind": "fundamental", + "value": value_f, + } + + # Software comes from freq_level when present; otherwise fall back + # to opt_level's software (matches the existing ess_versions + # fallback in _calculation_payload — opt and freq usually share an + # ESS in practice). + freq_software = ( + (level_source.get("software") if isinstance(level_source, Mapping) else None) + or _opt_level_software(output_doc) + ) + if freq_software: + ref["software"] = {"name": str(freq_software)} + + source_string = output_doc.get("freq_scale_factor_source") + if source_string: + # Bare citation string lands in note. Never synthesized into + # source_literature. + ref["note"] = str(source_string) + # Only tag ARC as the proximate source when our data file was + # actually the lookup origin (source_string non-null implies + # _resolve_freq_scale_factor_source matched a row in + # data/freq_scale_factors.yml). 
User-supplied factors leave + # workflow_tool_release null to avoid forking the registry. + if workflow_tool_release is not None: + ref["workflow_tool_release"] = dict(workflow_tool_release) + + return ref + + +def _opt_level_software(output_doc: Mapping[str, Any]) -> str | None: + opt_level = output_doc.get("opt_level") + if isinstance(opt_level, Mapping): + sw = opt_level.get("software") + if sw: + return str(sw) + return None + + +# TCKDB ``RigidRotorKind`` enum values (live as of this writing — see +# ``app/db/models/common.py``). ARC's ``_statmech_to_dict`` currently +# only ever emits ``atom`` / ``linear`` / ``asymmetric_top``; the other +# two are listed for forward compatibility if ARC adds the analysis. +_TCKDB_RIGID_ROTOR_KINDS = frozenset({ + "atom", "linear", "spherical_top", "symmetric_top", "asymmetric_top", +}) + +# TCKDB ``TorsionTreatmentKind`` values that ARC produces today. The +# remaining values (``rigid_top``, ``hindered_rotor_dos``) aren't in +# ARC's lexicon, so they're silently dropped per spec rather than +# guessed at. +_ARC_TO_TCKDB_TORSION_TREATMENTS = frozenset({"free_rotor", "hindered_rotor"}) + + +def _build_statmech_block_for_species( + *, + output_doc: Mapping[str, Any], + species_record: Mapping[str, Any] | None = None, + calc_keys_by_role: Mapping[str, str], + workflow_tool_release: Mapping[str, Any] | None, + scan_key_renames: Mapping[str, str] | None = None, +) -> dict[str, Any] | None: + """Build a ``StatmechInBundle``-/``BundleStatmechIn``-shaped dict, or ``None``. + + Pulls per-species statmech metadata from + ``species_record['statmech']`` (the dict that ``arc/output.py:: + _statmech_to_dict`` writes into ``output.yml``) and projects it onto + the upload schema. The frequency-scale-factor handling is unchanged. + + Both bundle endpoints (``StatmechInBundle`` for computed-species, + ``BundleStatmechIn`` for computed-reaction per-species) now accept + the same field set: ``external_symmetry``, ``is_linear``, + ``rigid_rotor_kind``, ``statmech_treatment``, ``point_group``, + ``freq_scale_factor``, ``torsions``, and ``source_calculations``. + Mode-specific filtering is therefore a no-op; the only thing the + caller varies is the calc-key namespace (unscoped for computed- + species, ``r0_*``/``p0_*``/``ts_*`` for computed-reaction). + + ``calc_keys_by_role`` is the role-to-bundle-local-key map for the + *owning* species block. Computed-species passes the unscoped keys + (``{"opt": "opt", ...}``); computed-reaction passes the species- + scoped equivalents (``{"opt": "r0_opt", ...}``). The helper does + not synthesize keys — it only writes a source_calculation entry per + role that the caller declared. + + Returns ``None`` when no field survives filtering — emitting an + empty statmech container would just create a useless server-side + row. Per the project convention, no empty containers. 
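The no-empty-containers gate, distilled into a standalone sketch (a hypothetical mini-helper, not the patch's function):

def gated_block(fields: dict, sources: list) -> dict | None:
    block = {k: v for k, v in fields.items() if v is not None}
    if block and sources:  # provenance only rides on substantive fields
        block["source_calculations"] = sources
    return block or None

srcs = [{"calculation_key": "r0_opt", "role": "opt"}]
assert gated_block({}, srcs) is None                # no empty containers
assert gated_block({"is_linear": False}, srcs)["is_linear"] is False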
+ """ + block: dict[str, Any] = {} + + fsf_ref = _build_freq_scale_factor_ref( + output_doc, workflow_tool_release=workflow_tool_release, + ) + if fsf_ref is not None: + block["freq_scale_factor"] = fsf_ref + + statmech_input = ( + species_record.get("statmech") if isinstance(species_record, Mapping) else None + ) + if isinstance(statmech_input, Mapping): + external_symmetry = statmech_input.get("external_symmetry") + if isinstance(external_symmetry, int) and external_symmetry >= 1: + block["external_symmetry"] = external_symmetry + + is_linear = statmech_input.get("is_linear") + if isinstance(is_linear, bool): + block["is_linear"] = is_linear + + rotor_kind = statmech_input.get("rigid_rotor_kind") + if isinstance(rotor_kind, str) and rotor_kind in _TCKDB_RIGID_ROTOR_KINDS: + block["rigid_rotor_kind"] = rotor_kind + + torsions_input = statmech_input.get("torsions") + treatment = _classify_statmech_treatment(torsions_input) + if treatment is not None: + block["statmech_treatment"] = treatment + + slim_torsions = _build_slim_torsions( + torsions_input, scan_key_renames=scan_key_renames, + ) + if slim_torsions: + block["torsions"] = slim_torsions + + point_group = statmech_input.get("point_group") + if isinstance(point_group, str) and point_group.strip(): + block["point_group"] = point_group.strip() + + # Source calculations are provenance for actual statmech metadata, + # not statmech metadata in themselves — emitting them on an + # otherwise-empty block produces a "useless container" that the + # project convention forbids. Only attach them when the block + # already carries at least one substantive field (FSF or any + # species-derived field), which preserves the prior behavior of + # omitting the whole block on FSF-less runs that lack a statmech + # subdict. The keys are taken straight from ``calc_keys_by_role``; + # the caller is responsible for passing the correct namespace + # (unscoped for computed-species, ``r0_*``/``p0_*`` for computed- + # reaction species blocks) so the server-side ownership check sees + # only calculations owned by the same species entry. + if block: + sources = _build_statmech_source_calculations( + calc_keys_by_role=calc_keys_by_role, + ) + if sources: + block["source_calculations"] = sources + + return block or None + + +def _classify_statmech_treatment( + torsions: Any, +) -> str | None: + """Map ARC's torsion list to a TCKDB ``StatmechTreatmentKind`` value. + + Rules: + + * ``None`` torsions input (no statmech subdict on the species + record) → ``None``: ARC didn't emit a statmech evaluation, so the + treatment is genuinely unknown. Don't fabricate one. + * Empty list (statmech ran, no successful rotors) → ``"rrho"``. + * ≥1 1D rotor (each entry's ``atom_indices`` is a flat 4-int list) + and no ND → ``"rrho_1d"``. + * ≥1 ND rotor (entry's ``atom_indices`` is a list of 4-int lists) + and no 1D → ``"rrho_nd"``. + * Mix of 1D and ND → ``"rrho_1d_nd"``. + * Anything we can't classify confidently → ``None`` (omitted). + + The ``rrho_ad``/``rrao`` enum values are reserved for treatments + ARC doesn't currently produce; we never emit them. 
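Worked classifications for the rules above, exercising the patch's own helper (torsion shapes follow output.yml; assumes the module is importable):

assert _classify_statmech_treatment(None) is None   # no statmech evaluation
assert _classify_statmech_treatment([]) == "rrho"   # ran, zero rotors
one_d = [{"atom_indices": [1, 2, 3, 4]}]
n_d = [{"atom_indices": [[1, 2, 3, 4], [2, 3, 4, 5]]}]
assert _classify_statmech_treatment(one_d) == "rrho_1d"
assert _classify_statmech_treatment(n_d) == "rrho_nd"
assert _classify_statmech_treatment(one_d + n_d) == "rrho_1d_nd"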
+ """ + if torsions is None: + return None + if not isinstance(torsions, list): + return None + if not torsions: + return "rrho" + has_1d = False + has_nd = False + for t in torsions: + if not isinstance(t, Mapping): + return None + atom_indices = t.get("atom_indices") + if ( + isinstance(atom_indices, list) + and len(atom_indices) == 4 + and all(isinstance(x, int) for x in atom_indices) + ): + has_1d = True + elif ( + isinstance(atom_indices, list) + and atom_indices + and all(isinstance(x, list) for x in atom_indices) + ): + has_nd = True + else: + return None + if has_1d and has_nd: + return "rrho_1d_nd" + if has_nd: + return "rrho_nd" + if has_1d: + return "rrho_1d" + return None + + +def _build_slim_torsions( + torsions: Any, + *, + scan_key_renames: Mapping[str, str] | None = None, +) -> list[dict[str, Any]]: + """Build ``BundleStatmechTorsionIn`` entries, with coordinate quartets when available. + + ARC's per-rotor dict (built by ``_get_torsions`` in ``arc/output.py``) + carries: + + * ``symmetry_number`` (int) + * ``treatment`` (``"hindered_rotor"`` / ``"free_rotor"``) + * ``atom_indices`` (1-based 4-int dihedral defining atom quartet, + or for ND scans a list of 4-int lists) + * ``pivot_atoms`` (1-based 2-int axis — bundle has no column) + * ``barrier_kj_mol`` (fitted barrier — bundle has no column) + + The bundle ``StatmechTorsionInBundle`` schema accepts: + + * ``torsion_index`` (1-based, allocated by emission order) + * ``symmetry_number`` + * ``treatment_kind`` + * ``dimension`` (default 1) + * ``coordinates`` (list of ``StatmechTorsionCoordinateIn`` — + each with ``coordinate_index`` plus ``atom1_index``..``atom4_index``, + all 1-based; the four atoms must be distinct) + * ``source_scan_calculation_key`` (optional, must resolve to a + bundle-local calc of type ``scan`` — deferred until ARC emits + scan calcs) + + Coordinate emission rules: + + * 1D rotors with a flat 4-int ``atom_indices`` → emit one + coordinate, ``dimension=1``. + * ND rotors with a list-of-lists ``atom_indices`` → emit one + coordinate per inner list, ``dimension=N``. The bundle schema + requires ``len(coordinates) == dimension`` and contiguous + ``coordinate_index`` values 1..N. + * Missing or malformed ``atom_indices`` → log a warning and emit + the summary fields only (no ``coordinates``, no ``dimension`` + override). Producers must never fabricate atom quartets. + + Rotors whose treatment isn't a recognized TCKDB value (e.g. ARC + might add new types in the future) are omitted entirely rather than + emitted with a missing treatment_kind — the latter would produce a + torsion entry that's effectively meaningless to consumers. + + ``pivot_atoms`` and ``barrier_kj_mol`` are deliberately not emitted: + the bundle schema rejects ``pivot_atoms`` and has no destination + column for the fitted barrier. Both stay out of the payload. 
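An input-to-output example of the emission rules above, again calling the patch's helper directly (entries follow the output.yml rotor shape; values invented):

torsions = [
    {"treatment": "hindered_rotor", "symmetry_number": 3,
     "atom_indices": [1, 2, 3, 4], "pivot_atoms": [2, 3]},            # pivot dropped
    {"treatment": "free_rotor",
     "atom_indices": [[1, 2, 3, 4], [2, 3, 4, 5]]},                   # 2D rotor
    {"treatment": "some_future_kind", "atom_indices": [1, 2, 3, 4]},  # omitted
]
out = _build_slim_torsions(torsions)
assert [t["torsion_index"] for t in out] == [1, 2]
assert out[0]["dimension"] == 1 and "pivot_atoms" not in out[0]
assert out[1]["dimension"] == 2 and len(out[1]["coordinates"]) == 2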
+ """ + if not isinstance(torsions, list): + return [] + out: list[dict[str, Any]] = [] + next_index = 1 + for entry in torsions: + if not isinstance(entry, Mapping): + continue + treatment = entry.get("treatment") + if treatment not in _ARC_TO_TCKDB_TORSION_TREATMENTS: + continue + slim: dict[str, Any] = { + "torsion_index": next_index, + "treatment_kind": treatment, + } + sym = entry.get("symmetry_number") + if isinstance(sym, int) and sym >= 1: + slim["symmetry_number"] = sym + + coordinates = _coerce_torsion_coordinates(entry.get("atom_indices")) + if coordinates is not None: + slim["dimension"] = len(coordinates) + slim["coordinates"] = coordinates + else: + atom_indices = entry.get("atom_indices") + if atom_indices is not None: + logger.warning( + "TCKDB statmech: torsion #%d has unusable atom_indices=%r; " + "emitting torsion summary without coordinates.", + next_index, atom_indices, + ) + # Link the torsion to its underlying scan calc when ARC provided a + # bundle-local key. Computed-reaction bundles namespace scan calcs + # per-species (``r0_scan_rotor_0``, etc.) because the server + # enforces global calc-key uniqueness across the whole bundle — + # the caller passes a ``scan_key_renames`` map so the torsion's + # reference matches the namespaced calc key. Computed-species + # bundles have a single species and pass ``None``: the original + # un-prefixed key is already unique. + scan_key = entry.get("source_scan_calculation_key") + if isinstance(scan_key, str) and scan_key: + if scan_key_renames is not None: + scan_key = scan_key_renames.get(scan_key, scan_key) + slim["source_scan_calculation_key"] = scan_key + out.append(slim) + next_index += 1 + return out + + +def _coerce_torsion_coordinates( + atom_indices: Any, +) -> list[dict[str, int]] | None: + """Validate and project ARC's atom_indices into TCKDB coordinate dicts. + + Returns the coordinate list when ``atom_indices`` is well-formed: + + * Flat list of 4 distinct positive ints → one coordinate. + * List of N flat-4-int lists (each distinct, 4 atoms each) → N + coordinates with ``coordinate_index`` running 1..N. + + Returns ``None`` when the input is missing, malformed, contains + non-positive integers, or has duplicate atoms within a quartet — + callers fall back to a coordinate-less torsion summary so a single + bad rotor never blocks the whole upload. The 4-distinct-atoms check + mirrors the server-side ``StatmechTorsionCoordinateIn`` validator; + failing here gives a clearer producer-side message than letting the + server 422. + """ + if atom_indices is None: + return None + if not isinstance(atom_indices, list) or not atom_indices: + return None + # 1D shape: a flat list of exactly 4 ints. + if all(isinstance(x, int) for x in atom_indices): + if len(atom_indices) != 4: + return None + if not all(x >= 1 for x in atom_indices): + return None + if len(set(atom_indices)) != 4: + return None + a1, a2, a3, a4 = atom_indices + return [{ + "coordinate_index": 1, + "atom1_index": a1, "atom2_index": a2, + "atom3_index": a3, "atom4_index": a4, + }] + # ND shape: a list of 4-int sub-lists. 
+    if all(isinstance(x, list) for x in atom_indices):
+        coords: list[dict[str, int]] = []
+        for i, quartet in enumerate(atom_indices, start=1):
+            if not all(isinstance(x, int) for x in quartet):
+                return None
+            if len(quartet) != 4:
+                return None
+            if not all(x >= 1 for x in quartet):
+                return None
+            if len(set(quartet)) != 4:
+                return None
+            a1, a2, a3, a4 = quartet
+            coords.append({
+                "coordinate_index": i,
+                "atom1_index": a1, "atom2_index": a2,
+                "atom3_index": a3, "atom4_index": a4,
+            })
+        return coords or None
+    return None
+
+
+def _build_statmech_source_calculations(
+    *,
+    calc_keys_by_role: Mapping[str, str],
+) -> list[dict[str, Any]]:
+    """Compose statmech ``source_calculations`` from the bundle's emitted calcs.
+
+    Walks ``_STATMECH_CALC_ROLES`` in order (opt → freq → sp) and emits
+    one entry per role whose key is present in the caller-supplied
+    ``calc_keys_by_role`` map. ``opt_coarse`` is excluded by design —
+    it's not a ``StatmechCalculationRole`` enum value.
+
+    The role keys (``"opt"`` / ``"freq"`` / ``"sp"``) are the
+    ``StatmechCalculationRole`` enum strings. Values are the bundle-local
+    calculation keys the server should resolve. Both computed-species
+    and computed-reaction call this with their own scoped values; this
+    helper is mode-agnostic.
+    """
+    return [
+        {"calculation_key": calc_keys_by_role[role], "role": role}
+        for _, role in _STATMECH_CALC_ROLES
+        if role in calc_keys_by_role and calc_keys_by_role[role]
+    ]
+
+
+def _ts_unmapped_smiles_handle(
+    *,
+    ts_record: Mapping[str, Any],
+    reaction_record: Mapping[str, Any],
+    species_index: Mapping[str, Mapping[str, Any]],
+) -> str | None:
+    """Derive a deterministic textual handle for the TS, or ``None``.
+
+    Resolution order, lifted directly from the spec:
+
+    1. ``ts_record['smiles']`` when ARC already attached a textual TS
+       identifier (rare in production — TS records usually have SMILES
+       null because there's no Lewis structure).
+    2. A canonical reaction-SMILES handle (``"r1.r2>>p1.p2"``) built
+       from the reactant/product species' SMILES looked up through
+       ``species_index``. ``.`` joins species; ``>>`` separates the
+       reactant and product sides — the standard interchange format
+       and unambiguous from ARC's data. This is *not* a claim that the
+       TS itself is a single molecule; it's the same data the field
+       intends to carry — a traceability handle.
+    3. ``None`` when any reactant or product lacks a SMILES, or when
+       the reaction has no reactant/product labels. The producer
+       refuses to emit a misleading half-handle; TCKDB stores NULL.
+
+    Determinism: the helper is a pure function of the input mappings.
+    No paths, timestamps, or run identifiers leak in. Idempotency keys
+    will move only if the upstream species SMILES change.
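+
+    For example (labels and SMILES as in the test fixtures), an
+    H-abstraction with reactants CHO (``[CH]=O``) and CH4 (``C``) and
+    products CH2O (``C=O``) and CH3 (``[CH3]``) yields the handle
+    ``"[CH]=O.C>>C=O.[CH3]"``.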
+ """ + direct = ts_record.get("smiles") + if direct: + text = str(direct).strip() + if text: + return text + + def _smiles_for_labels(labels: list[str] | None) -> list[str] | None: + out: list[str] = [] + for label in labels or []: + record = species_index.get(label) + if not isinstance(record, Mapping): + return None + smiles = record.get("smiles") + if not smiles: + return None + out.append(str(smiles).strip()) + return out or None + + reactant_smiles = _smiles_for_labels(reaction_record.get("reactant_labels")) + if reactant_smiles is None: + return None + product_smiles = _smiles_for_labels(reaction_record.get("product_labels")) + if product_smiles is None: + return None + + return f"{'.'.join(reactant_smiles)}>>{'.'.join(product_smiles)}" + + +# Result-shape adapter: ``CalculationInBundle`` (computed-species) +# wraps results in nested ``opt_result``/``freq_result``/``sp_result`` +# dicts; the network_pdep ``CalculationIn`` (which the computed-reaction +# endpoint extends) carries the same data as flat fields. The mapping +# below is the full v0 translation; ``_calculation_payload`` produces +# the wrapped shape, and :func:`_flatten_result_fields` rewrites it to +# the flat shape in place when building reaction bundles. +_REACTION_FLAT_RESULT_FIELDS: dict[str, dict[str, str]] = { + "opt_result": { + "converged": "opt_converged", + "n_steps": "opt_n_steps", + "final_energy_hartree": "opt_final_energy_hartree", + }, + "freq_result": { + "n_imag": "freq_n_imag", + "imag_freq_cm1": "freq_imag_freq_cm1", + "zpe_hartree": "freq_zpe_hartree", + }, + "sp_result": { + "electronic_energy_hartree": "sp_electronic_energy_hartree", + }, +} + + +def _flatten_result_fields(calc: dict[str, Any]) -> None: + """Convert wrapped ``opt_result``/``freq_result``/``sp_result`` into flat fields. + + Mutates ``calc`` in place: each wrapped result dict is removed and + its values are promoted to the network_pdep-style flat field names. + A no-op for IRC and any other calc type without a wrapped result — + those carry their data through ``parameters_json`` instead. + """ + for wrapped_key, field_map in _REACTION_FLAT_RESULT_FIELDS.items(): + result = calc.pop(wrapped_key, None) + if not isinstance(result, Mapping): + continue + for src, dst in field_map.items(): + if src in result: + calc[dst] = result[src] + + +def _flatten_all_reaction_calcs(bundle: dict[str, Any]) -> None: + """Walk a computed-reaction bundle and flatten every calc in place. + + Hits each species block's primary opt (under ``conformers[*].calculation``) + and additionals (under ``calculations``), then the TS's primary + (``transition_state.calculation``) and additionals + (``transition_state.calculations``). One pass at the bundle root + keeps the per-builder code free of shape concerns. + """ + for species in bundle.get("species") or []: + for conf in species.get("conformers") or []: + calc = conf.get("calculation") + if isinstance(calc, dict): + _flatten_result_fields(calc) + for calc in species.get("calculations") or []: + if isinstance(calc, dict): + _flatten_result_fields(calc) + ts = bundle.get("transition_state") + if isinstance(ts, dict): + primary = ts.get("calculation") + if isinstance(primary, dict): + _flatten_result_fields(primary) + for calc in ts.get("calculations") or []: + if isinstance(calc, dict): + _flatten_result_fields(calc) + + +# Roles defined by TCKDB's ``KineticsCalculationRole`` enum; mirrored +# here so the adapter stays loud about unknown roles instead of +# bouncing through a 422 at upload time. 
+_KINETICS_ROLE_REACTANT_ENERGY = "reactant_energy"
+_KINETICS_ROLE_PRODUCT_ENERGY = "product_energy"
+_KINETICS_ROLE_TS_ENERGY = "ts_energy"
+_KINETICS_ROLE_TS_FREQ = "freq"
+_KINETICS_ROLE_TS_IRC = "irc"
+
+
+def _build_kinetics_block(
+    *,
+    kinetics_record: Mapping[str, Any],
+    reactant_keys: list[str],
+    product_keys: list[str],
+    actor_calc_keys: Mapping[str, Mapping[str, str]],
+    ts_calc_keys: Mapping[str, str],
+) -> dict[str, Any] | None:
+    """Build a ``BundleKineticsIn``-shaped dict from ARC kinetics.
+
+    Mapping (ARC → TCKDB):
+        A        → a
+        A_units  → a_units            (via :func:`arc_to_tckdb_a_units`)
+        n        → n
+        Ea       → reported_ea
+        Ea_units → reported_ea_units  (via :func:`arc_to_tckdb_ea_units`)
+        Tmin_k   → tmin_k
+        Tmax_k   → tmax_k
+        dn       → n_uncertainty
+        dEa      → d_reported_ea      (only when dEa_units == Ea_units;
+                   the live schema has no d_reported_ea_units field,
+                   so dEa is only safe to emit when its units agree
+                   with Ea's)
+        dA       → a_uncertainty      (+ a_uncertainty_kind="multiplicative")
+                   Arkane/ARC dA is a multiplicative factor f, with
+                   the true value bracketed by [A/f, A*f]. TCKDB's
+                   ``KineticsUncertaintyKind`` exposes that semantic
+                   explicitly, so the producer preserves dA verbatim
+                   rather than dropping or re-encoding it. dA < 1.0
+                   is omitted (server rejects multiplicative factors
+                   below 1).
+
+    ``source_calculations`` is populated from each actor's emitted
+    sp/ts_sp/ts_freq/ts_irc keys. Reactant/product freq calcs are
+    deliberately *not* linked as ``role=freq`` — in v0 kinetics that
+    role is reserved for the TS frequency.
+
+    Returns ``None`` when none of A, n, Ea, Tmin_k, or Tmax_k is
+    populated — TCKDB's ``BundleKineticsIn`` allows empty kinetics, but
+    a totally empty record is just noise and easier to omit than to
+    send.
+    """
+    has_substantive_field = any(
+        kinetics_record.get(k) is not None
+        for k in ("A", "n", "Ea", "Tmin_k", "Tmax_k")
+    )
+    if not has_substantive_field:
+        return None
+
+    block: dict[str, Any] = {
+        "reactant_keys": list(reactant_keys),
+        "product_keys": list(product_keys),
+        "model_kind": "modified_arrhenius",
+    }
+
+    a = kinetics_record.get("A")
+    if a is not None:
+        try:
+            block["a"] = float(a)
+        except (TypeError, ValueError) as exc:
+            logger.warning("TCKDB kinetics: malformed A=%r (%s)", a, exc)
+    a_units = arc_to_tckdb_a_units(kinetics_record.get("A_units"))
+    if a_units:
+        block["a_units"] = a_units
+
+    n = kinetics_record.get("n")
+    if n is not None:
+        try:
+            block["n"] = float(n)
+        except (TypeError, ValueError) as exc:
+            logger.warning("TCKDB kinetics: malformed n=%r (%s)", n, exc)
+
+    ea = kinetics_record.get("Ea")
+    ea_units = arc_to_tckdb_ea_units(kinetics_record.get("Ea_units"))
+    if ea is not None:
+        try:
+            block["reported_ea"] = float(ea)
+        except (TypeError, ValueError) as exc:
+            logger.warning("TCKDB kinetics: malformed Ea=%r (%s)", ea, exc)
+    if ea_units:
+        block["reported_ea_units"] = ea_units
+
+    for arc_key, payload_key in (("Tmin_k", "tmin_k"), ("Tmax_k", "tmax_k")):
+        v = kinetics_record.get(arc_key)
+        if v is None:
+            continue
+        try:
+            f = float(v)
+        except (TypeError, ValueError) as exc:
+            logger.warning("TCKDB kinetics: malformed %s=%r (%s)", arc_key, v, exc)
+            continue
+        if f <= 0:
+            logger.warning(
+                "TCKDB kinetics: %s=%r is non-positive; field will be omitted "
+                "(server requires gt 0).", arc_key, v,
+            )
+            continue
+        block[payload_key] = f
+
+    dn = kinetics_record.get("dn")
+    if dn is not None:
+        try:
+            block["n_uncertainty"] = float(dn)
+        except (TypeError, ValueError) as exc:
+            logger.warning("TCKDB kinetics: malformed dn=%r (%s)", dn, exc)
+
+    # dEa policy: ARC's dEa_units may differ from Ea_units. TCKDB has
+    # only one Ea unit per kinetics row (no separate d_reported_ea_units
+    # column today), so we can only safely emit d_reported_ea when the
+    # producer reported it in the same units as Ea — otherwise the
+    # number would be silently misinterpreted as the wrong unit.
+    dea = kinetics_record.get("dEa")
+    if dea is not None:
+        dea_units_raw = kinetics_record.get("dEa_units")
+        dea_units = arc_to_tckdb_ea_units(dea_units_raw) if dea_units_raw else ea_units
+        if dea_units is not None and ea_units is not None and dea_units == ea_units:
+            try:
+                block["d_reported_ea"] = float(dea)
+            except (TypeError, ValueError) as exc:
+                logger.warning("TCKDB kinetics: malformed dEa=%r (%s)", dea, exc)
+        elif dea_units is None and ea_units is None:
+            # Best-effort: both unitless; pass through.
+            try:
+                block["d_reported_ea"] = float(dea)
+            except (TypeError, ValueError) as exc:
+                logger.warning("TCKDB kinetics: malformed dEa=%r (%s)", dea, exc)
+        else:
+            logger.debug(
+                "TCKDB kinetics: dEa units (%r) differ from Ea units (%r); "
+                "omitting d_reported_ea to avoid unit ambiguity.",
+                dea_units_raw, kinetics_record.get("Ea_units"),
+            )
+
+    # dA policy: Arkane/ARC dA is a multiplicative uncertainty factor
+    # (true A in [A/f, A*f]). TCKDB's ``KineticsUncertaintyKind.multiplicative``
+    # encodes exactly that, so we forward dA verbatim — never re-derive
+    # it as A*(dA-1) or fold it into an additive band. Pair the value
+    # with ``a_uncertainty_kind="multiplicative"`` (the schema requires
+    # both fields to appear together, and rejects multiplicative factors
+    # below 1.0).
+    da = kinetics_record.get("dA")
+    if da is not None:
+        try:
+            da_f = float(da)
+        except (TypeError, ValueError) as exc:
+            logger.warning("TCKDB kinetics: malformed dA=%r (%s)", da, exc)
+        else:
+            if da_f < 1.0:
+                logger.debug(
+                    "TCKDB kinetics: dA=%r is below 1.0; multiplicative factors "
+                    "must be >= 1, so a_uncertainty will be omitted.",
+                    da,
+                )
+            else:
+                block["a_uncertainty"] = da_f
+                block["a_uncertainty_kind"] = "multiplicative"
+
+    # Tunneling method ARC applied to the fit (currently always Eckart;
+    # surfaced through output.yml so the adapter doesn't have to hardcode
+    # the constant alongside the producer template). Absent → omit the
+    # field for backward compat with older output.yml versions.
+    tunneling = kinetics_record.get("tunneling")
+    if tunneling:
+        block["tunneling_model"] = str(tunneling)
+
+    sources = _build_kinetics_source_calculations(
+        reactant_keys=reactant_keys,
+        product_keys=product_keys,
+        actor_calc_keys=actor_calc_keys,
+        ts_calc_keys=ts_calc_keys,
+    )
+    if sources:
+        block["source_calculations"] = sources
+
+    return block
+
+
+def _build_kinetics_source_calculations(
+    *,
+    reactant_keys: list[str],
+    product_keys: list[str],
+    actor_calc_keys: Mapping[str, Mapping[str, str]],
+    ts_calc_keys: Mapping[str, str],
+) -> list[dict[str, Any]]:
+    """Compose the kinetics ``source_calculations`` list explicitly.
+
+    Producer policy: declare exactly the links we have evidence for.
+
+    - Each reactant/product contributes its sp calc (if present) under
+      ``reactant_energy``/``product_energy``.
+    - The TS contributes ts_sp under ``ts_energy``.
+    - The TS contributes ts_freq under ``freq`` (kinetics ``freq`` role
+      is the *TS* frequency in v0, not reactant freq).
+    - The TS contributes ts_irc under ``irc`` only when ARC produced
+      and persisted an IRC calc.
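+
+    Worked example (hypothetical bundle-local keys): one reactant whose
+    sp calc key is ``"r0_sp"``, no product sp calcs, and a TS carrying
+    ``"ts_sp"`` and ``"ts_freq"`` but no IRC yields
+    ``[{"calculation_key": "r0_sp", "role": "reactant_energy"},
+    {"calculation_key": "ts_sp", "role": "ts_energy"},
+    {"calculation_key": "ts_freq", "role": "freq"}]``.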
+
+    Anything missing is omitted — the spec is explicit that a missing
+    source link is preferable to a fabricated one.
+    """
+    links: list[dict[str, Any]] = []
+    for actor_key in reactant_keys:
+        sp_key = actor_calc_keys.get(actor_key, {}).get(_CALC_KEY_SP)
+        if sp_key:
+            links.append({"calculation_key": sp_key, "role": _KINETICS_ROLE_REACTANT_ENERGY})
+    for actor_key in product_keys:
+        sp_key = actor_calc_keys.get(actor_key, {}).get(_CALC_KEY_SP)
+        if sp_key:
+            links.append({"calculation_key": sp_key, "role": _KINETICS_ROLE_PRODUCT_ENERGY})
+    ts_sp = ts_calc_keys.get(_CALC_KEY_SP)
+    if ts_sp:
+        links.append({"calculation_key": ts_sp, "role": _KINETICS_ROLE_TS_ENERGY})
+    ts_freq = ts_calc_keys.get(_CALC_KEY_FREQ)
+    if ts_freq:
+        links.append({"calculation_key": ts_freq, "role": _KINETICS_ROLE_TS_FREQ})
+    ts_irc = ts_calc_keys.get(_CALC_KEY_IRC)
+    if ts_irc:
+        links.append({"calculation_key": ts_irc, "role": _KINETICS_ROLE_TS_IRC})
+    return links
+
+
+# IRC direction labels mirror TCKDB's ``IRCDirection`` enum. They are
+# ESS path-direction labels only — the producer does not infer
+# reactant/product side from them.
+_IRC_DIRECTION_FORWARD = "forward"
+_IRC_DIRECTION_REVERSE = "reverse"
+_IRC_DIRECTION_BOTH = "both"
+
+
+def _detect_irc_direction(log_path: str) -> str | None:
+    """Detect IRC direction (forward/reverse) from a log filename.
+
+    ARC writes per-direction IRC logs whose filenames carry an explicit
+    direction infix. The two patterns seen in production runs:
+
+    - ``..._irc_f.log`` / ``..._irc_r.log`` (compact runtime convention)
+    - ``..._forward.log`` / ``..._reverse.log`` (long-form / test fixtures)
+
+    Returns ``None`` if the filename matches neither — the caller still
+    emits the trajectory but omits the per-point direction. ``None``
+    is also the right answer when ARC has consolidated forward+reverse
+    into a single log; current ARC adapters don't produce that shape,
+    but the caller treats it correctly anyway.
+    """
+    name = Path(str(log_path)).name.lower()
+    if "irc_f" in name or "forward" in name:
+        return _IRC_DIRECTION_FORWARD
+    if "irc_r" in name or "reverse" in name:
+        return _IRC_DIRECTION_REVERSE
+    return None
+
+
+# Hartree → kJ/mol conversion. Matches CODATA 2018 within the
+# truncation TCKDB downstream code uses for relative energies; defining
+# it inline keeps this helper free of further imports at module top.
+_HARTREE_TO_KJ_MOL = 2625.4996
+
+
+def _build_irc_result_payload(
+    trajectories: list[dict[str, Any]],
+    zero_energy_reference_hartree: float | None = None,
+) -> dict[str, Any] | None:
+    """Build a TCKDB ``IRCResultPayload``-shaped dict from parsed trajectories.
+
+    Each trajectory is a
+    ``{"direction": str | None,
+       "rich_points": [rich_point_dict, ...] | None,
+       "geom_points": [xyz_dict, ...] | None}``
+    as produced by :meth:`TCKDBAdapter._parse_irc_trajectories`. When
+    ``rich_points`` is populated, per-point ``electronic_energy_hartree``,
+    ``reaction_coordinate``, ``max_gradient``, ``rms_gradient``, and (when
+    Gaussian provides them) per-point ``direction`` flow through. When
+    only ``geom_points`` is available the result reduces to the original
+    geometry-only IRC payload behavior — energies/gradients are simply
+    omitted; the schema makes them optional.
+
+    ``point_index`` is allocated globally across both branches so the
+    server-side uniqueness invariant holds. Direction labels are passed
+    straight through — forward/reverse are ESS path labels, not
+    reactant/product designators.
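+
+    For instance (hypothetical shapes): a geometry-only ``"forward"``
+    trajectory of two points followed by a ``"reverse"`` trajectory of
+    two points yields four points with ``point_index`` 0..3, per-point
+    ``direction`` labels, and an overall ``direction`` of ``"both"``.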
+ + ``zero_energy_reference_hartree`` is the TS reference energy at the + IRC level of theory, resolved by + :func:`_resolve_irc_zero_energy_reference`. When non-null, per-point + ``relative_energy_kj_mol`` is computed against it and the result- + level field is stamped on the payload. When null, both stay absent + rather than fabricated. + """ + if not trajectories: + return None + # Late-imported to avoid pulling species converter at adapter import + # time when the producer is configured but disabled. + from arc.species.converter import xyz_to_str + + points: list[dict[str, Any]] = [] + has_forward = False + has_reverse = False + for traj in trajectories: + traj_direction = traj.get("direction") + rich_points = traj.get("rich_points") or [] + geom_points = traj.get("geom_points") or [] + # Rich points carry per-point direction (Gaussian's FORWARD/ + # REVERSE announcement). Geometry-only points inherit the + # trajectory-level direction resolved upstream from the + # scheduler list / filename heuristic. + if rich_points: + iter_records = ( + { + "xyz": rp.get("xyz"), + "direction": rp.get("direction") or traj_direction, + "electronic_energy_hartree": rp.get("electronic_energy_hartree"), + "reaction_coordinate": rp.get("reaction_coordinate"), + "max_gradient": rp.get("max_gradient"), + "rms_gradient": rp.get("rms_gradient"), + } + for rp in rich_points + ) + else: + iter_records = ( + { + "xyz": xyz, + "direction": traj_direction, + "electronic_energy_hartree": None, + "reaction_coordinate": None, + "max_gradient": None, + "rms_gradient": None, + } + for xyz in geom_points + ) + + for record in iter_records: + direction = record["direction"] + if direction == _IRC_DIRECTION_FORWARD: + has_forward = True + elif direction == _IRC_DIRECTION_REVERSE: + has_reverse = True + point: dict[str, Any] = {"point_index": len(points)} + if direction in (_IRC_DIRECTION_FORWARD, _IRC_DIRECTION_REVERSE): + point["direction"] = direction + xyz_dict = record["xyz"] + if xyz_dict is not None: + try: + xyz_str = xyz_to_str(xyz_dict=xyz_dict) + except Exception as exc: + logger.debug( + "TCKDB computed-reaction: IRC point xyz_to_str failed: %s", + exc, + ) + xyz_str = None + normalized = _normalize_xyz_text(xyz_str, None) + if normalized: + point["geometry"] = {"xyz_text": normalized} + energy = record["electronic_energy_hartree"] + if energy is not None: + point["electronic_energy_hartree"] = float(energy) + if zero_energy_reference_hartree is not None: + point["relative_energy_kj_mol"] = ( + (float(energy) - zero_energy_reference_hartree) + * _HARTREE_TO_KJ_MOL + ) + rc = record["reaction_coordinate"] + if rc is not None: + point["reaction_coordinate"] = float(rc) + max_grad = record["max_gradient"] + if max_grad is not None: + point["max_gradient"] = float(max_grad) + rms_grad = record["rms_gradient"] + if rms_grad is not None: + point["rms_gradient"] = float(rms_grad) + points.append(point) + if not points: + return None + if has_forward and has_reverse: + overall = _IRC_DIRECTION_BOTH + elif has_forward: + overall = _IRC_DIRECTION_FORWARD + elif has_reverse: + overall = _IRC_DIRECTION_REVERSE + else: + # Direction couldn't be detected from filenames. The schema + # requires a direction enum value — default to ``both`` since + # ARC's invariant is that any IRC run with multiple logs covers + # both branches. With a single unlabeled log we still pick + # ``both`` rather than guessing one side; the per-point + # direction is simply omitted. 
+        overall = _IRC_DIRECTION_BOTH
+    payload: dict[str, Any] = {
+        "direction": overall,
+        "has_forward": has_forward,
+        "has_reverse": has_reverse,
+        "point_count": len(points),
+        "points": points,
+    }
+    if zero_energy_reference_hartree is not None:
+        payload["zero_energy_reference_hartree"] = float(
+            zero_energy_reference_hartree
+        )
+    return payload
+
+
+def _level_keys_match(a: Mapping[str, Any] | None, b: Mapping[str, Any] | None) -> bool:
+    """Conservative level-of-theory equality check.
+
+    Compares only the fields TCKDB treats as primary keys for
+    ``LevelOfTheoryRef`` (method/basis/aux_basis/cabs_basis). Differences
+    in software, dispersion, or solvation are intentionally not enough
+    to declare equality here — but they're also not enough to declare
+    inequality, since ARC frequently leaves them null on one side
+    (the opt level) and populated on the other (the sp level) without
+    that meaning the *energies* are at different levels.
+    """
+    if a is None or b is None:
+        return False
+    for field in _TCKDB_LOT_REF_FIELDS:
+        av = a.get(field)
+        bv = b.get(field)
+        if av is None and bv is None:
+            continue
+        if av is None or bv is None:
+            return False
+        if str(av).strip().lower() != str(bv).strip().lower():
+            return False
+    return True
+
+
+def _resolve_irc_zero_energy_reference(
+    *,
+    output_doc: Mapping[str, Any],
+    ts_record: Mapping[str, Any],
+) -> float | None:
+    """Pick the TS reference electronic energy for an IRC path.
+
+    ARC runs IRC at the opt level (see ``level_kind="opt"`` at the IRC
+    calculation construction site). The reference must therefore live at
+    that same level, or the relative energies it produces would mix two
+    levels of theory.
+
+    Resolution order, conservative by design:
+
+    1. The TS record's SP electronic energy (``sp_energy_hartree``,
+       falling back to ``electronic_energy_hartree``) *only if* the
+       project SP level matches the project opt level (LoT keys equal).
+    2. ``ts_record['opt_final_energy_hartree']`` — the TS opt's converged
+       SCF, which is by construction at the opt level.
+    3. ``None`` — never fabricate a reference. Downstream consumers
+       interpret a null ``zero_energy_reference_hartree`` as "relative
+       energies unavailable" and skip the ``relative_energy_kj_mol``
+       per-point field accordingly.
+    """
+    opt_level = _resolve_level(output_doc, "opt")
+    sp_level = output_doc.get("sp_level")
+    sp_level = sp_level if isinstance(sp_level, Mapping) else None
+    sp_energy = ts_record.get("sp_energy_hartree")
+    if sp_energy is None:
+        sp_energy = ts_record.get("electronic_energy_hartree")
+    if sp_energy is not None and _level_keys_match(opt_level, sp_level):
+        try:
+            return float(sp_energy)
+        except (TypeError, ValueError):
+            pass
+    opt_energy = ts_record.get("opt_final_energy_hartree")
+    if opt_energy is not None:
+        try:
+            return float(opt_energy)
+        except (TypeError, ValueError):
+            pass
+    return None
+
+
+def _normalize_xyz_text(xyz: str | None, label: str | None) -> str | None:
+    """Convert an ARC atom-only xyz string into TCKDB's standard XYZ format.
+
+    Input shape (what ``xyz_to_str`` emits):
+        ``"C 0.0 0.0 0.0\\nH 1.0 0.0 0.0"``
+    Output shape (what TCKDB ``GeometryPayload.xyz_text`` expects):
+        ``"<atom_count>\\n<comment/label>\\n<atom lines>"``
+
+    If the input already has a valid integer atom-count header, it is
+    returned untouched. Returns ``None`` for null/empty input. This is
+    the format-translation boundary between ARC's internal convention
+    and the TCKDB schema; input is not required to be present.
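+
+    For example, ``_normalize_xyz_text("C 0.0 0.0 0.0\\nH 1.0 0.0 0.0",
+    "ethanol")`` returns ``"2\\nethanol\\nC 0.0 0.0 0.0\\nH 1.0 0.0 0.0"``,
+    while an input whose first line is already an integer atom count
+    comes back unchanged.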
+    """
+    if not xyz:
+        return None
+    text = str(xyz).strip()
+    if not text:
+        return None
+    lines = text.splitlines()
+    try:
+        int(lines[0].strip())
+        return text
+    except (ValueError, IndexError):
+        pass
+    return f"{len(lines)}\n{label or ''}\n{text}"
+
+
 def _require_xyz_text(record: Mapping[str, Any]) -> str:
-    """Pull the xyz string out of an output.yml species record.
-
-    Converts ARC's atom-only xyz (``"C 0.0 0.0 0.0\\nH 1.0 0.0 0.0"``,
-    emitted by ``xyz_to_str``) into the standard XYZ format that TCKDB
-    expects (``"<atom_count>\\n<label>\\n<atom lines>"``). If the input
-    already has an integer atom-count header, it is passed through
-    untouched. This is the format-translation boundary between ARC's
-    internal convention and the TCKDB schema.
+    """Pull and normalize the species record's reference xyz, raising if absent.
+
+    Wraps :func:`_normalize_xyz_text` for the conformer-level geometry
+    case where missing xyz is a fatal error (the bundle requires a
+    ``ConformerInBundle.geometry``). For optional per-calc input
+    geometries, call ``_normalize_xyz_text`` directly so a missing xyz
+    just yields ``None`` and the caller omits the field.
     """
     xyz = record.get("xyz")
     if not xyz:
@@ -1423,20 +4252,12 @@ def _require_xyz_text(record: Mapping[str, Any]) -> str:
         f"output.yml record for label={record.get('label')!r} has no xyz; "
         "cannot build geometry payload."
     )
-    text = str(xyz).strip()
-    if not text:
+    text = _normalize_xyz_text(xyz, record.get("label"))
+    if text is None:
         raise ValueError(
             f"output.yml record for label={record.get('label')!r} has empty xyz."
         )
-
-    lines = text.splitlines()
-    try:
-        int(lines[0].strip())
-        return text
-    except (ValueError, IndexError):
-        pass
-    label = record.get("label") or ""
-    return f"{len(lines)}\n{label}\n{text}"
+    return text
 
 
 def _summarize_response_body(body: Any, *, max_chars: int = 2000) -> Any:
@@ -1528,10 +4349,14 @@ def _close_quietly(client: Any, context: str) -> None:
 __all__ = [
     "ARTIFACTS_ENDPOINT_TEMPLATE",
     "ArtifactUploadOutcome",
+    "COMPUTED_REACTION_ENDPOINT",
+    "COMPUTED_REACTION_KIND",
     "COMPUTED_SPECIES_ENDPOINT",
     "COMPUTED_SPECIES_KIND",
    "CONFORMER_UPLOAD_ENDPOINT",
     "PAYLOAD_KIND",
     "TCKDBAdapter",
     "UploadOutcome",
+    "arc_to_tckdb_a_units",
+    "arc_to_tckdb_ea_units",
 ]
diff --git a/arc/tckdb/adapter_test.py b/arc/tckdb/adapter_test.py
index d12415cce6..9987f61804 100644
--- a/arc/tckdb/adapter_test.py
+++ b/arc/tckdb/adapter_test.py
@@ -18,10 +18,14 @@
 from arc.tckdb.adapter import (
     ARTIFACTS_ENDPOINT_TEMPLATE,
     ArtifactUploadOutcome,
+    COMPUTED_REACTION_ENDPOINT,
+    COMPUTED_REACTION_KIND,
     CONFORMER_UPLOAD_ENDPOINT,
     PAYLOAD_KIND,
     TCKDBAdapter,
     UploadOutcome,
+    arc_to_tckdb_a_units,
+    arc_to_tckdb_ea_units,
 )
 from arc.tckdb.config import TCKDBArtifactConfig, TCKDBConfig
 
@@ -593,6 +597,68 @@ def test_unimplemented_kind_defensive_skip(self):
         self.assertIn("no upload path yet", outcome.skip_reason)
         self.assertEqual(client.calls, [])
 
+    # ---------------- bundle-mode suppression
+    def _bundle_cfg(self, upload_mode: str):
+        return TCKDBConfig(
+            enabled=True,
+            base_url="http://localhost:8000/api/v1",
+            payload_dir=self.payload_dir,
+            api_key_env="X_TCKDB_API_KEY",
+            project_label="proj-A",
+            upload_mode=upload_mode,
+            artifacts=TCKDBArtifactConfig(upload=True, kinds=("output_log", "input")),
+        )
+
+    def test_computed_species_mode_suppresses_standalone_output_log_artifact(self):
+        # In computed_species bundle mode the bundle payload already
+        # carries output_log inline under each calc; a standalone POST
+        # would (a) duplicate the upload and (b) bake a stale calc id
+ # into the sidecar that 404s on DB resets. + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client, cfg=self._bundle_cfg("computed_species")) + outcome = self._submit(adapter, kind="output_log") + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + self.assertIn("computed_species", outcome.skip_reason) + self.assertIn("inline", outcome.skip_reason) + # No sidecar should land on disk for the suppressed call. + sidecar_dir = pathlib.Path(self.project_dir) / self.payload_dir / "calculation_artifacts" + self.assertFalse(sidecar_dir.exists() and any(sidecar_dir.iterdir())) + + def test_computed_species_mode_suppresses_standalone_input_artifact(self): + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client, cfg=self._bundle_cfg("computed_species")) + # Stage an input deck file so the kind passes upstream gates. + input_path = os.path.join(self.project_dir, "calcs", "Species", "ethanol", "opt", "input.gjf") + with open(input_path, "wb") as fh: + fh.write(b"# dummy gjf\n") + outcome = self._submit(adapter, log_path=input_path, kind="input") + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + self.assertIn("inline", outcome.skip_reason) + + def test_computed_reaction_mode_suppresses_standalone_artifact(self): + client = _StubClient(response=_StubResponse({}, status_code=201)) + adapter = self._adapter(client, cfg=self._bundle_cfg("computed_reaction")) + outcome = self._submit(adapter, kind="output_log") + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + self.assertIn("computed_reaction", outcome.skip_reason) + + def test_conformer_mode_still_uploads_standalone_artifact(self): + # Legacy conformer mode must keep working — primitive flows + # don't have an inline artifact path, so disabling standalone + # uploads here would silently drop data. + client = _StubClient(response=_StubResponse( + {"calculation_id": 42, "artifacts": [{"id": 7}]}, + status_code=201, + )) + # Default upload_mode is "conformer". 
+        adapter = self._adapter(client)
+        outcome = self._submit(adapter, kind="output_log")
+        self.assertEqual(outcome.status, "uploaded")
+        self.assertEqual(len(client.calls), 1)
+
 
 class TestUploadOutcomeCalcRefs(unittest.TestCase):
     """Conformer upload exposes primary/additional calc refs from response."""
@@ -1027,10 +1093,13 @@ def _full_record():
             "tmax_k": 5000.0,
             "coeffs": [3.5, 1e-3, -2e-7, 1e-11, -3e-15, -28500.0, 5.0],
         },
-        "cp_data": [
-            {"temperature_k": 300.0, "cp_j_mol_k": 33.6},
-            {"temperature_k": 400.0, "cp_j_mol_k": 35.2},
-            {"temperature_k": 500.0, "cp_j_mol_k": 37.0},
+        "thermo_points": [
+            {"temperature_k": 300.0, "cp_j_mol_k": 33.6,
+             "h_kj_mol": -230.5, "s_j_mol_k": 285.1, "g_kj_mol": -315.9},
+            {"temperature_k": 400.0, "cp_j_mol_k": 35.2,
+             "h_kj_mol": -227.1, "s_j_mol_k": 295.3, "g_kj_mol": -345.2},
+            {"temperature_k": 500.0, "cp_j_mol_k": 37.0,
+             "h_kj_mol": -223.5, "s_j_mol_k": 303.4, "g_kj_mol": -375.2},
         ],
     }
     return record
@@ -1104,6 +1173,379 @@ def test_primary_opt_calculation_maps_correctly(self):
         self.assertIn("opt_result", primary)
         self.assertEqual(primary["opt_result"]["n_steps"], 12)
 
+    # ---------------- 2a: opt_converged → opt_result.converged
+    def test_opt_converged_true_maps_to_opt_result_converged(self):
+        """``opt_converged: True`` on the record (from output.py:541) must
+        land as ``opt_result.converged: true`` so calc_opt_result.converged
+        isn't NULL in the DB for known-good runs."""
+        record = _full_record()
+        record["opt_converged"] = True
+        _, _, payload = self._submit(record=record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertIs(primary["opt_result"]["converged"], True)
+
+    def test_opt_converged_false_maps_through_unchanged(self):
+        """A failed-to-converge opt should still upload, with
+        ``converged: false`` so the downstream consumer sees the truth."""
+        record = _full_record()
+        record["opt_converged"] = False
+        _, _, payload = self._submit(record=record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertIs(primary["opt_result"]["converged"], False)
+
+    def test_opt_converged_absent_omits_field(self):
+        """Records that don't carry opt_converged (older output.yml or
+        non-converged species) should produce an opt_result without the
+        ``converged`` key — not ``converged: null`` — so the schema's
+        default kicks in cleanly."""
+        record = _full_record()
+        record.pop("opt_converged", None)  # ensure absent
+        _, _, payload = self._submit(record=record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertNotIn("converged", primary["opt_result"])
+
+    def test_opt_converged_none_omits_field(self):
+        """Explicit None on the record (treated as absent) → key omitted."""
+        record = _full_record()
+        record["opt_converged"] = None
+        _, _, payload = self._submit(record=record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertNotIn("converged", primary["opt_result"])
+
+    # ---------------- 2b: opt_input_xyz → opt.input_geometries
+    def test_opt_input_xyz_attaches_as_input_geometry(self):
+        """``opt_input_xyz`` from output.yml lands as
+        ``primary_calculation.input_geometries[0].xyz_text``."""
+        record = _full_record()
+        record["opt_input_xyz"] = "C 0.001 0.000 0.000\nH 1.090 0.000 0.000"
+        _, _, payload = self._submit(record=record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertIn("input_geometries", primary)
+        self.assertEqual(len(primary["input_geometries"]), 1)
+        # The atom-only string gets normalized to TCKDB's standard
+        # "<atom_count>\n<comment>\n<atom lines>" format inside the bundle.
+        text = primary["input_geometries"][0]["xyz_text"]
+        self.assertEqual(text.splitlines()[0].strip(), "2")  # atom count header
+
+    def test_freq_and_sp_input_geometries_set_to_conformer_geom(self):
+        """Freq + sp now explicitly declare the conformer's optimized
+        geometry as their input. ARC's invariant guarantees they ran
+        on this geometry; explicit-over-implicit removes ambiguity for
+        consumers that don't replicate TCKDB's auto-fill rule."""
+        _, _, payload = self._submit()
+        conformer_xyz = payload["conformers"][0]["geometry"]["xyz_text"]
+        for calc in payload["conformers"][0]["additional_calculations"]:
+            self.assertIn(
+                "input_geometries", calc,
+                msg=f"{calc['key']} should explicitly declare input_geometries "
+                    "set to the conformer's optimized geometry",
+            )
+            self.assertEqual(len(calc["input_geometries"]), 1)
+            self.assertEqual(
+                calc["input_geometries"][0]["xyz_text"], conformer_xyz,
+                msg=f"{calc['key']}'s input geometry must match the conformer geometry",
+            )
+
+    def test_opt_input_xyz_does_not_leak_into_freq_sp(self):
+        """Even when the record carries ``opt_input_xyz``, freq + sp
+        must use the conformer (optimized) geometry, NOT the pre-opt
+        xyz. They genuinely ran on the optimized output."""
+        record = _full_record()
+        record["opt_input_xyz"] = "C 9.999 9.999 9.999\nH 8.888 8.888 8.888"  # distinctive
+        _, _, payload = self._submit(record=record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        # Opt has the pre-opt xyz.
+        self.assertIn("9.999", primary["input_geometries"][0]["xyz_text"])
+        # Freq + sp do NOT — their input is the conformer (post-opt) geom.
+        for calc in payload["conformers"][0]["additional_calculations"]:
+            self.assertNotIn(
+                "9.999", calc["input_geometries"][0]["xyz_text"],
+                msg=f"{calc['key']}'s input geometry must not be the pre-opt xyz",
+            )
+
+    def test_opt_input_xyz_omitted_when_absent_no_fallback_to_optimized(self):
+        """No ``opt_input_xyz`` on the record → no ``input_geometries``
+        on opt's calc block. We must NOT silently substitute the
+        conformer (optimized) geometry for opt — that's opt's output,
+        not its input. Backend has no auto-fill for opt either, so the
+        ``calculation_input_geometry`` row stays absent for that calc.
+        Honest-empty beats wrong-link."""
+        record = _full_record()
+        record.pop("opt_input_xyz", None)  # ensure absent
+        _, _, payload = self._submit(record=record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertNotIn("input_geometries", primary)
+        # Sanity: freq/sp still get explicit conformer geom even when
+        # opt's input is unknown.
+        for calc in payload["conformers"][0]["additional_calculations"]:
+            self.assertIn("input_geometries", calc)
+
+    def test_opt_input_xyz_preserves_existing_count_header(self):
+        """If output.yml ever ships a properly-headered xyz, pass it
+        through untouched rather than double-headering it."""
+        record = _full_record()
+        record["opt_input_xyz"] = "2\nethanol\nC 0.001 0.000 0.000\nH 1.090 0.000 0.000"
+        _, _, payload = self._submit(record=record)
+        text = payload["conformers"][0]["primary_calculation"]["input_geometries"][0]["xyz_text"]
+        # Already-headered → identical
+        self.assertEqual(text.splitlines()[0].strip(), "2")
+        self.assertEqual(text.splitlines()[1].strip(), "ethanol")
+
+    def test_no_db_ids_in_input_geometries(self):
+        """``input_geometries`` entries must carry only ``xyz_text``,
+        never ``geometry_id`` / ``existing_geometry_id`` / etc.
+ (DR-0029: bundles are self-contained, local data only).""" + record = _full_record() + record["opt_input_xyz"] = "C 0.0 0.0 0.0\nH 1.0 0.0 0.0" + _, _, payload = self._submit(record=record) + forbidden = {"geometry_id", "existing_geometry_id", "id"} + primary = payload["conformers"][0]["primary_calculation"] + for ig in primary.get("input_geometries", []): + self.assertEqual(set(ig.keys()) & forbidden, set(), + msg=f"opt input_geometry has DB id key(s): {ig}") + for calc in payload["conformers"][0]["additional_calculations"]: + for ig in calc.get("input_geometries", []): + self.assertEqual(set(ig.keys()) & forbidden, set(), + msg=f"{calc['key']} input_geometry has DB id key(s): {ig}") + + # ------------------------------------------------------------------ + # Coarse → fine optimization provenance. + # ------------------------------------------------------------------ + + def _coarse_record(self): + """A record that mirrors a real two-stage opt: coarse log + parsed + coarse output xyz, fine opt fields populated, freq + sp + thermo + all present so the bundle exercises the full chain.""" + record = _full_record() + record["coarse_opt_log"] = "calcs/.../coarse/input.log" + record["coarse_opt_n_steps"] = 8 + record["coarse_opt_final_energy_hartree"] = -154.108 + record["coarse_opt_input_xyz"] = ( + "C 9.999 9.999 9.999\nH 8.888 8.888 8.888" # distinctive pre-coarse + ) + record["coarse_opt_output_xyz"] = ( + "C 1.111 2.222 3.333\nH 4.444 5.555 6.666" # distinctive coarse output + ) + # opt_input_xyz now means "fine opt's input" = coarse output. + record["opt_input_xyz"] = record["coarse_opt_output_xyz"] + return record + + # ---------------- spec test 1: single-stage unchanged + def test_single_stage_opt_emits_no_opt_coarse(self): + record = _full_record() + # Sanity: no coarse_opt_log on this record. + self.assertNotIn("coarse_opt_log", record) + _, _, payload = self._submit(record=record) + keys = [c["key"] for c in payload["conformers"][0]["additional_calculations"]] + self.assertNotIn("opt_coarse", keys) + # Primary remains opt; no optimized_from edge. + primary = payload["conformers"][0]["primary_calculation"] + self.assertEqual(primary["key"], "opt") + self.assertNotIn("depends_on", primary) + + # ---------------- spec test 2: coarse + fine emits two opt calcs + def test_coarse_plus_fine_emits_two_opt_calcs_primary_is_fine(self): + record = self._coarse_record() + _, _, payload = self._submit(record=record) + primary = payload["conformers"][0]["primary_calculation"] + additional_keys = [c["key"] for c in payload["conformers"][0]["additional_calculations"]] + # Primary is the FINE opt — geometry of record. + self.assertEqual(primary["key"], "opt") + self.assertEqual(primary["type"], "opt") + # opt_coarse is an additional calc, alongside freq + sp. + self.assertIn("opt_coarse", additional_keys) + self.assertIn("freq", additional_keys) + self.assertIn("sp", additional_keys) + # Type of opt_coarse is also "opt" (it's an opt-stage calc). 
+ opt_coarse = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["key"] == "opt_coarse") + self.assertEqual(opt_coarse["type"], "opt") + + # ---------------- spec test 3: geometry chain is correct + def test_coarse_geometry_chain_is_correct(self): + record = self._coarse_record() + _, _, payload = self._submit(record=record) + primary = payload["conformers"][0]["primary_calculation"] + opt_coarse = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["key"] == "opt_coarse") + # opt_coarse's input is the species' truly-initial xyz. + self.assertIn("9.999", opt_coarse["input_geometries"][0]["xyz_text"]) + # The fine opt's input is the COARSE output, not the pre-coarse xyz. + self.assertIn("1.111", primary["input_geometries"][0]["xyz_text"]) + self.assertNotIn("9.999", primary["input_geometries"][0]["xyz_text"]) + # Conformer geometry is the fine opt's output (= record["xyz"]). + conformer_xyz = payload["conformers"][0]["geometry"]["xyz_text"] + # _full_record xyz is "C 0.0 0.0 0.0\nH 1.0 0.0 0.0" — neither + # the coarse-input nor coarse-output coords appear. + self.assertNotIn("9.999", conformer_xyz) + self.assertNotIn("1.111", conformer_xyz) + + # ---------------- spec test 4: dependency edges + def test_coarse_dependency_edges_are_correct(self): + record = self._coarse_record() + _, _, payload = self._submit(record=record) + primary = payload["conformers"][0]["primary_calculation"] + # opt → opt_coarse with role optimized_from. + self.assertEqual( + primary["depends_on"], + [{"parent_calculation_key": "opt_coarse", "role": "optimized_from"}], + ) + # freq → opt with role freq_on. + freq = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["key"] == "freq") + self.assertEqual( + freq["depends_on"], + [{"parent_calculation_key": "opt", "role": "freq_on"}], + ) + # sp → opt with role single_point_on. + sp = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["key"] == "sp") + self.assertEqual( + sp["depends_on"], + [{"parent_calculation_key": "opt", "role": "single_point_on"}], + ) + # opt_coarse has no upstream calc (chain head). + opt_coarse = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["key"] == "opt_coarse") + self.assertNotIn("depends_on", opt_coarse) + + # ---------------- spec test 5: thermo source links exclude opt_coarse + def test_coarse_thermo_source_calculations_exclude_opt_coarse(self): + record = self._coarse_record() + _, _, payload = self._submit(record=record) + sources = payload["thermo"]["source_calculations"] + keys = [s["calculation_key"] for s in sources] + # opt + freq + sp are sources. opt_coarse is upstream provenance, + # not a direct thermo source. + self.assertEqual(keys, ["opt", "freq", "sp"]) + self.assertNotIn("opt_coarse", keys) + + # ---------------- spec test 6: unparseable coarse → fall back safely + def test_unparseable_coarse_geometry_falls_back_to_single_stage_bundle(self): + record = self._coarse_record() + record["coarse_opt_output_xyz"] = None # parse failure shape from output.py + # Fine opt's input would have come from coarse output; if absent, + # the producer fell back so opt_input_xyz = the species' truly + # initial xyz. Mirror that here. 
+ record["opt_input_xyz"] = "C 0.001 0.000 0.000\nH 1.090 0.000 0.000" + _, _, payload = self._submit(record=record) + primary = payload["conformers"][0]["primary_calculation"] + additional_keys = [c["key"] for c in payload["conformers"][0]["additional_calculations"]] + # No opt_coarse calc — would have been half-described without + # the output geometry to chain. + self.assertNotIn("opt_coarse", additional_keys) + # Fine opt's depends_on stays empty (no optimized_from edge) — + # we wouldn't introduce a dangling edge to a non-existent calc. + self.assertNotIn("depends_on", primary) + # Bundle still validates as a single-stage opt+freq+sp. + self.assertEqual(primary["key"], "opt") + self.assertIn("freq", additional_keys) + self.assertIn("sp", additional_keys) + + # ---------------- coarse opt's result block carries n_steps / energy + def test_coarse_opt_result_block_carries_coarse_metrics(self): + record = self._coarse_record() + _, _, payload = self._submit(record=record) + opt_coarse = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["key"] == "opt_coarse") + self.assertIn("opt_result", opt_coarse) + self.assertEqual(opt_coarse["opt_result"]["n_steps"], 8) + self.assertAlmostEqual(opt_coarse["opt_result"]["final_energy_hartree"], -154.108) + # Convergence is implicit "ran to completion" — output.py only + # writes coarse_opt_log on the success path. + self.assertIs(opt_coarse["opt_result"]["converged"], True) + + # ---------------- determinism: same inputs → same idempotency key + def test_coarse_bundle_idempotency_key_is_deterministic(self): + out1, _, _ = self._submit(record=self._coarse_record()) + out2, _, _ = self._submit(record=self._coarse_record()) + self.assertEqual(out1.idempotency_key, out2.idempotency_key) + # And distinct from the single-stage bundle's key. + out3, _, _ = self._submit(record=_full_record()) + self.assertNotEqual(out1.idempotency_key, out3.idempotency_key) + + # ------------------------------------------------------------------ + # output_geometries (TCKDB-side support landed; ARC declares them + # explicitly for opt + opt_coarse; freq/sp omit by design). + # ------------------------------------------------------------------ + + def test_single_stage_opt_emits_output_geometries_with_final_role(self): + """Single-stage opt's output_geometries[0] = conformer xyz, role=final.""" + _, _, payload = self._submit() + primary = payload["conformers"][0]["primary_calculation"] + conformer_xyz = payload["conformers"][0]["geometry"]["xyz_text"] + self.assertIn("output_geometries", primary) + self.assertEqual(len(primary["output_geometries"]), 1) + entry = primary["output_geometries"][0] + self.assertEqual(entry["geometry"]["xyz_text"], conformer_xyz) + self.assertEqual(entry["role"], "final") + + def test_coarse_opt_output_geometry_is_coarse_output_xyz(self): + """opt_coarse's output_geometries[0] = parsed coarse output xyz, role=final.""" + record = self._coarse_record() + _, _, payload = self._submit(record=record) + opt_coarse = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["key"] == "opt_coarse") + self.assertIn("output_geometries", opt_coarse) + self.assertEqual(len(opt_coarse["output_geometries"]), 1) + entry = opt_coarse["output_geometries"][0] + # Distinctive coords from _coarse_record's coarse_opt_output_xyz + # ("C 1.111 2.222 3.333\n...") — must match. 
+ self.assertIn("1.111", entry["geometry"]["xyz_text"]) + self.assertEqual(entry["role"], "final") + + def test_coarse_plus_fine_opt_output_geometry_is_conformer_xyz(self): + """In a coarse+fine bundle, the FINE opt's output is still the + conformer geometry of record (= ``xyz``), not the coarse output.""" + record = self._coarse_record() + _, _, payload = self._submit(record=record) + primary = payload["conformers"][0]["primary_calculation"] + conformer_xyz = payload["conformers"][0]["geometry"]["xyz_text"] + self.assertEqual(len(primary["output_geometries"]), 1) + entry = primary["output_geometries"][0] + self.assertEqual(entry["geometry"]["xyz_text"], conformer_xyz) + self.assertEqual(entry["role"], "final") + # Cross-check: fine opt's output ≠ coarse opt's output xyz + # (the coarse-output coords from the record). + self.assertNotIn("1.111", entry["geometry"]["xyz_text"]) + + def test_freq_and_sp_have_no_output_geometries_by_default(self): + """freq/sp don't move atoms — we don't claim a separate output + geometry. Backend's new contract drops the auto-fallback for + these calcs, so omission means zero output rows server-side.""" + _, _, payload = self._submit() + for calc in payload["conformers"][0]["additional_calculations"]: + self.assertNotIn( + "output_geometries", calc, + msg=f"{calc['key']} unexpectedly carries output_geometries", + ) + + def test_no_db_ids_in_output_geometries(self): + """Each output_geometries entry must carry only ``geometry`` + (with its own ``xyz_text``) and ``role`` — no DB id keys. + Same DR-0029 self-containment invariant as input_geometries.""" + record = self._coarse_record() + _, _, payload = self._submit(record=record) + forbidden = {"geometry_id", "existing_geometry_id", "id", "calculation_id"} + + def _walk_outputs(calc): + for entry in calc.get("output_geometries", []): + self.assertEqual(set(entry.keys()), {"geometry", "role"}, + msg=f"unexpected keys on output entry: {entry}") + self.assertEqual(set(entry["geometry"].keys()) & forbidden, set(), + msg=f"DB id key in geometry payload: {entry['geometry']}") + + _walk_outputs(payload["conformers"][0]["primary_calculation"]) + for calc in payload["conformers"][0]["additional_calculations"]: + _walk_outputs(calc) + + def test_output_geometries_idempotency_key_stable(self): + """Adding ``output_geometries`` doesn't make the idempotency key + depend on transient state — same record → same key, repeatable.""" + out1, _, _ = self._submit() + out2, _, _ = self._submit() + self.assertEqual(out1.idempotency_key, out2.idempotency_key) + # ---------------- 3: freq+sp included when fields exist def test_freq_and_sp_included_when_fields_exist(self): _, _, payload = self._submit() @@ -1174,25 +1616,130 @@ def test_malformed_nasa_skips_block_keeps_scalar(self): self.assertEqual(len(payload["thermo"]["points"]), 3) self.assertTrue(any("NASA block skipped" in m for m in logs.output)) - # ---------------- 9: Cp points - def test_cp_points_map_correctly(self): + # ---------------- 9: thermo points (Cp + H + S + G) + def test_thermo_points_map_correctly(self): _, _, payload = self._submit() points = payload["thermo"]["points"] self.assertEqual([p["temperature_k"] for p in points], [300.0, 400.0, 500.0]) self.assertAlmostEqual(points[0]["cp_j_mol_k"], 33.6) - - # ---------------- 10: thermo source links use local keys + # Enrichment: H/S/G now ride per-point. 
+ self.assertAlmostEqual(points[0]["h_kj_mol"], -230.5) + self.assertAlmostEqual(points[0]["s_j_mol_k"], 285.1) + self.assertAlmostEqual(points[0]["g_kj_mol"], -315.9) + self.assertAlmostEqual(points[2]["h_kj_mol"], -223.5) + self.assertAlmostEqual(points[2]["g_kj_mol"], -375.2) + + def test_thermo_points_h_s_g_omitted_when_absent(self): + """A point that supplies only Cp (e.g., legacy ``cp_data`` rows still + on disk) must round-trip without inventing zeroes for H/S/G.""" + record = _full_record() + record["thermo"]["thermo_points"] = [ + {"temperature_k": 300.0, "cp_j_mol_k": 33.6}, + ] + _, _, payload = self._submit(record=record) + point = payload["thermo"]["points"][0] + self.assertEqual(point["temperature_k"], 300.0) + self.assertAlmostEqual(point["cp_j_mol_k"], 33.6) + self.assertNotIn("h_kj_mol", point) + self.assertNotIn("s_j_mol_k", point) + self.assertNotIn("g_kj_mol", point) + + # ---------------- 10: thermo source links use local keys (full opt/freq/sp triple) def test_thermo_source_calculations_use_local_keys(self): + """All three contributing calcs must be linked: opt (geometry), + freq (modes/ZPE), sp (electronic energy reference). Order is + deterministic — opt, freq, sp — so the bundle hashes stably.""" _, _, payload = self._submit() sources = payload["thermo"]["source_calculations"] self.assertEqual( sources, [ + {"calculation_key": "opt", "role": "opt"}, + {"calculation_key": "freq", "role": "freq"}, + {"calculation_key": "sp", "role": "sp"}, + ], + ) + + # ---------------- 10a: subset coverage — opt only + def test_thermo_source_calculations_opt_only(self): + """A bundle with only the opt calc (no freq, no sp) must still + link opt as the thermo source — useful for thermo-from-opt-only + edge cases (e.g. composite methods that hide freq/sp internally).""" + record = _full_record() + # Strip freq/sp signals so only opt + thermo remain on the record. + for key in ("freq_n_imag", "imag_freq_cm1", "zpe_hartree", + "sp_energy_hartree", "electronic_energy_hartree"): + record.pop(key, None) + _, _, payload = self._submit(record=record) + # Sanity: only opt made it into the bundle's calc namespace. 
+ self.assertEqual(payload["conformers"][0]["additional_calculations"], []) + sources = payload["thermo"]["source_calculations"] + self.assertEqual(sources, [{"calculation_key": "opt", "role": "opt"}]) + + # ---------------- 10b: subset coverage — opt + freq, no sp + def test_thermo_source_calculations_opt_plus_freq(self): + record = _full_record() + record.pop("sp_energy_hartree", None) + record.pop("electronic_energy_hartree", None) + _, _, payload = self._submit(record=record) + addn_keys = [c["key"] for c in payload["conformers"][0]["additional_calculations"]] + self.assertEqual(addn_keys, ["freq"]) + sources = payload["thermo"]["source_calculations"] + self.assertEqual( + sources, + [ + {"calculation_key": "opt", "role": "opt"}, {"calculation_key": "freq", "role": "freq"}, + ], + ) + + # ---------------- 10c: subset coverage — opt + sp, no freq + def test_thermo_source_calculations_opt_plus_sp(self): + record = _full_record() + for key in ("freq_n_imag", "imag_freq_cm1", "zpe_hartree"): + record.pop(key, None) + _, _, payload = self._submit(record=record) + addn_keys = [c["key"] for c in payload["conformers"][0]["additional_calculations"]] + self.assertEqual(addn_keys, ["sp"]) + sources = payload["thermo"]["source_calculations"] + self.assertEqual( + sources, + [ + {"calculation_key": "opt", "role": "opt"}, {"calculation_key": "sp", "role": "sp"}, ], ) + # ---------------- 10d: ordering is deterministic — opt, freq, sp + def test_thermo_source_calculations_ordering_is_deterministic(self): + """Same record submitted twice produces identical source lists. + The fixed (opt, freq, sp) order is what makes the bundle's + idempotency hash stable across runs.""" + out1, _, payload1 = self._submit() + out2, _, payload2 = self._submit() + self.assertEqual( + payload1["thermo"]["source_calculations"], + payload2["thermo"]["source_calculations"], + ) + # And that order is exactly opt, freq, sp. + keys = [s["calculation_key"] for s in payload1["thermo"]["source_calculations"]] + self.assertEqual(keys, ["opt", "freq", "sp"]) + # Order-stability also means the idempotency keys match. + self.assertEqual(out1.idempotency_key, out2.idempotency_key) + + # ---------------- 10e: no DB ids in source_calculations + def test_thermo_source_calculations_have_no_db_ids(self): + """Each source link must carry only ``calculation_key`` + ``role`` — + no ``calculation_id`` / ``existing_calculation_id`` / etc., per + DR-0029 Requirement 1 (bundles are self-contained, local keys only).""" + _, _, payload = self._submit() + sources = payload["thermo"]["source_calculations"] + for src in sources: + self.assertEqual( + set(src.keys()), {"calculation_key", "role"}, + msg=f"unexpected keys on thermo source link: {src}", + ) + # ---------------- 11: payload has no DB ids anywhere def test_payload_has_no_existing_or_source_calculation_id(self): _, _, payload = self._submit() @@ -1423,6 +1970,51 @@ def test_live_upload_failure_marks_failed_and_preserves_payload(self): self.assertEqual(outcome.status, "failed") self.assertIn("HTTP 503", outcome.error) + # ---------------- 19a: HTTP 422 detail body preserved in failure record + def test_http_422_detail_preserved_in_outcome_and_sidecar(self): + # Regression: failures used to flatten to ``HTTP 422`` because + # ``_record_failure`` only stored ``str(exc)``. 
The TCKDB client's + # HTTP-error subclasses carry ``status_code`` / ``response_json`` + # / ``response_text``; the adapter now lifts those into the + # sidecar and the outcome.error so operators can see *why* a + # validation rejection happened. + class _FakeValidationError(RuntimeError): + def __init__(self): + super().__init__("HTTP 422") + self.status_code = 422 + self.response_json = { + "detail": [{ + "type": "value_error", + "loc": ["body", "species", 1, "statmech", "torsions", 0, + "source_scan_calculation_key"], + "msg": ("Value error, source_scan_calculation_key " + "'scan_rotor_0' references undefined " + "calculation_key."), + }], + } + self.response_text = json.dumps(self.response_json) + + client = _StubClient(raise_exc=_FakeValidationError()) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=_fake_output_doc(), + species_record=_full_record(), + ) + self.assertEqual(outcome.status, "failed") + # Outcome.error now carries enough detail to diagnose without + # a manual replay POST. + self.assertIn("HTTP 422", outcome.error) + self.assertIn("scan_rotor_0", outcome.error) + self.assertIn("undefined calculation_key", outcome.error) + # Sidecar has the structured body + status code intact. + sc = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sc["response_status_code"], 422) + self.assertIsInstance(sc["response_body"], dict) + self.assertIn("detail", sc["response_body"]) + # status stays "failed" so reruns / replays know to try again. + self.assertEqual(sc["status"], "failed") + # ---------------- 20: producer reads only output_doc + species_record (mapping arg shape) def test_producer_consumes_only_output_doc_and_record_mappings(self): # Pass plain dicts (no ARC class instances). 
If the adapter ever @@ -1434,5 +2026,3783 @@ def test_producer_consumes_only_output_doc_and_record_mappings(self): self.assertEqual(outcome.status, "uploaded") +def _reaction_record(*, with_kinetics=True, ts_label="TS0"): + """Mimics one entry from output.yml's `reactions:` list.""" + record = { + "label": "CHO + CH4 <=> CH2O + CH3", + "family": "H_Abstraction", + "multiplicity": 2, + "reactant_labels": ["CHO", "CH4"], + "product_labels": ["CH2O", "CH3"], + "ts_label": ts_label, + "kinetics": None, + } + if with_kinetics: + record["kinetics"] = { + "A": 0.204298, + "A_units": "cm^3/(mol*s)", + "n": 4.37949, + "Ea": 78.9012, + "Ea_units": "kJ/mol", + "Tmin_k": 300.0, + "Tmax_k": 3000.0, + "dA": 1.48466, + "dn": 0.0514735, + "dEa": 0.294363, + "dEa_units": "kJ/mol", + "n_data_points": 50, + "tunneling": "Eckart", + } + return record + + +def _reaction_output_doc(*, with_irc=False): + """Output document with the four species, one TS, and one reaction populated.""" + doc = _fake_output_doc() + + def _spc(label, smiles, mult, *, sp_e, freq_n_imag=0): + return { + "label": label, + "smiles": smiles, + "charge": 0, + "multiplicity": mult, + "is_ts": False, + "xyz": "C 0.0 0.0 0.0\nH 1.0 0.0 0.0", + "opt_n_steps": 10, + "opt_final_energy_hartree": -100.0, + "opt_converged": True, + "freq_n_imag": freq_n_imag, + "zpe_hartree": 0.02, + "sp_energy_hartree": sp_e, + "ess_versions": {"opt": "Gaussian 16, Revision A.03"}, + } + + doc["species"] = [ + _spc("CHO", "[CH]=O", 2, sp_e=-113.7), + _spc("CH4", "C", 1, sp_e=-40.5), + _spc("CH2O", "C=O", 1, sp_e=-114.5), + _spc("CH3", "[CH3]", 2, sp_e=-39.8), + ] + ts = { + "label": "TS0", + "smiles": None, + "charge": 0, + "multiplicity": 2, + "is_ts": True, + "xyz": "C 0.0 0.0 0.0\nH 1.0 0.0 0.0\nH 0.5 0.5 0.0", + "opt_n_steps": 25, + "opt_final_energy_hartree": -154.1, + "opt_converged": True, + "freq_n_imag": 1, + "imag_freq_cm1": -1320.5, + "zpe_hartree": 0.05, + "sp_energy_hartree": -154.7, + "ess_versions": {"opt": "Gaussian 16, Revision A.03"}, + } + if with_irc: + ts["irc_logs"] = ["irc_forward.log", "irc_reverse.log"] + ts["irc_converged"] = True + doc["transition_states"] = [ts] + doc["reactions"] = [_reaction_record()] + return doc + + +# --------------------------------------------------------------------------- +# Computed-reaction tests +# --------------------------------------------------------------------------- + + +class TestKineticsUnitMapping(unittest.TestCase): + """Test 1: ARC kinetics unit strings map to TCKDB enum strings.""" + + def test_arrhenius_units(self): + self.assertEqual(arc_to_tckdb_a_units("cm^3/(mol*s)"), "cm3_mol_s") + self.assertEqual(arc_to_tckdb_a_units("s^-1"), "per_s") + self.assertEqual(arc_to_tckdb_a_units("cm^3/(molecule*s)"), "cm3_molecule_s") + + def test_arrhenius_units_case_insensitive(self): + self.assertEqual(arc_to_tckdb_a_units("CM^3/(MOL*S)"), "cm3_mol_s") + self.assertEqual(arc_to_tckdb_a_units(" s^-1 "), "per_s") + + def test_arrhenius_units_unknown(self): + self.assertIsNone(arc_to_tckdb_a_units("bushels/fortnight")) + + def test_arrhenius_units_none(self): + self.assertIsNone(arc_to_tckdb_a_units(None)) + self.assertIsNone(arc_to_tckdb_a_units("")) + + def test_activation_energy_units(self): + self.assertEqual(arc_to_tckdb_ea_units("kJ/mol"), "kj_mol") + self.assertEqual(arc_to_tckdb_ea_units("kcal/mol"), "kcal_mol") + self.assertEqual(arc_to_tckdb_ea_units("J/mol"), "j_mol") + self.assertEqual(arc_to_tckdb_ea_units("cal/mol"), "cal_mol") + + def test_activation_energy_units_unknown(self): + 
+        self.assertIsNone(arc_to_tckdb_ea_units("erg/molecule"))
+
+
+class TestComputedSpeciesStatmechFreqScaleFactor(unittest.TestCase):
+    """Statmech frequency-scale-factor provenance in computed-species bundles."""
+
+    def setUp(self):
+        self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-fsf-")
+        self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True)
+        self.cfg = TCKDBConfig(
+            enabled=True,
+            base_url="http://localhost:8000/api/v1",
+            payload_dir=self.tmp,
+            api_key_env="X_TCKDB_API_KEY",
+            project_label="proj-A",
+            upload_mode="computed_species",
+        )
+
+    def _adapter(self, client):
+        return TCKDBAdapter(self.cfg, client_factory=lambda c, k: client)
+
+    def _build_payload(self, output_doc):
+        client = _StubClient(response=_StubResponse({
+            "species_entry_id": 1,
+            "conformers": [{
+                "key": "conf0",
+                "primary_calculation": {"key": "opt", "calculation_id": 100, "type": "opt", "role": "primary"},
+                "additional_calculations": [],
+            }],
+        }))
+        adapter = self._adapter(client)
+        with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}):
+            outcome = adapter.submit_computed_species_from_output(
+                output_doc=output_doc, species_record=_full_record(),
+            )
+        return outcome, json.loads(outcome.payload_path.read_text())
+
+    @staticmethod
+    def _doc_with_fsf(*, value=0.961, source="J. Chem. Theory Comput. 2010, 6, 2872",
+                      include_freq_level=True):
+        doc = _fake_output_doc()
+        doc["freq_scale_factor"] = value
+        doc["freq_scale_factor_source"] = source
+        if include_freq_level:
+            doc["freq_level"] = {
+                "method": "wb97xd", "basis": "def2-tzvp", "software": "gaussian",
+            }
+        return doc
+
+    # ---------------- 1: FSF emitted under statmech when value present
+    def test_statmech_emitted_with_freq_scale_factor(self):
+        _, payload = self._build_payload(self._doc_with_fsf())
+        self.assertIn("statmech", payload)
+        sm = payload["statmech"]
+        self.assertIn("freq_scale_factor", sm)
+        fsf = sm["freq_scale_factor"]
+        self.assertAlmostEqual(fsf["value"], 0.961)
+        self.assertEqual(fsf["scale_kind"], "fundamental")
+        self.assertEqual(fsf["level_of_theory"]["method"], "wb97xd")
+        self.assertEqual(fsf["level_of_theory"]["basis"], "def2-tzvp")
+        self.assertEqual(fsf["software"], {"name": "gaussian"})
+
+    # ---------------- 2: bare citation lands in note, never source_literature
+    def test_bare_citation_string_maps_to_note_not_literature(self):
+        citation = "J. Chem. Theory Comput. 2010, 6, 2872, DOI: 10.1021/ct100326h"
+        _, payload = self._build_payload(self._doc_with_fsf(source=citation))
+        fsf = payload["statmech"]["freq_scale_factor"]
+        self.assertEqual(fsf["note"], citation)
+        self.assertNotIn("source_literature", fsf)
+
+    # ---------------- 3: ARC tagged as workflow_tool_release only when sourced from ARC's data
+    def test_arc_workflow_tool_release_set_when_arc_data_file_was_source(self):
+        _, payload = self._build_payload(self._doc_with_fsf(source="some citation"))
+        fsf = payload["statmech"]["freq_scale_factor"]
+        self.assertEqual(fsf["workflow_tool_release"]["name"], "ARC")
+
+    def test_workflow_tool_release_omitted_when_user_supplied(self):
+        # No source string → user-supplied factor → ARC must not claim
+        # itself as the proximate source (would fork registry rows
+        # since workflow_tool_release is part of FSF identity).
+        doc = self._doc_with_fsf(source=None)
+        _, payload = self._build_payload(doc)
+        fsf = payload["statmech"]["freq_scale_factor"]
+        self.assertNotIn("workflow_tool_release", fsf)
+        # And no note either, since note carried only the citation.
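+        # (Assumed provenance guard, sketched from the behavior pinned
+        #  here and in test 3 above:
+        #
+        #      if source:
+        #          fsf["note"] = source
+        #          fsf["workflow_tool_release"] = {"name": "ARC", ...}
+        #
+        #  with a falsy source, neither key is ever set.)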
+ self.assertNotIn("note", fsf) + + # ---------------- 4: missing FSF cleanly omits the entire statmech block + def test_missing_freq_scale_factor_omits_statmech(self): + # Default fixture has no freq_scale_factor — must look identical + # to the pre-change behavior (no statmech at all). + doc = _fake_output_doc() + self.assertNotIn("freq_scale_factor", doc) + client = _StubClient(response=_StubResponse({ + "species_entry_id": 1, + "conformers": [{"key": "conf0", "primary_calculation": {"key": "opt", "calculation_id": 1, "type": "opt", "role": "primary"}, "additional_calculations": []}], + })) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=doc, species_record=_full_record(), + ) + payload = json.loads(outcome.payload_path.read_text()) + self.assertNotIn("statmech", payload) + + def test_zero_or_negative_fsf_is_rejected_silently(self): + # Schema requires gt 0; producer must not emit a registry- + # poisoning value. Falls through to "no FSF" behavior. + doc = self._doc_with_fsf(value=0.0) + _, payload = self._build_payload(doc) + self.assertNotIn("statmech", payload) + + def test_missing_method_on_freq_level_omits_fsf(self): + # LevelOfTheoryRef.method is required; if ARC's freq_level + # entry is malformed (no method), the producer can't build a + # well-formed FSF ref → omit cleanly rather than fail. + doc = self._doc_with_fsf(include_freq_level=False) + # Also strip arkane_level_of_theory so neither LOT source resolves. + doc.pop("arkane_level_of_theory", None) + _, payload = self._build_payload(doc) + self.assertNotIn("statmech", payload) + + def test_arkane_level_of_theory_used_as_lot_fallback(self): + # When freq_level is absent, the producer falls back to + # arkane_level_of_theory — common in single-LOT runs. + doc = self._doc_with_fsf(include_freq_level=False) + doc["arkane_level_of_theory"] = { + "method": "ccsd(t)-f12", "basis": "cc-pvtz-f12", "software": "molpro", + } + _, payload = self._build_payload(doc) + fsf = payload["statmech"]["freq_scale_factor"] + self.assertEqual(fsf["level_of_theory"]["method"], "ccsd(t)-f12") + self.assertEqual(fsf["software"], {"name": "molpro"}) + + # ---------------- 5: source_calculations use local keys, exclude opt_coarse + def test_source_calculations_use_local_keys(self): + _, payload = self._build_payload(self._doc_with_fsf()) + sm = payload["statmech"] + self.assertIn("source_calculations", sm) + # _full_record() carries opt + freq + sp (no coarse), so all + # three roles should resolve. + by_role = {entry["role"]: entry["calculation_key"] for entry in sm["source_calculations"]} + self.assertEqual(by_role, {"opt": "opt", "freq": "freq", "sp": "sp"}) + + def test_source_calculations_exclude_opt_coarse(self): + # Build a record with the coarse-opt stage so opt_coarse lands + # in included_calc_keys; statmech.source_calculations must + # still emit only opt/freq/sp (no opt_coarse role exists in + # the StatmechCalculationRole enum). 
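+        # (One plausible filter that satisfies this, sketched: the roles
+        #  come from a fixed allowlist intersected with the calcs that
+        #  were actually emitted, so opt_coarse can never leak in:
+        #
+        #      source_calculations = [
+        #          {"role": r, "calculation_key": r}
+        #          for r in ("opt", "freq", "sp") if r in included_calc_keys
+        #      ]
+        #  )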
+ record = _full_record() + record["coarse_opt_log"] = "coarse.log" + record["coarse_opt_output_xyz"] = "C 0 0 0\nH 1 0 0" + record["coarse_opt_n_steps"] = 5 + record["coarse_opt_final_energy_hartree"] = -154.0 + client = _StubClient(response=_StubResponse({ + "species_entry_id": 1, + "conformers": [{"key": "conf0", "primary_calculation": {"key": "opt", "calculation_id": 1, "type": "opt", "role": "primary"}, "additional_calculations": []}], + })) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=self._doc_with_fsf(), species_record=record, + ) + payload = json.loads(outcome.payload_path.read_text()) + sources = payload["statmech"]["source_calculations"] + keys = {entry["calculation_key"] for entry in sources} + self.assertNotIn("opt_coarse", keys) + self.assertEqual(keys, {"opt", "freq", "sp"}) + + def test_source_calculations_only_include_emitted_keys(self): + # Strip sp from the record so only opt + freq emit; statmech + # source list must shrink in lockstep. + record = _full_record() + record["sp_energy_hartree"] = None + record.pop("electronic_energy_hartree", None) + client = _StubClient(response=_StubResponse({ + "species_entry_id": 1, + "conformers": [{"key": "conf0", "primary_calculation": {"key": "opt", "calculation_id": 1, "type": "opt", "role": "primary"}, "additional_calculations": []}], + })) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=self._doc_with_fsf(), species_record=record, + ) + payload = json.loads(outcome.payload_path.read_text()) + roles = [entry["role"] for entry in payload["statmech"]["source_calculations"]] + self.assertEqual(roles, ["opt", "freq"]) + + # ---------------- 6: idempotency — same content → same key + def test_payload_with_fsf_is_deterministic(self): + outcome1, payload1 = self._build_payload(self._doc_with_fsf()) + outcome2, payload2 = self._build_payload(self._doc_with_fsf()) + self.assertEqual(payload1, payload2) + self.assertEqual(outcome1.idempotency_key, outcome2.idempotency_key) + + def test_idempotency_key_changes_when_fsf_value_changes(self): + outcome1, _ = self._build_payload(self._doc_with_fsf(value=0.961)) + outcome2, _ = self._build_payload(self._doc_with_fsf(value=0.999)) + self.assertNotEqual(outcome1.idempotency_key, outcome2.idempotency_key) + + # ---------------- 7: live TCKDB schema validation + def test_payload_validates_against_tckdb_schema(self): + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_species_upload import ( + ComputedSpeciesUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, payload = self._build_payload(self._doc_with_fsf()) + # Sanity: the field is populated, not just bypassed. 
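+        # (Note: pydantic v2's ``model_validate`` raises ValidationError
+        #  on any structural mismatch, so schema drift fails loudly here.)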
+ self.assertTrue(payload["statmech"]["freq_scale_factor"]["value"]) + ComputedSpeciesUploadRequest.model_validate(payload) + + +class TestComputedSpeciesStatmechBaseFields(unittest.TestCase): + """Computed-species statmech: richer base fields (sym, point group, rotor kind, treatment, torsions).""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-stm-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_species", + ) + + def _adapter(self, client): + return TCKDBAdapter(self.cfg, client_factory=lambda c, k: client) + + @staticmethod + def _record_with_statmech(**overrides): + record = _full_record() + # Mirrors what arc/output.py::_statmech_to_dict writes for a + # converged non-monoatomic species. ARC's emitted fields that + # don't map onto TCKDB statmech (e0_kj_mol, optical_isomers, + # spin_multiplicity, harmonic_frequencies_cm1) are kept here so + # the test fixture matches output.yml shape; the producer is + # expected to ignore them. + record["statmech"] = { + "e0_kj_mol": 12.5, + "spin_multiplicity": 1, + "optical_isomers": 1, + "is_linear": False, + "external_symmetry": 2, + "point_group": "C2v", + "rigid_rotor_kind": "asymmetric_top", + "harmonic_frequencies_cm1": [3000.0, 1500.0, 800.0], + "torsions": [ + { + "symmetry_number": 3, + "treatment": "hindered_rotor", + "atom_indices": [1, 2, 3, 4], + "pivot_atoms": [2, 3], + "barrier_kj_mol": 12.0, + }, + ], + } + record["statmech"].update(overrides.pop("statmech", {})) + record.update(overrides) + return record + + def _submit(self, *, doc, record): + client = _StubClient(response=_StubResponse({ + "species_entry_id": 1, + "conformers": [{ + "key": "conf0", + "primary_calculation": { + "key": "opt", "calculation_id": 1, "type": "opt", "role": "primary", + }, + "additional_calculations": [], + }], + })) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=doc, species_record=record, + ) + return outcome, json.loads(outcome.payload_path.read_text()) + + def _doc(self): + # Reuse the FSF doc helper from the FSF test class — ensures + # the existing FSF wiring keeps working alongside the new + # base-fields wiring. 
+ return TestComputedSpeciesStatmechFreqScaleFactor._doc_with_fsf() + + # ---------------- 1: external_symmetry + def test_external_symmetry_emitted(self): + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertEqual(payload["statmech"]["external_symmetry"], 2) + + # ---------------- 2: point_group (computed-species only) + def test_point_group_emitted_in_computed_species(self): + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertEqual(payload["statmech"]["point_group"], "C2v") + + # ---------------- 3: is_linear + def test_is_linear_emitted(self): + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertIs(payload["statmech"]["is_linear"], False) + + # ---------------- 4: rigid_rotor_kind + def test_rigid_rotor_kind_emitted(self): + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertEqual(payload["statmech"]["rigid_rotor_kind"], "asymmetric_top") + + # ---------------- 5: statmech_treatment derivation + def test_statmech_treatment_rrho_when_no_torsions(self): + record = self._record_with_statmech() + record["statmech"]["torsions"] = [] + _, payload = self._submit(doc=self._doc(), record=record) + self.assertEqual(payload["statmech"]["statmech_treatment"], "rrho") + + def test_statmech_treatment_rrho_1d_for_1d_rotors(self): + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertEqual(payload["statmech"]["statmech_treatment"], "rrho_1d") + + def test_statmech_treatment_rrho_nd_for_nd_only(self): + record = self._record_with_statmech() + record["statmech"]["torsions"] = [{ + "symmetry_number": 1, + "treatment": "hindered_rotor", + # ND scan: list of 4-int lists (ARC's directed_scan shape). + "atom_indices": [[1, 2, 3, 4], [5, 2, 3, 6]], + "pivot_atoms": [2, 3], + }] + _, payload = self._submit(doc=self._doc(), record=record) + self.assertEqual(payload["statmech"]["statmech_treatment"], "rrho_nd") + + def test_statmech_treatment_rrho_1d_nd_for_mixed(self): + record = self._record_with_statmech() + record["statmech"]["torsions"] = [ + {"symmetry_number": 3, "treatment": "hindered_rotor", + "atom_indices": [1, 2, 3, 4], "pivot_atoms": [2, 3]}, + {"symmetry_number": 1, "treatment": "hindered_rotor", + "atom_indices": [[5, 6, 7, 8], [9, 6, 7, 10]], "pivot_atoms": [6, 7]}, + ] + _, payload = self._submit(doc=self._doc(), record=record) + self.assertEqual(payload["statmech"]["statmech_treatment"], "rrho_1d_nd") + + def test_statmech_treatment_omitted_when_no_statmech_subdict(self): + # Without a statmech subdict, ARC didn't run an RRHO evaluation + # for this species — the spec forbids fabricating "rrho" in that + # case. The treatment field must be absent. + record = _full_record() + self.assertNotIn("statmech", record) + _, payload = self._submit(doc=self._doc(), record=record) + # FSF block still present; no statmech_treatment. + self.assertNotIn("statmech_treatment", payload["statmech"]) + + # ---------------- 6: FSF behavior preserved + def test_freq_scale_factor_behavior_preserved(self): + # Sanity: the FSF block remains fully populated even when the + # richer base-fields machinery runs. 
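+        # (For reference, the derivation rule the treatment tests above
+        #  pin, as implied: no torsions → "rrho"; 1D-only → "rrho_1d";
+        #  ND-only → "rrho_nd"; mixed → "rrho_1d_nd"; no statmech
+        #  subdict at all → the field is omitted entirely.)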
+ _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertIn("freq_scale_factor", payload["statmech"]) + self.assertAlmostEqual(payload["statmech"]["freq_scale_factor"]["value"], 0.961) + + # ---------------- 7: source_calculations still emitted (computed-species) + def test_source_calculations_present_in_computed_species(self): + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertIn("source_calculations", payload["statmech"]) + roles = {e["role"] for e in payload["statmech"]["source_calculations"]} + self.assertEqual(roles, {"opt", "freq", "sp"}) + + # ---------------- 14/15: slim torsion shape only + def test_slim_torsions_emitted(self): + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + torsions = payload["statmech"]["torsions"] + self.assertEqual(len(torsions), 1) + t = torsions[0] + self.assertEqual(t["torsion_index"], 1) + self.assertEqual(t["symmetry_number"], 3) + self.assertEqual(t["treatment_kind"], "hindered_rotor") + + def test_slim_torsions_omit_only_unsupported_fields(self): + # The bundle schema accepts structured ``coordinates`` with + # atom1_index..atom4_index quartets and a ``dimension`` field, + # and now (since ARC emits scan calcs in the bundle) also + # ``source_scan_calculation_key``. Other ARC-side fields + # (``pivot_atoms``, ``barrier_kj_mol``, the raw ARC-shape + # ``atom_indices``) still have no destination column. + # ``source_scan_calculation_id`` is server-assigned and must + # never come from the producer side. This test's fixture + # carries no scan key, so the key is correctly absent here — + # see TestScanCalculations for the positive-pass-through case. + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + for t in payload["statmech"]["torsions"]: + for forbidden in ( + "atom_indices", # ARC's input shape — never the output shape + "pivot_atoms", "barrier_kj_mol", + "source_scan_calculation_id", + ): + self.assertNotIn(forbidden, t) + # No fixture key in this test, so absent in payload. + self.assertNotIn("source_scan_calculation_key", t) + + def test_slim_torsions_skip_unsupported_treatment(self): + # ARC could (now or in the future) emit a torsion with a + # treatment that isn't in TCKDB's TorsionTreatmentKind enum. + # The producer must drop those entries rather than emit an + # entry with a missing/invalid treatment_kind. + record = self._record_with_statmech() + record["statmech"]["torsions"] = [ + {"symmetry_number": 3, "treatment": "hindered_rotor", + "atom_indices": [1, 2, 3, 4], "pivot_atoms": [2, 3]}, + {"symmetry_number": 1, "treatment": "experimental_rotor_xyz", + "atom_indices": [5, 6, 7, 8], "pivot_atoms": [6, 7]}, + ] + _, payload = self._submit(doc=self._doc(), record=record) + torsions = payload["statmech"]["torsions"] + self.assertEqual(len(torsions), 1) + self.assertEqual(torsions[0]["treatment_kind"], "hindered_rotor") + # torsion_index runs over emitted entries, not source entries. + self.assertEqual(torsions[0]["torsion_index"], 1) + + # ---------------- coordinate definitions (1D, ND, malformed) + def test_torsion_coordinates_emitted_for_1d(self): + # Spec example: atom_indices=[5,1,2,3] should emit dimension=1 + # with one coordinate carrying atom1..atom4_index in order. 
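+        # (Assumed projection, sketched: each 4-int quartet [a1, a2, a3, a4]
+        #  becomes
+        #
+        #      {"coordinate_index": i, "atom1_index": a1, "atom2_index": a2,
+        #       "atom3_index": a3, "atom4_index": a4}
+        #
+        #  and ``dimension`` is the number of quartets.)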
+ record = self._record_with_statmech() + record["statmech"]["torsions"] = [{ + "symmetry_number": 3, "treatment": "hindered_rotor", + "atom_indices": [5, 1, 2, 3], "pivot_atoms": [1, 2], + "barrier_kj_mol": 11.99, + }] + _, payload = self._submit(doc=self._doc(), record=record) + torsions = payload["statmech"]["torsions"] + self.assertEqual(len(torsions), 1) + t = torsions[0] + self.assertEqual(t["dimension"], 1) + self.assertEqual(len(t["coordinates"]), 1) + c = t["coordinates"][0] + self.assertEqual(c, { + "coordinate_index": 1, + "atom1_index": 5, "atom2_index": 1, + "atom3_index": 2, "atom4_index": 3, + }) + + def test_torsion_coordinate_atom_indices_preserved_1based(self): + # Indices must round-trip exactly — no off-by-one, no + # canonicalization, no sort. + record = self._record_with_statmech() + record["statmech"]["torsions"] = [{ + "symmetry_number": 1, "treatment": "free_rotor", + "atom_indices": [9, 4, 7, 12], "pivot_atoms": [4, 7], + }] + _, payload = self._submit(doc=self._doc(), record=record) + c = payload["statmech"]["torsions"][0]["coordinates"][0] + self.assertEqual( + (c["atom1_index"], c["atom2_index"], c["atom3_index"], c["atom4_index"]), + (9, 4, 7, 12), + ) + # All ≥ 1 (server-side CheckConstraint). + for i in range(1, 5): + self.assertGreaterEqual(c[f"atom{i}_index"], 1) + + def test_torsion_coordinates_emitted_for_nd(self): + # ND shape: ARC's directed_scan rotor has atom_indices as a + # list of 4-int sub-lists. The producer emits one coordinate + # per sub-list with contiguous coordinate_index values. + record = self._record_with_statmech() + record["statmech"]["torsions"] = [{ + "symmetry_number": 1, "treatment": "hindered_rotor", + "atom_indices": [[1, 2, 3, 4], [5, 2, 3, 6]], + "pivot_atoms": [2, 3], + }] + _, payload = self._submit(doc=self._doc(), record=record) + t = payload["statmech"]["torsions"][0] + self.assertEqual(t["dimension"], 2) + self.assertEqual(len(t["coordinates"]), 2) + self.assertEqual(t["coordinates"][0]["coordinate_index"], 1) + self.assertEqual(t["coordinates"][1]["coordinate_index"], 2) + self.assertEqual(t["coordinates"][0]["atom1_index"], 1) + self.assertEqual(t["coordinates"][1]["atom1_index"], 5) + + def test_torsion_with_missing_atom_indices_emits_summary_only(self): + # No atom_indices at all → emit symmetry/treatment but no + # coordinates and no dimension override. Must not fail upload. + record = self._record_with_statmech() + record["statmech"]["torsions"] = [{ + "symmetry_number": 3, "treatment": "hindered_rotor", + "pivot_atoms": [2, 3], # no atom_indices key at all + }] + _, payload = self._submit(doc=self._doc(), record=record) + torsions = payload["statmech"]["torsions"] + self.assertEqual(len(torsions), 1) + t = torsions[0] + self.assertEqual(t["torsion_index"], 1) + self.assertEqual(t["symmetry_number"], 3) + self.assertEqual(t["treatment_kind"], "hindered_rotor") + self.assertNotIn("coordinates", t) + self.assertNotIn("dimension", t) + + def test_torsion_with_malformed_atom_indices_emits_summary_only(self): + # Length != 4, non-positive ints, or duplicate atoms → producer + # logs a warning and falls back to summary-only. Asserting + # against multiple bad-input shapes pins the same fallback for + # all of them. 
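+        # (A validity predicate consistent with every case below, sketched:
+        #
+        #      def _valid_quartet(ix):
+        #          return (isinstance(ix, (list, tuple)) and len(ix) == 4
+        #                  and all(isinstance(i, int) and i >= 1 for i in ix)
+        #                  and len(set(ix)) == 4)
+        #  )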
+ for bad_indices in ([1, 2, 3], [1, 2, 3, 4, 5], [0, 1, 2, 3], + [-1, 2, 3, 4], [1, 1, 2, 3], "1234", None): + record = self._record_with_statmech() + record["statmech"]["torsions"] = [{ + "symmetry_number": 3, "treatment": "hindered_rotor", + "atom_indices": bad_indices, + }] + _, payload = self._submit(doc=self._doc(), record=record) + torsions = payload["statmech"]["torsions"] + # Always one torsion emitted (fallback, not skip). + self.assertEqual( + len(torsions), 1, + f"bad atom_indices={bad_indices!r} dropped the torsion entirely", + ) + self.assertNotIn( + "coordinates", torsions[0], + f"bad atom_indices={bad_indices!r} produced coordinates anyway", + ) + + def test_torsion_source_scan_calculation_key_omitted(self): + # ARC doesn't yet emit `type=scan` calcs in the bundle, so + # there's no in-bundle key for source_scan_calculation_key to + # reference. The server-side validator rejects keys that don't + # resolve to a real bundle calc; producer must omit the field + # rather than fabricate one. + record = self._record_with_statmech() + record["statmech"]["torsions"] = [{ + "symmetry_number": 3, "treatment": "hindered_rotor", + "atom_indices": [5, 1, 2, 3], "pivot_atoms": [1, 2], + # ARC has scan_path on rotor dicts, but the producer must + # not turn that into a bundle key out of thin air. + "scan_path": "calcs/Species/x/scan_a45/output.log", + }] + _, payload = self._submit(doc=self._doc(), record=record) + t = payload["statmech"]["torsions"][0] + self.assertNotIn("source_scan_calculation_key", t) + self.assertNotIn("source_scan_calculation_id", t) + + # ---------------- 16: unknown enum values omitted + def test_invalid_rigid_rotor_kind_omitted(self): + record = self._record_with_statmech() + record["statmech"]["rigid_rotor_kind"] = "futuristic_top" + _, payload = self._submit(doc=self._doc(), record=record) + self.assertNotIn("rigid_rotor_kind", payload["statmech"]) + + def test_invalid_external_symmetry_omitted(self): + # ge=1 in both ORM (CheckConstraint) and Pydantic Field. + record = self._record_with_statmech() + record["statmech"]["external_symmetry"] = 0 + _, payload = self._submit(doc=self._doc(), record=record) + self.assertNotIn("external_symmetry", payload["statmech"]) + + def test_optical_isomers_not_routed_to_statmech(self): + # Spec: TCKDB has no statmech column for optical_isomers; ARC + # must not invent one or route it elsewhere within statmech. + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertNotIn("optical_isomers", payload["statmech"]) + + def test_uses_projected_frequencies_omitted(self): + # ARC doesn't reliably record this; the field must stay absent. + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + self.assertNotIn("uses_projected_frequencies", payload["statmech"]) + + # ---------------- 17: live schema validation + def test_payload_validates_against_tckdb_schema(self): + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_species_upload import ( + ComputedSpeciesUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, payload = self._submit(doc=self._doc(), record=self._record_with_statmech()) + ComputedSpeciesUploadRequest.model_validate(payload) + + +class TestScanCalculations(unittest.TestCase): + """Computed-species: rotor-scan additional_calculations + statmech link. 
+ + Pins the contract between ``arc/output.py`` (which writes + ``species_record["additional_calculations"]`` and + ``statmech.torsions[i]["source_scan_calculation_key"]``) and the + TCKDB adapter (which projects those onto the bundle). + """ + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-scan-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_species", + ) + + def _adapter(self, client): + return TCKDBAdapter(self.cfg, client_factory=lambda c, k: client) + + @staticmethod + def _scan_result_payload(): + """A minimal but TCKDB-valid scan_result dict (1D dihedral, 3 points).""" + return { + "dimension": 1, + "is_relaxed": True, + "zero_energy_reference_hartree": -154.123456, + "coordinates": [{ + "coordinate_index": 1, + "coordinate_kind": "dihedral", + "atom1_index": 5, + "atom2_index": 1, + "atom3_index": 2, + "atom4_index": 3, + "step_count": 3, + "value_unit": "degree", + "symmetry_number": 3, + }], + "points": [ + { + "point_index": 1, + "electronic_energy_hartree": -154.123456, + "relative_energy_kj_mol": 0.0, + "coordinate_values": [{ + "coordinate_index": 1, "coordinate_value": 0.0, + "value_unit": "degree", + }], + }, + { + "point_index": 2, + "electronic_energy_hartree": -154.121456, + "relative_energy_kj_mol": 5.25, + "coordinate_values": [{ + "coordinate_index": 1, "coordinate_value": 120.0, + "value_unit": "degree", + }], + }, + { + "point_index": 3, + "electronic_energy_hartree": -154.122456, + "relative_energy_kj_mol": 2.62, + "coordinate_values": [{ + "coordinate_index": 1, "coordinate_value": 240.0, + "value_unit": "degree", + }], + }, + ], + } + + def _record_with_scan(self, *, with_torsion_link=True, scan_key="scan_rotor_0"): + record = _full_record() + # statmech torsions: one 1D rotor whose source_scan_calculation_key + # points at the bundle-local scan calc emitted below. 
+ torsion = { + "symmetry_number": 3, + "treatment": "hindered_rotor", + "atom_indices": [5, 1, 2, 3], + "pivot_atoms": [1, 2], + "barrier_kj_mol": 5.25, + } + if with_torsion_link: + torsion["source_scan_calculation_key"] = scan_key + record["statmech"] = { + "is_linear": False, + "external_symmetry": 1, + "rigid_rotor_kind": "asymmetric_top", + "harmonic_frequencies_cm1": [3000.0, 1500.0, 800.0], + "torsions": [torsion], + } + record["additional_calculations"] = [{ + "key": scan_key, + "type": "scan", + "scan_result": self._scan_result_payload(), + }] + return record + + def _doc(self): + return TestComputedSpeciesStatmechFreqScaleFactor._doc_with_fsf() + + def _submit(self, *, record, doc=None): + client = _StubClient(response=_StubResponse({ + "species_entry_id": 1, + "conformers": [{ + "key": "conf0", + "primary_calculation": { + "key": "opt", "calculation_id": 1, "type": "opt", "role": "primary", + }, + "additional_calculations": [], + }], + })) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=doc or self._doc(), species_record=record, + ) + return outcome, client, json.loads(outcome.payload_path.read_text()) + + # ---- 3: adapter emits a type=scan calc with scan_result + def test_scan_calc_emitted_in_additional_calculations(self): + _, _, payload = self._submit(record=self._record_with_scan()) + adds = payload["conformers"][0]["additional_calculations"] + scans = [c for c in adds if c.get("type") == "scan"] + self.assertEqual(len(scans), 1) + scan = scans[0] + self.assertEqual(scan["key"], "scan_rotor_0") + self.assertEqual(scan["type"], "scan") + self.assertIn("scan_result", scan) + # scan_result round-tripped intact. + self.assertEqual(scan["scan_result"]["dimension"], 1) + self.assertEqual(len(scan["scan_result"]["points"]), 3) + + def test_scan_calc_uses_opt_level_fallback(self): + # No ``scan_level`` exists on rotors_dict — adapter must fall + # back to opt level + opt software (same fallback used for + # freq/sp when *_level is null on the doc). + _, _, payload = self._submit(record=self._record_with_scan()) + scan = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["type"] == "scan") + self.assertEqual(scan["level_of_theory"]["method"], "wb97xd") + self.assertEqual(scan["software_release"]["name"], "gaussian") + + def test_scan_calc_depends_on_opt(self): + _, _, payload = self._submit(record=self._record_with_scan()) + scan = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["type"] == "scan") + # Role must match the TCKDB ``DependencyRole`` enum value for + # scan→opt edges; using a producer-side string would 422. + self.assertEqual(scan["depends_on"], + [{"parent_calculation_key": "opt", "role": "scan_parent"}]) + + # ---- 4: statmech torsion linked via source_scan_calculation_key + def test_statmech_torsion_links_to_scan_key(self): + _, _, payload = self._submit(record=self._record_with_scan()) + torsions = payload["statmech"]["torsions"] + self.assertEqual(len(torsions), 1) + self.assertEqual(torsions[0]["source_scan_calculation_key"], "scan_rotor_0") + + def test_statmech_torsion_has_no_link_when_record_omits_key(self): + record = self._record_with_scan(with_torsion_link=False) + # Drop the matching scan calc too — the torsion has no upstream + # to point at, so producer correctly emits nothing. 
+ record["additional_calculations"] = [] + _, _, payload = self._submit(record=record) + torsion = payload["statmech"]["torsions"][0] + self.assertNotIn("source_scan_calculation_key", torsion) + + def test_no_scan_calcs_when_additional_calculations_empty(self): + record = self._record_with_scan() + record["additional_calculations"] = [] + _, _, payload = self._submit(record=record) + adds = payload["conformers"][0]["additional_calculations"] + self.assertEqual([c["type"] for c in adds if c.get("type") == "scan"], []) + + def test_unknown_calc_type_in_additional_calculations_skipped(self): + # If the producer adds a future calc type before the adapter + # learns about it, the unknown type is dropped rather than + # uploaded blind. + record = self._record_with_scan() + record["additional_calculations"].append({ + "key": "future_thing", "type": "future", "future_result": {}, + }) + _, _, payload = self._submit(record=record) + types = [c.get("type") for c in payload["conformers"][0]["additional_calculations"]] + # Only the recognized types pass through. + self.assertNotIn("future", types) + self.assertIn("scan", types) + + def test_malformed_scan_calc_skipped_no_crash(self): + # Missing scan_result / non-string key / non-dict scan_result. + record = self._record_with_scan() + record["additional_calculations"] = [ + {"key": "scan_rotor_0", "type": "scan"}, # no scan_result + {"key": "", "type": "scan", "scan_result": {"dimension": 1}}, # empty key + {"key": "scan_rotor_2", "type": "scan", "scan_result": "oops"},# wrong shape + ] + # Drop torsion link so we don't expect a scan to back it. + record["statmech"]["torsions"][0].pop("source_scan_calculation_key", None) + _, _, payload = self._submit(record=record) + scans = [c for c in payload["conformers"][0]["additional_calculations"] + if c.get("type") == "scan"] + self.assertEqual(scans, []) # nothing usable, nothing emitted + + # ---- 5: payload validates against the live TCKDB schema + def test_payload_validates_against_live_schema(self): + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_species_upload import ( + ComputedSpeciesUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, _, payload = self._submit(record=self._record_with_scan()) + ComputedSpeciesUploadRequest.model_validate(payload) + + # ---- 6: per-point scan geometries flow through unchanged. + # + # The producer (``arc/output.py::_build_scan_result_for_rotor``) + # attaches ``geometry.xyz_text`` to each scan point when aligned + # geometries are available. The adapter must pass that through + # unchanged so TCKDB can resolve it into ``calc_scan_point.geometry_id``. + + def _record_with_scan_geometries(self): + record = self._record_with_scan() + scan_calc = record["additional_calculations"][0] + # Inject one geometry per scan point — TCKDB count-headered shape. + for i, point in enumerate(scan_calc["scan_result"]["points"]): + point["geometry"] = { + "xyz_text": f"2\n\nC 0.0 0.0 {0.1 * i}\nH 1.0 0.0 0.0", + } + return record + + def test_scan_point_geometries_pass_through_to_payload(self): + record = self._record_with_scan_geometries() + _, _, payload = self._submit(record=record) + scan = next(c for c in payload["conformers"][0]["additional_calculations"] + if c["type"] == "scan") + for point in scan["scan_result"]["points"]: + self.assertIn("geometry", point) + self.assertIn("xyz_text", point["geometry"]) + # No DB ids may leak in. 
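+            # (That is: the adapter treats scan_result as opaque JSON,
+            #  a pure pass-through with no re-keying and no ids.)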
+ self.assertNotIn("geometry_id", point) + self.assertNotIn("geometry_id", point["geometry"]) + self.assertEqual(set(point["geometry"].keys()), {"xyz_text"}) + + def test_scan_with_geometries_validates_against_live_schema(self): + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_species_upload import ( + ComputedSpeciesUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, _, payload = self._submit(record=self._record_with_scan_geometries()) + # Server's CalculationScanPointPayload now accepts inline + # ``geometry: GeometryPayload | None`` — the bundle must validate. + ComputedSpeciesUploadRequest.model_validate(payload) + + +class TestComputedReactionStatmechBaseFields(unittest.TestCase): + """Computed-reaction per-species statmech: subset accepted by BundleStatmechIn.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-rxn-stm-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_reaction", + ) + + def _adapter(self, client): + return TCKDBAdapter(self.cfg, client_factory=lambda c, k: client) + + @staticmethod + def _doc_with_statmech_and_fsf(): + doc = _reaction_output_doc() + doc["freq_scale_factor"] = 0.961 + doc["freq_scale_factor_source"] = "J. Chem. Theory Comput. 2010, 6, 2872" + doc["freq_level"] = { + "method": "wb97xd", "basis": "def2-tzvp", "software": "gaussian", + } + # Attach a statmech subdict to one reactant species so we can + # exercise the full computed-reaction statmech path. 
+        for s in doc["species"]:
+            if s["label"] == "CHO":
+                s["statmech"] = {
+                    "e0_kj_mol": 8.0,
+                    "spin_multiplicity": 2,
+                    "optical_isomers": 1,
+                    "is_linear": False,
+                    "external_symmetry": 2,
+                    "point_group": "Cs",  # flows through; BundleStatmechIn accepts it
+                    "rigid_rotor_kind": "asymmetric_top",
+                    "harmonic_frequencies_cm1": [],
+                    "torsions": [
+                        {"symmetry_number": 3, "treatment": "hindered_rotor",
+                         "atom_indices": [1, 2, 3, 4], "pivot_atoms": [2, 3]},
+                    ],
+                }
+        return doc
+
+    def _submit(self, *, doc):
+        client = _StubClient(response=_StubResponse({"reaction_id": 42}))
+        adapter = self._adapter(client)
+        with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}):
+            outcome = adapter.submit_computed_reaction_from_output(
+                output_doc=doc, reaction_record=_reaction_record(),
+            )
+        return outcome, json.loads(outcome.payload_path.read_text())
+
+    def _r0_statmech(self, payload):
+        r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO")
+        self.assertIn("statmech", r0)
+        return r0["statmech"]
+
+    # ---------------- 8/9/10/11: schema-supported fields propagate
+    def test_external_symmetry_emitted(self):
+        _, payload = self._submit(doc=self._doc_with_statmech_and_fsf())
+        self.assertEqual(self._r0_statmech(payload)["external_symmetry"], 2)
+
+    def test_is_linear_emitted(self):
+        _, payload = self._submit(doc=self._doc_with_statmech_and_fsf())
+        self.assertIs(self._r0_statmech(payload)["is_linear"], False)
+
+    def test_rigid_rotor_kind_emitted(self):
+        _, payload = self._submit(doc=self._doc_with_statmech_and_fsf())
+        self.assertEqual(
+            self._r0_statmech(payload)["rigid_rotor_kind"], "asymmetric_top"
+        )
+
+    def test_statmech_treatment_emitted(self):
+        _, payload = self._submit(doc=self._doc_with_statmech_and_fsf())
+        self.assertEqual(self._r0_statmech(payload)["statmech_treatment"], "rrho_1d")
+
+    # ---------------- 1: point_group flows through
+    def test_point_group_emitted_in_computed_reaction(self):
+        # Schema expansion: BundleStatmechIn now accepts point_group.
+        # The producer must surface it from the species statmech subdict
+        # written by arc/output.py::_statmech_to_dict.
+        _, payload = self._submit(doc=self._doc_with_statmech_and_fsf())
+        self.assertEqual(self._r0_statmech(payload)["point_group"], "Cs")
+
+    # ---------------- 2 + 3: scoped source_calculations
+    def test_source_calculations_emitted_with_scoped_keys(self):
+        # Schema expansion: BundleStatmechIn now accepts
+        # source_calculations. Each species block must reference only
+        # its own scoped calculation keys (r0_*, p0_*, ...). Sibling
+        # species and the TS use disjoint namespaces.
+        _, payload = self._submit(doc=self._doc_with_statmech_and_fsf())
+        sm = self._r0_statmech(payload)
+        sources = sm["source_calculations"]
+        by_role = {sc["role"]: sc["calculation_key"] for sc in sources}
+        self.assertEqual(by_role, {"opt": "r0_opt", "freq": "r0_freq", "sp": "r0_sp"})
+
+    # ---------------- 4 + 5: no TS / sibling-species leakage
+    def test_source_calculations_never_reference_ts_or_siblings(self):
+        # Defense in depth: every species block's statmech.source_calcs
+        # must reference only that species's own calc keys. Cross-actor
+        # references (TS keys leaking into a reactant, or r0 keys
+        # leaking into r1) would 422 server-side under the workflow's
+        # owner-consistency check.
+ _, payload = self._submit(doc=self._doc_with_statmech_and_fsf()) + for species in payload["species"]: + sm = species.get("statmech") + if sm is None or "source_calculations" not in sm: + continue + own_prefix = species["key"].split("_", 1)[0] + "_" + for sc in sm["source_calculations"]: + key = sc["calculation_key"] + self.assertTrue( + key.startswith(own_prefix), + f"species[{species['key']}] statmech references " + f"foreign calc key {key!r} (expected prefix {own_prefix!r})", + ) + self.assertFalse(key.startswith("ts_")) + + # ---------------- 6: missing source calculations omitted cleanly + def test_source_calculations_drop_missing_roles(self): + # If a species lacks an SP calc (sp_energy_hartree absent), the + # SP source must not be emitted, but opt/freq still flow. + doc = self._doc_with_statmech_and_fsf() + for s in doc["species"]: + if s["label"] == "CHO": + # Strip SP so the reaction species block emits opt+freq only. + s["sp_energy_hartree"] = None + s.pop("electronic_energy_hartree", None) + _, payload = self._submit(doc=doc) + sm = self._r0_statmech(payload) + roles = [sc["role"] for sc in sm["source_calculations"]] + self.assertEqual(roles, ["opt", "freq"]) + keys = {sc["calculation_key"] for sc in sm["source_calculations"]} + self.assertEqual(keys, {"r0_opt", "r0_freq"}) + + # ---------------- 14: slim torsions in reaction mode + def test_slim_torsions_emitted(self): + _, payload = self._submit(doc=self._doc_with_statmech_and_fsf()) + torsions = self._r0_statmech(payload)["torsions"] + self.assertEqual(len(torsions), 1) + self.assertEqual(torsions[0]["torsion_index"], 1) + self.assertEqual(torsions[0]["symmetry_number"], 3) + self.assertEqual(torsions[0]["treatment_kind"], "hindered_rotor") + + def test_freq_scale_factor_behavior_preserved(self): + _, payload = self._submit(doc=self._doc_with_statmech_and_fsf()) + sm = self._r0_statmech(payload) + self.assertIn("freq_scale_factor", sm) + self.assertAlmostEqual(sm["freq_scale_factor"]["value"], 0.961) + + def test_no_statmech_when_no_fsf_and_no_subdict(self): + # Backward-compat: a reaction doc with no FSF and no statmech + # subdict on any species must not produce a statmech block on + # those species. 
+ doc = _reaction_output_doc() # no FSF, no statmech subdict + _, payload = self._submit(doc=doc) + for s in payload["species"]: + self.assertNotIn("statmech", s) + + # ---------------- 18: live schema validation + def test_payload_validates_against_tckdb_schema(self): + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, payload = self._submit(doc=self._doc_with_statmech_and_fsf()) + ComputedReactionUploadRequest.model_validate(payload) + + +class TestComputedReactionBundle(unittest.TestCase): + """Producer-side tests for the /uploads/computed-reaction bundle path.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-rxn-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_reaction", + ) + + def _adapter(self, client, *, project_directory=None, cfg=None): + return TCKDBAdapter( + cfg or self.cfg, + project_directory=project_directory, + client_factory=lambda c, k: client, + ) + + def _submit(self, *, output_doc=None, reaction=None, client=None, + project_directory=None, cfg=None): + client = client or _StubClient(response=_StubResponse({"reaction_id": 42})) + adapter = self._adapter(client, project_directory=project_directory, cfg=cfg) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_reaction_from_output( + output_doc=output_doc or _reaction_output_doc(), + reaction_record=reaction or _reaction_record(), + ) + return outcome, client, json.loads(outcome.payload_path.read_text()) + + # ---------------- 1: payload top-level shape + def test_payload_top_level_keys(self): + _, _, payload = self._submit() + self.assertIn("species", payload) + self.assertIn("reactant_keys", payload) + self.assertIn("product_keys", payload) + self.assertIn("transition_state", payload) + self.assertIn("kinetics", payload) + self.assertEqual(payload["reaction_family"], "H_Abstraction") + + # ---------------- 2: deterministic local keys for species + def test_species_local_keys_namespaced(self): + _, _, payload = self._submit() + self.assertEqual(payload["reactant_keys"], ["r0_CHO", "r1_CH4"]) + self.assertEqual(payload["product_keys"], ["p0_CH2O", "p1_CH3"]) + species_keys = sorted(s["key"] for s in payload["species"]) + self.assertEqual(species_keys, ["p0_CH2O", "p1_CH3", "r0_CHO", "r1_CH4"]) + + # ---------------- 3: species blocks have one conformer + opt + freq + sp + def test_species_block_contains_opt_freq_sp(self): + _, _, payload = self._submit() + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + self.assertEqual(r0["species_entry"]["smiles"], "[CH]=O") + self.assertEqual(len(r0["conformers"]), 1) + primary = r0["conformers"][0]["calculation"] + self.assertEqual(primary["key"], "r0_opt") + self.assertEqual(primary["type"], "opt") + additional_keys = sorted(c["key"] for c in r0["calculations"]) + self.assertEqual(additional_keys, ["r0_freq", "r0_sp"]) + + # ---------------- 3a: rotor-scan calcs emitted alongside opt/freq/sp + # + # Regression: ``_build_reaction_species_block`` previously emitted + # only opt/freq/sp; statmech torsions referencing ``scan_rotor_`` + # ended up dangling and 
+    # the server's bundle validator 422'd with
+    # ``source_scan_calculation_key references undefined calculation_key``.
+    # The fix mirrors the ``_build_conformer_block`` scan loop into the
+    # reaction-side species builder.
+    @staticmethod
+    def _scan_result_payload():
+        """Minimal but TCKDB-valid 1D dihedral scan_result, 3 points."""
+        return {
+            "dimension": 1,
+            "is_relaxed": True,
+            "zero_energy_reference_hartree": -113.7,
+            "coordinates": [{
+                "coordinate_index": 1,
+                "coordinate_kind": "dihedral",
+                "atom1_index": 1,
+                "atom2_index": 2,
+                "atom3_index": 3,
+                "atom4_index": 4,
+                "step_count": 3,
+                "value_unit": "degree",
+                "symmetry_number": 3,
+            }],
+            "points": [
+                {
+                    "point_index": 1,
+                    "electronic_energy_hartree": -113.7,
+                    "relative_energy_kj_mol": 0.0,
+                    "coordinate_values": [{
+                        "coordinate_index": 1, "coordinate_value": 0.0,
+                        "value_unit": "degree",
+                    }],
+                },
+                {
+                    "point_index": 2,
+                    "electronic_energy_hartree": -113.698,
+                    "relative_energy_kj_mol": 5.25,
+                    "coordinate_values": [{
+                        "coordinate_index": 1, "coordinate_value": 120.0,
+                        "value_unit": "degree",
+                    }],
+                },
+                {
+                    "point_index": 3,
+                    "electronic_energy_hartree": -113.699,
+                    "relative_energy_kj_mol": 2.62,
+                    "coordinate_values": [{
+                        "coordinate_index": 1, "coordinate_value": 240.0,
+                        "value_unit": "degree",
+                    }],
+                },
+            ],
+        }
+
+    def _doc_with_scan_on_reactant(self, *, scan_keys=("scan_rotor_0",),
+                                   target_label="CHO"):
+        """Augment one reactant species with hindered-rotor torsions and
+        matching ``additional_calculations`` scan entries. Mirrors what
+        ``arc/output.py`` writes into ``output.yml`` for a species with
+        parsed 1D rotors."""
+        doc = _reaction_output_doc()
+        spc = next(s for s in doc["species"] if s["label"] == target_label)
+        spc["additional_calculations"] = [
+            {"key": k, "type": "scan", "scan_result": self._scan_result_payload()}
+            for k in scan_keys
+        ]
+        spc["statmech"] = {
+            "is_linear": False,
+            "external_symmetry": 1,
+            "rigid_rotor_kind": "asymmetric_top",
+            "harmonic_frequencies_cm1": [3000.0, 1500.0, 800.0],
+            "torsions": [
+                {
+                    "symmetry_number": 3,
+                    "treatment": "hindered_rotor",
+                    "atom_indices": [1, 2, 3, 4],
+                    "pivot_atoms": [2, 3],
+                    "barrier_kj_mol": 5.25,
+                    "source_scan_calculation_key": k,
+                }
+                for k in scan_keys
+            ],
+        }
+        return doc
+
+    def test_scan_calcs_emitted_in_species_calculations(self):
+        doc = self._doc_with_scan_on_reactant()
+        _, _, payload = self._submit(output_doc=doc)
+        r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO")
+        scans = [c for c in r0["calculations"] if c.get("type") == "scan"]
+        self.assertEqual(len(scans), 1)
+        scan = scans[0]
+        # Per-species namespace: ``validate_unique_keys`` enforces
+        # globally-unique calc keys across the bundle, so the producer
+        # prefixes scan keys with the species's calc_prefix (``r0`` for
+        # the first reactant). The matching rewrite happens on the
+        # torsion side, asserted below.
+        self.assertEqual(scan["key"], "r0_scan_rotor_0")
+        self.assertEqual(scan["type"], "scan")
+        self.assertIn("scan_result", scan)
+        self.assertEqual(scan["scan_result"]["dimension"], 1)
+        self.assertEqual(len(scan["scan_result"]["points"]), 3)
+
+    def test_scan_calc_carries_geometry_key_to_conformer(self):
+        # Reaction-path schema requires non-opt species calcs to point
+        # at the conformer geometry by key (same reason freq/sp do).
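+        # (Putting the pieces together, the emitted scan calc is expected
+        #  to look like this, with values as the surrounding tests assert:
+        #
+        #      {"key": "r0_scan_rotor_0", "type": "scan",
+        #       "geometry_key": "r0_CHO_geom",
+        #       "depends_on": [{"parent_calculation_key": "r0_opt",
+        #                       "role": "scan_parent"}],
+        #       "scan_result": {...}}
+        #  )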
+ doc = self._doc_with_scan_on_reactant() + _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + scan = next(c for c in r0["calculations"] if c["type"] == "scan") + self.assertEqual(scan["geometry_key"], "r0_CHO_geom") + + def test_scan_calc_depends_on_species_opt(self): + # Edge points back to *this* species's opt — not the TS's, + # not a sibling reactant's. Owner-consistent or 422. + doc = self._doc_with_scan_on_reactant() + _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + scan = next(c for c in r0["calculations"] if c["type"] == "scan") + self.assertEqual( + scan["depends_on"], + [{"parent_calculation_key": "r0_opt", "role": "scan_parent"}], + ) + + def test_statmech_torsion_reference_rewritten_to_namespaced_key(self): + # The torsion's ``source_scan_calculation_key`` must be rewritten + # in lockstep with the calc-key namespacing — otherwise it would + # dangle and 422. + doc = self._doc_with_scan_on_reactant() + _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + torsion_refs = { + t["source_scan_calculation_key"] + for t in r0["statmech"]["torsions"] + if "source_scan_calculation_key" in t + } + emitted_calc_keys = { + c["key"] for c in r0["calculations"] if c.get("type") == "scan" + } + self.assertEqual(torsion_refs, {"r0_scan_rotor_0"}) + # And the rewritten key resolves to an actually-emitted calc. + self.assertTrue(torsion_refs.issubset(emitted_calc_keys)) + + def test_scan_keys_namespaced_per_species_to_avoid_collisions(self): + # Two reactants both report a raw ``scan_rotor_0`` from output.yml. + # Globally-unique-key validation requires the producer namespace + # them as ``r0_scan_rotor_0`` and ``r1_scan_rotor_0``; both + # torsions correspondingly point at their species's namespaced + # calc, so neither dangles. + doc = self._doc_with_scan_on_reactant(target_label="CHO") + ch4 = next(s for s in doc["species"] if s["label"] == "CH4") + ch4["additional_calculations"] = [{ + "key": "scan_rotor_0", + "type": "scan", + "scan_result": self._scan_result_payload(), + }] + ch4["statmech"] = { + "is_linear": False, + "external_symmetry": 1, + "rigid_rotor_kind": "asymmetric_top", + "harmonic_frequencies_cm1": [3000.0, 1500.0, 800.0], + "torsions": [{ + "symmetry_number": 3, + "treatment": "hindered_rotor", + "atom_indices": [1, 2, 3, 4], + "pivot_atoms": [2, 3], + "barrier_kj_mol": 5.25, + "source_scan_calculation_key": "scan_rotor_0", + }], + } + _, _, payload = self._submit(output_doc=doc) + # Calc keys disjoint between the two reactants. + expected_by_species = { + "r0_CHO": "r0_scan_rotor_0", + "r1_CH4": "r1_scan_rotor_0", + } + for sp_key, expected_calc_key in expected_by_species.items(): + sp = next(s for s in payload["species"] if s["key"] == sp_key) + scans = [c for c in sp["calculations"] if c.get("type") == "scan"] + self.assertEqual([s["key"] for s in scans], [expected_calc_key]) + torsion_refs = [ + t.get("source_scan_calculation_key") + for t in sp["statmech"]["torsions"] + ] + self.assertEqual(torsion_refs, [expected_calc_key]) + # Sanity: globally unique across the whole bundle. 
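+        # (Assumed namespacing rule: a raw key ``k`` on a species with
+        #  calc prefix ``p`` ("r0", "r1", "p0", ..., "ts") is emitted as
+        #  f"{p}_{k}", and each torsion's source_scan_calculation_key is
+        #  rewritten through the same mapping so no reference dangles.)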
+ all_calc_keys = [] + for sp in payload["species"]: + for c in sp["calculations"]: + all_calc_keys.append(c["key"]) + for conf in sp["conformers"]: + all_calc_keys.append(conf["calculation"]["key"]) + self.assertEqual(len(set(all_calc_keys)), len(all_calc_keys), + msg=f"duplicate calc keys: {all_calc_keys}") + + def test_payload_with_scans_validates_against_live_reaction_schema(self): + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + doc = self._doc_with_scan_on_reactant( + scan_keys=("scan_rotor_0", "scan_rotor_1"), + ) + _, _, payload = self._submit(output_doc=doc) + # If any torsion's source_scan_calculation_key still dangled, + # this validator (computed_reaction_upload.py:840-848) would + # raise — exactly the 422 we hit in the field. + ComputedReactionUploadRequest.model_validate(payload) + + # ---------------- 3b: per-point scan geometries flow through to the bundle + def test_scan_point_geometries_pass_through_in_reaction_bundle(self): + # Producer (``arc/output.py``) emits ``points[i].geometry.xyz_text`` + # in the scan_result; the reaction-bundle adapter must preserve + # that intact so TCKDB resolves it into calc_scan_point.geometry_id. + doc = self._doc_with_scan_on_reactant() + spc = next(s for s in doc["species"] if s["label"] == "CHO") + scan_entry = spc["additional_calculations"][0] + for i, point in enumerate(scan_entry["scan_result"]["points"]): + point["geometry"] = { + "xyz_text": f"2\n\nC 0.0 0.0 {0.1 * i}\nH 1.0 0.0 0.0", + } + _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + scan = next(c for c in r0["calculations"] if c["type"] == "scan") + # Every scan point came through with its geometry intact; + # nothing was rewritten or coerced by the adapter. + for point in scan["scan_result"]["points"]: + self.assertIn("geometry", point) + self.assertEqual(set(point["geometry"].keys()), {"xyz_text"}) + self.assertNotIn("geometry_id", point) + self.assertNotIn("geometry_id", point["geometry"]) + + # ---------------- 4: inline TS block with charge, multiplicity, geometry, calcs + def test_ts_block_inline(self): + _, _, payload = self._submit() + ts = payload["transition_state"] + self.assertEqual(ts["charge"], 0) + self.assertEqual(ts["multiplicity"], 2) + self.assertEqual(ts["geometry"]["key"], "ts_geom") + self.assertIn("xyz_text", ts["geometry"]) + self.assertEqual(ts["calculation"]["key"], "ts_opt") + self.assertEqual(ts["calculation"]["type"], "opt") + ts_additional_keys = sorted(c["key"] for c in ts["calculations"]) + self.assertEqual(ts_additional_keys, ["ts_freq", "ts_sp"]) + self.assertEqual(ts["label"], "TS0") + + # ---------------- TS unmapped_smiles handle + def test_ts_unmapped_smiles_built_from_reactant_product_smiles(self): + # Default fixture: TS has no smiles of its own, but every + # reactant/product carries one. Producer should synthesize a + # reaction-SMILES handle ``".>>."``. + _, _, payload = self._submit() + ts = payload["transition_state"] + # Fixture SMILES: CHO=[CH]=O, CH4=C, CH2O=C=O, CH3=[CH3]. + self.assertEqual(ts.get("unmapped_smiles"), "[CH]=O.C>>C=O.[CH3]") + + def test_ts_block_carries_no_mol_field(self): + # Producer must not invent a normal-molecule representation + # from the TS geometry. 
+        # ``mol`` (under any common spelling) must not appear anywhere
+        # in the TS subtree.
+        _, _, payload = self._submit()
+        text = json.dumps(payload["transition_state"])
+        for forbidden in ("\"mol\":", "\"rdkit_mol\":", "\"mol_object\":"):
+            self.assertNotIn(forbidden, text)
+
+    def test_ts_unmapped_smiles_omitted_when_smiles_missing(self):
+        # If any reactant/product is missing a SMILES, the producer
+        # refuses to emit a half-built handle — better null than
+        # misleading.
+        doc = _reaction_output_doc()
+        cho = next(s for s in doc["species"] if s["label"] == "CHO")
+        cho["smiles"] = None
+        # Adapter requires SMILES on every species_entry, so we have
+        # to bypass that gate to test ts-side behavior in isolation:
+        # direct-call the helper rather than running the full submit.
+        from arc.tckdb.adapter import _ts_unmapped_smiles_handle, _index_species
+        species_index = _index_species(doc)
+        ts_record = doc["transition_states"][0]
+        rxn = doc["reactions"][0]
+        self.assertIsNone(_ts_unmapped_smiles_handle(
+            ts_record=ts_record, reaction_record=rxn, species_index=species_index,
+        ))
+
+    def test_ts_record_smiles_takes_precedence_over_derived_handle(self):
+        # When ARC happens to attach an explicit SMILES to the TS
+        # record itself (rare), it should win over the derived
+        # reaction handle: it's higher-fidelity producer intent.
+        doc = _reaction_output_doc()
+        doc["transition_states"][0]["smiles"] = "[H]...[CH3]"  # nonsense but explicit
+        _, _, payload = self._submit(output_doc=doc)
+        self.assertEqual(
+            payload["transition_state"].get("unmapped_smiles"), "[H]...[CH3]"
+        )
+
+    def test_ts_unmapped_smiles_is_deterministic(self):
+        # Same output_doc → same payload string → same idempotency key.
+        outcome1, _, payload1 = self._submit()
+        outcome2, _, payload2 = self._submit()
+        self.assertEqual(
+            payload1["transition_state"]["unmapped_smiles"],
+            payload2["transition_state"]["unmapped_smiles"],
+        )
+        self.assertEqual(outcome1.idempotency_key, outcome2.idempotency_key)
+
+    def test_ts_unmapped_smiles_does_not_perturb_kinetics_or_irc(self):
+        # Adding a TS handle is a TS-block-only concern. Kinetics,
+        # source_calculations, and any IRC subtree must be byte-equal
+        # to a baseline payload built by emptying out unmapped_smiles.
+        baseline_doc = _reaction_output_doc()
+        _, _, payload = self._submit(output_doc=baseline_doc)
+
+        # Re-build with the TS handle forced to None via direct stub
+        # of the helper. Compare kinetics + (if present) IRC subtrees.
+        from unittest.mock import patch
+        with patch("arc.tckdb.adapter._ts_unmapped_smiles_handle", return_value=None):
+            _, _, payload_no_handle = self._submit(output_doc=baseline_doc)
+
+        self.assertEqual(payload["kinetics"], payload_no_handle["kinetics"])
+        self.assertEqual(
+            payload["transition_state"]["calculations"],
+            payload_no_handle["transition_state"]["calculations"],
+        )
+        # The TS primary calc and geometry are unaffected too.
+ self.assertEqual( + payload["transition_state"]["calculation"], + payload_no_handle["transition_state"]["calculation"], + ) + + def test_ts_unmapped_smiles_payload_validates_against_tckdb_schema(self): + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, _, payload = self._submit() + # Sanity: the field is populated, not just bypassed. + self.assertTrue(payload["transition_state"].get("unmapped_smiles")) + ComputedReactionUploadRequest.model_validate(payload) + + def test_ts_block_includes_irc_when_present(self): + doc = _reaction_output_doc(with_irc=True) + _, _, payload = self._submit(output_doc=doc) + ts = payload["transition_state"] + ts_calc_keys = {c["key"] for c in ts["calculations"]} + self.assertIn("ts_irc", ts_calc_keys) + irc_calc = next(c for c in ts["calculations"] if c["key"] == "ts_irc") + self.assertEqual(irc_calc["type"], "irc") + # IRC carries depends_on(role=irc_start) → ts_opt: IRC is seeded + # from the optimized TS saddle, so ts_opt is the geometry- + # producing parent. NOT freq_on (the TS freq validates the + # saddle but isn't the seed geometry). + self.assertEqual( + irc_calc["depends_on"], + [{"parent_calculation_key": "ts_opt", "role": "irc_start"}], + ) + + # ---------------- 5: kinetics: modified-Arrhenius mapping + units + def test_kinetics_modified_arrhenius_mapping(self): + _, _, payload = self._submit() + self.assertEqual(len(payload["kinetics"]), 1) + kin = payload["kinetics"][0] + self.assertEqual(kin["model_kind"], "modified_arrhenius") + self.assertAlmostEqual(kin["a"], 0.204298) + self.assertEqual(kin["a_units"], "cm3_mol_s") + self.assertAlmostEqual(kin["n"], 4.37949) + self.assertAlmostEqual(kin["reported_ea"], 78.9012) + self.assertEqual(kin["reported_ea_units"], "kj_mol") + self.assertAlmostEqual(kin["tmin_k"], 300.0) + self.assertAlmostEqual(kin["tmax_k"], 3000.0) + self.assertEqual(kin["reactant_keys"], ["r0_CHO", "r1_CH4"]) + self.assertEqual(kin["product_keys"], ["p0_CH2O", "p1_CH3"]) + + # ---------------- tunneling_model passthrough + def test_kinetics_tunneling_model_passes_through(self): + # output.yml records the tunneling method ARC asked Arkane to + # apply (currently always Eckart). The adapter must surface it + # verbatim as ``tunneling_model`` on the BundleKineticsIn so the + # DB row records which correction was applied to A/n/Ea. + _, _, payload = self._submit() + self.assertEqual(payload["kinetics"][0]["tunneling_model"], "Eckart") + + def test_kinetics_tunneling_model_arbitrary_value(self): + # No allowlist on the producer side — TCKDB's tunneling_model is + # a free-form str | None (computed_reaction_upload.py:463). If a + # future ARC config switches to Wigner / Skodje-Truhlar / etc., + # the adapter must pass it through unchanged. + rxn = _reaction_record() + rxn["kinetics"]["tunneling"] = "Wigner" + _, _, payload = self._submit(reaction=rxn) + self.assertEqual(payload["kinetics"][0]["tunneling_model"], "Wigner") + + def test_kinetics_tunneling_field_omitted_when_absent(self): + # Backward compat: output.yml from before the tunneling-surfacing + # change has no 'tunneling' key. The adapter must omit + # tunneling_model entirely (not emit ``null``) so older payloads + # stay structurally equivalent to what they would have produced + # before this change landed. 
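+        # The intended producer-side mapping, as a sketch (names here are
+        # illustrative, not the adapter's actual internals):
+        #
+        #     tunneling = kinetics_record.get("tunneling")
+        #     if tunneling is not None:
+        #         kin["tunneling_model"] = tunneling  # verbatim passthrough
+        #     # absent key → no field at all, never "tunneling_model": null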
+ rxn = _reaction_record() + rxn["kinetics"].pop("tunneling", None) + _, _, payload = self._submit(reaction=rxn) + self.assertNotIn("tunneling_model", payload["kinetics"][0]) + + # ---------------- 6: dA → multiplicative a_uncertainty, dn → n_uncertainty, + # dEa → d_reported_ea + def test_kinetics_uncertainty_mapping_policy(self): + _, _, payload = self._submit() + kin = payload["kinetics"][0] + # dA is preserved verbatim as a multiplicative factor (NOT + # converted to A * (dA - 1) or any additive band). + self.assertAlmostEqual(kin["a_uncertainty"], 1.48466) + self.assertEqual(kin["a_uncertainty_kind"], "multiplicative") + # dn carries cleanly to n_uncertainty. + self.assertAlmostEqual(kin["n_uncertainty"], 0.0514735) + # dEa with same units as Ea → d_reported_ea (same units). + self.assertAlmostEqual(kin["d_reported_ea"], 0.294363) + + def test_kinetics_da_not_converted_to_additive(self): + # Guardrail against the old "additive band" misinterpretation: + # a_uncertainty must equal dA exactly, not A * (dA - 1). + _, _, payload = self._submit() + kin = payload["kinetics"][0] + bogus_additive = 0.204298 * (1.48466 - 1) # ≈ 0.099 + self.assertNotAlmostEqual(kin["a_uncertainty"], bogus_additive, places=4) + self.assertAlmostEqual(kin["a_uncertainty"], 1.48466) + + def test_kinetics_missing_da_omits_uncertainty(self): + rxn = _reaction_record() + rxn["kinetics"].pop("dA") + _, _, payload = self._submit(reaction=rxn) + kin = payload["kinetics"][0] + # When dA is absent the schema requires both fields to be absent + # (or both present); the producer omits. + self.assertNotIn("a_uncertainty", kin) + self.assertNotIn("a_uncertainty_kind", kin) + + def test_kinetics_da_below_one_omitted(self): + # Schema rejects multiplicative factors < 1.0; producer omits + # rather than upcasting or sending an invalid value. + rxn = _reaction_record() + rxn["kinetics"]["dA"] = 0.7 + _, _, payload = self._submit(reaction=rxn) + kin = payload["kinetics"][0] + self.assertNotIn("a_uncertainty", kin) + self.assertNotIn("a_uncertainty_kind", kin) + + def test_kinetics_dea_units_mismatch_omits_d_reported_ea(self): + rxn = _reaction_record() + rxn["kinetics"]["dEa_units"] = "kcal/mol" # Ea_units is kJ/mol + _, _, payload = self._submit(reaction=rxn) + kin = payload["kinetics"][0] + self.assertNotIn("d_reported_ea", kin) + + # ---------------- 7: kinetics.source_calculations populated by local keys + def test_kinetics_source_calculations_explicit(self): + _, _, payload = self._submit() + kin = payload["kinetics"][0] + sources = kin["source_calculations"] + by_role: dict[str, list[str]] = {} + for entry in sources: + by_role.setdefault(entry["role"], []).append(entry["calculation_key"]) + self.assertEqual(sorted(by_role["reactant_energy"]), ["r0_sp", "r1_sp"]) + self.assertEqual(sorted(by_role["product_energy"]), ["p0_sp", "p1_sp"]) + self.assertEqual(by_role["ts_energy"], ["ts_sp"]) + # In v0, kinetics 'freq' role means the TS frequency. + self.assertEqual(by_role["freq"], ["ts_freq"]) + # No reactant/product freq calc should be linked under role=freq. + for entry in sources: + if entry["role"] == "freq": + self.assertTrue(entry["calculation_key"].startswith("ts_")) + + def test_kinetics_source_calculations_omits_missing(self): + # Drop sp on one reactant — its source link should be omitted, + # not faked, and the kinetics block should still build. 
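+        # Sketch of the omission policy (illustrative, not the adapter's
+        # literal code): one link per (role, calc), emitted only when the
+        # backing calc exists:
+        #
+        #     if sp_key is not None:
+        #         sources.append({"calculation_key": sp_key,
+        #                         "role": "reactant_energy"})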
+ doc = _reaction_output_doc() + cho = next(s for s in doc["species"] if s["label"] == "CHO") + cho["sp_energy_hartree"] = None + _, _, payload = self._submit(output_doc=doc) + kin = payload["kinetics"][0] + sources = kin["source_calculations"] + sp_keys = {entry["calculation_key"] for entry in sources} + self.assertNotIn("r0_sp", sp_keys) + self.assertIn("r1_sp", sp_keys) + + def test_kinetics_irc_source_only_when_irc_calc_exists(self): + # Default fixture has no IRC → no irc source link + _, _, payload = self._submit() + roles = {e["role"] for e in payload["kinetics"][0]["source_calculations"]} + self.assertNotIn("irc", roles) + # With IRC present → irc source link emitted + doc = _reaction_output_doc(with_irc=True) + _, _, payload2 = self._submit(output_doc=doc) + roles2 = [e for e in payload2["kinetics"][0]["source_calculations"] + if e["role"] == "irc"] + self.assertEqual(len(roles2), 1) + self.assertEqual(roles2[0]["calculation_key"], "ts_irc") + + # ---------------- IRC: structured result + output geometries + def _irc_fixture_with_logs_on_disk(self): + """Stage forward+reverse log files on disk and return (doc, project_dir). + + The TS record's ``irc_logs`` are populated with project-relative + paths to real (empty) log files; the parser is mocked separately + to return fake trajectories. The on-disk files exist only so + the adapter's ``is_file()`` gate passes — their contents are + irrelevant once the parser is patched. + """ + proj = tempfile.mkdtemp(prefix="arc-tckdb-irc-") + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + f_log = pathlib.Path(proj) / "TS0_irc_f.log" + r_log = pathlib.Path(proj) / "TS0_irc_r.log" + f_log.write_text("dummy") + r_log.write_text("dummy") + doc = _reaction_output_doc(with_irc=True) + ts = doc["transition_states"][0] + ts["irc_logs"] = ["TS0_irc_f.log", "TS0_irc_r.log"] + return doc, proj + + @staticmethod + def _fake_irc_points(label): + """Build two distinct ARC xyz_dicts so forward != reverse endpoint.""" + a = float({"f": 0.0, "r": 1.0}[label[0]]) + return [ + {"symbols": ("C", "H"), "isotopes": (12, 1), + "coords": ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0 + a))}, + {"symbols": ("C", "H"), "isotopes": (12, 1), + "coords": ((0.0, 0.0, 0.05), (1.05, 0.0, 0.05 + a))}, + ] + + def _patch_parse_irc(self, *, forward=True, reverse=True, fail=False): + """Patch ``arc.parser.parser.parse_irc_traj`` for a test. + + Returns forward-direction points for ``_irc_f`` paths, reverse + for ``_irc_r``, ``None`` otherwise (or always when ``fail``). + """ + def _stub(log_file_path, raise_error=False): + if fail: + return None + name = pathlib.Path(log_file_path).name + if "_irc_f" in name and forward: + return self._fake_irc_points("forward") + if "_irc_r" in name and reverse: + return self._fake_irc_points("reverse") + return None + return mock.patch("arc.parser.parser.parse_irc_traj", side_effect=_stub) + + def test_irc_depends_on_irc_start_from_ts_opt(self): + # Spec: ``ts_opt → ts_irc`` with ``role=irc_start``. NOT freq_on. 
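+        # (The same edge is re-asserted by the parse-failure test further
+        # down: the dependency records provenance and must survive even when
+        # no trajectory point could be parsed.)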
+ doc, proj = self._irc_fixture_with_logs_on_disk() + with self._patch_parse_irc(): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ts = payload["transition_state"] + irc_calc = next(c for c in ts["calculations"] if c["key"] == "ts_irc") + self.assertEqual( + irc_calc["depends_on"], + [{"parent_calculation_key": "ts_opt", "role": "irc_start"}], + ) + + def test_irc_kinetics_source_link(self): + doc, proj = self._irc_fixture_with_logs_on_disk() + with self._patch_parse_irc(): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + sources = payload["kinetics"][0]["source_calculations"] + irc_links = [s for s in sources if s["role"] == "irc"] + self.assertEqual(irc_links, [{"calculation_key": "ts_irc", "role": "irc"}]) + + def test_irc_structured_result_emitted_when_parsed(self): + doc, proj = self._irc_fixture_with_logs_on_disk() + with self._patch_parse_irc(): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ts = payload["transition_state"] + irc_calc = next(c for c in ts["calculations"] if c["key"] == "ts_irc") + self.assertIn("irc_result", irc_calc) + result = irc_calc["irc_result"] + self.assertEqual(result["direction"], "both") + self.assertTrue(result["has_forward"]) + self.assertTrue(result["has_reverse"]) + self.assertEqual(result["point_count"], 4) # 2 forward + 2 reverse + self.assertEqual(len(result["points"]), 4) + + def test_irc_points_preserve_direction_and_geometry(self): + doc, proj = self._irc_fixture_with_logs_on_disk() + with self._patch_parse_irc(): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ts = payload["transition_state"] + irc_calc = next(c for c in ts["calculations"] if c["key"] == "ts_irc") + points = irc_calc["irc_result"]["points"] + # Indices unique and zero-based across both branches. + self.assertEqual([p["point_index"] for p in points], [0, 1, 2, 3]) + directions = [p.get("direction") for p in points] + self.assertEqual(directions.count("forward"), 2) + self.assertEqual(directions.count("reverse"), 2) + for p in points: + self.assertIn("geometry", p) + self.assertIn("xyz_text", p["geometry"]) + # Producer must NOT label points as reactant/product. + for p in points: + self.assertNotIn("role", p) + + def test_irc_no_explicit_output_geometries_when_irc_result_present(self): + # Regression: the producer must NOT emit explicit output_geometries + # for the IRC calc when irc_result is attached. Server-side + # ``_persist_irc_result`` already creates calculation_output_geometry + # rows (role=irc_forward / irc_reverse) for every directional point; + # producer-explicit endpoints would double-claim those geometries + # and trip the unique (calculation_id, geometry_id) constraint in + # attach_calculation_output_geometries. The bug manifested as + # `output_geometries declares the same geometry more than once` + # 422s on every replay. + doc, proj = self._irc_fixture_with_logs_on_disk() + with self._patch_parse_irc(): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ts = payload["transition_state"] + irc_calc = next(c for c in ts["calculations"] if c["key"] == "ts_irc") + # irc_result IS present (server will derive output_geometries from points)… + self.assertIn("irc_result", irc_calc) + # …and explicit output_geometries are suppressed (or empty). 
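+        # Both compliant shapes (key absent entirely, or present as an empty
+        # list) collapse into one assertion via .get() with a [] default: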
+ self.assertEqual(irc_calc.get("output_geometries", []), []) + + def test_irc_partial_fallback_when_parsing_fails(self): + # Spec: log present but no parsable points → + # - keep type=irc calc + # - keep depends_on(role=irc_start) + # - keep kinetics.source_calculations(role=irc) + # - omit irc_result (don't fabricate incomplete data) + doc, proj = self._irc_fixture_with_logs_on_disk() + with self._patch_parse_irc(fail=True): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ts = payload["transition_state"] + irc_calc = next(c for c in ts["calculations"] if c["key"] == "ts_irc") + self.assertEqual(irc_calc["type"], "irc") + self.assertEqual( + irc_calc["depends_on"], + [{"parent_calculation_key": "ts_opt", "role": "irc_start"}], + ) + self.assertNotIn("irc_result", irc_calc) + # kinetics source link survives. + sources = payload["kinetics"][0]["source_calculations"] + irc_links = [s for s in sources if s["role"] == "irc"] + self.assertEqual(len(irc_links), 1) + + def test_no_irc_emits_nothing_irc_related(self): + # Default fixture has no irc_logs → no ts_irc calc, no irc_start + # dependency, no kinetics source link. + _, _, payload = self._submit() + ts = payload["transition_state"] + ts_calc_keys = {c["key"] for c in ts["calculations"]} + self.assertNotIn("ts_irc", ts_calc_keys) + for c in ts["calculations"]: + for dep in c.get("depends_on") or []: + self.assertNotEqual(dep.get("role"), "irc_start") + roles = {e["role"] for e in payload["kinetics"][0]["source_calculations"]} + self.assertNotIn("irc", roles) + + # ---------------- IRC: per-point direction labelling + def _irc_fixture_production_shape(self, *, directions=("forward", "reverse")): + """Stage logs whose filenames don't carry direction (production case). + + Real ARC IRC logs land at ``calcs/.../irc_/output.log``; + the filename has no forward/reverse infix. Direction lives only on + ``irc_log_directions``, which the scheduler captures from + ``job.irc_direction``. This fixture mirrors that shape. + """ + proj = tempfile.mkdtemp(prefix="arc-tckdb-irc-prod-") + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + log_paths = [] + for i, _ in enumerate(directions): + d = pathlib.Path(proj) / f"irc_{i}" + d.mkdir() + log = d / "output.log" + log.write_text("dummy") + log_paths.append(f"irc_{i}/output.log") + doc = _reaction_output_doc(with_irc=True) + ts = doc["transition_states"][0] + ts["irc_logs"] = log_paths + ts["irc_log_directions"] = list(directions) + return doc, proj + + def _patch_parse_irc_by_index(self, points_per_log): + """Like ``_patch_parse_irc`` but keyed off log path, not filename token. + + ``points_per_log[i]`` is the parsed-point list returned for the + ``i``-th unique log path the adapter feeds in. Lets tests stage + production-shape (direction-less) filenames without coupling the + stub to filename pattern matching. + """ + seen: dict[str, int] = {} + + def _stub(log_file_path, raise_error=False): + if log_file_path not in seen: + seen[log_file_path] = len(seen) + return points_per_log[seen[log_file_path]] + return mock.patch("arc.parser.parser.parse_irc_traj", side_effect=_stub) + + def test_explicit_irc_log_directions_label_points_when_filename_lacks_direction(self): + # Reproduces the live-run bug: filename detection alone yielded + # all-NULL directions. With irc_log_directions populated by the + # scheduler the points carry forward/reverse. 
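+        # Direction resolution as exercised here (a sketch of the policy,
+        # not the adapter's literal code): irc_log_directions[i], recorded
+        # by the scheduler, labels the i-th log; the _irc_f / _irc_r
+        # filename tokens only matter when the log name itself carries a
+        # direction infix.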
+ doc, proj = self._irc_fixture_production_shape() + fwd = self._fake_irc_points("forward") + rev = self._fake_irc_points("reverse") + with self._patch_parse_irc_by_index([fwd, rev]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + directions = [p.get("direction") for p in result["points"]] + self.assertEqual(directions.count("forward"), len(fwd)) + self.assertEqual(directions.count("reverse"), len(rev)) + self.assertNotIn(None, directions) + + def test_irc_result_flags_for_forward_only(self): + doc, proj = self._irc_fixture_production_shape(directions=("forward",)) + with self._patch_parse_irc_by_index([self._fake_irc_points("forward")]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + self.assertEqual(result["direction"], "forward") + self.assertTrue(result["has_forward"]) + self.assertFalse(result["has_reverse"]) + self.assertEqual( + {p.get("direction") for p in result["points"]}, {"forward"} + ) + + def test_irc_result_flags_for_reverse_only(self): + doc, proj = self._irc_fixture_production_shape(directions=("reverse",)) + with self._patch_parse_irc_by_index([self._fake_irc_points("reverse")]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + self.assertEqual(result["direction"], "reverse") + self.assertFalse(result["has_forward"]) + self.assertTrue(result["has_reverse"]) + self.assertEqual( + {p.get("direction") for p in result["points"]}, {"reverse"} + ) + + def test_irc_points_carry_no_is_ts_marker_when_undetected(self): + # ARC's parse_irc_traj surfaces only geometries — no per-point + # energies/gradients to identify the saddle. So the producer + # MUST NOT mark any point as is_ts; the schema's default is + # False, which the server treats as "no TS index claimed." + doc, proj = self._irc_fixture_production_shape() + fwd = self._fake_irc_points("forward") + rev = self._fake_irc_points("reverse") + with self._patch_parse_irc_by_index([fwd, rev]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + # Either the field is absent (defaults to False server-side) or + # explicitly False — both shapes are spec-compliant. + for p in result["points"]: + self.assertFalse(p.get("is_ts", False)) + self.assertNotIn("ts_point_index", result) + + def test_irc_no_reactant_product_inference_in_payload(self): + # Spec is explicit: forward/reverse are IRC path-direction + # labels, NOT reactant/product designators. The payload must + # carry no fields that would imply that mapping. 
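+        # (Deciding which branch lands on reactants vs products would need
+        # an energy/connectivity analysis the producer never performs; that
+        # mapping, if anyone wants it, is downstream work on the stored path.)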
+ doc, proj = self._irc_fixture_production_shape() + fwd = self._fake_irc_points("forward") + rev = self._fake_irc_points("reverse") + with self._patch_parse_irc_by_index([fwd, rev]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ts = payload["transition_state"] + irc_calc = next(c for c in ts["calculations"] if c["key"] == "ts_irc") + text = json.dumps(irc_calc) + for forbidden in ("reactant_side", "product_side", "reactants_branch", "products_branch"): + self.assertNotIn(forbidden, text) + + def test_irc_payload_validates_against_tckdb_schema(self): + # Live-schema smoke: skipped when the TCKDB pydantic schema is + # not importable in the active env. + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + doc, proj = self._irc_fixture_production_shape() + fwd = self._fake_irc_points("forward") + rev = self._fake_irc_points("reverse") + with self._patch_parse_irc_by_index([fwd, rev]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ComputedReactionUploadRequest.model_validate(payload) + + # ---------------- IRC double-emit regression + def test_irc_calc_attaches_irc_result_only_no_explicit_output_geometries(self): + # Regression: producer must NOT emit explicit ``output_geometries`` + # for the IRC calc when ``irc_result.points`` is attached. + # Server-side ``_persist_irc_result`` (calculation_resolution.py) + # writes ``calculation_output_geometry`` rows for every + # forward/reverse point with role irc_forward / irc_reverse; + # producer-explicit endpoints would then double-claim those + # geometries and trip the unique (calculation_id, geometry_id) + # constraint in ``attach_calculation_output_geometries``, + # producing the user-visible 422 + # "output_geometries declares the same geometry more than once". + # The trajectory points still carry both directions — the server + # owns the output-geometry derivation. + doc, proj = self._irc_fixture_production_shape() + with self._patch_parse_irc_by_index([ + self._fake_irc_points("forward"), + self._fake_irc_points("reverse"), + ]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + irc_calc = next( + c for c in payload["transition_state"]["calculations"] if c["key"] == "ts_irc" + ) + # irc_result IS attached… + self.assertIn("irc_result", irc_calc) + result = irc_calc["irc_result"] + directions = [p.get("direction") for p in result["points"]] + self.assertIn("forward", directions) + self.assertIn("reverse", directions) + self.assertTrue(result["has_forward"]) + self.assertTrue(result["has_reverse"]) + # …but explicit output_geometries is suppressed (or empty). + self.assertEqual(irc_calc.get("output_geometries", []), []) + + # ---------------- IRC: rich parser path (energies, RC, gradients) + @staticmethod + def _fake_rich_irc_points(direction, *, base_energy): + """Two synthetic rich-parser points with energies/RC/grads. + + Mirrors what :func:`arc.parser.parser.parse_irc_path` returns for + Gaussian: per-point ``electronic_energy_hartree``, + ``reaction_coordinate``, ``max_gradient``, ``rms_gradient``, + ``direction``, and ``xyz``. 
+ """ + return [ + { + "point_number": 1, + "direction": direction, + "electronic_energy_hartree": base_energy + 0.001, + "reaction_coordinate": 0.07236, + "max_gradient": 0.0073, + "rms_gradient": 0.0025, + "xyz": { + "symbols": ("C", "H"), "isotopes": (12, 1), + "coords": ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)), + }, + }, + { + "point_number": 2, + "direction": direction, + "electronic_energy_hartree": base_energy + 0.002, + "reaction_coordinate": 0.14470, + "max_gradient": 0.0140, + "rms_gradient": 0.0048, + "xyz": { + "symbols": ("C", "H"), "isotopes": (12, 1), + "coords": ((0.0, 0.0, 0.05), (1.05, 0.0, 0.05)), + }, + }, + ] + + def _patch_parse_irc_path_by_index(self, points_per_log): + """Patch ``parse_irc_path`` per-log; ``parse_irc_traj`` is left as-is. + + Pairs with ``_patch_parse_irc_by_index`` to drive the rich-vs- + geometry-only branches of ``_parse_irc_trajectories``. + """ + seen: dict[str, int] = {} + + def _stub(log_file_path, raise_error=False): + if log_file_path not in seen: + seen[log_file_path] = len(seen) + return points_per_log[seen[log_file_path]] + return mock.patch("arc.parser.parser.parse_irc_path", side_effect=_stub) + + def test_irc_rich_points_carry_energy_rc_and_gradients(self): + # Rich parser returns per-point energies/RC/grads → the payload + # surfaces them on every IRC point. + doc, proj = self._irc_fixture_production_shape() + fwd = self._fake_rich_irc_points("forward", base_energy=-303.5779) + rev = self._fake_rich_irc_points("reverse", base_energy=-303.5779) + with self._patch_parse_irc_path_by_index([fwd, rev]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + for p in result["points"]: + self.assertIn("electronic_energy_hartree", p) + self.assertIn("reaction_coordinate", p) + self.assertIn("max_gradient", p) + self.assertIn("rms_gradient", p) + self.assertIn("geometry", p) + # Forward + reverse direction labels propagate from the rich data. + directions = [p["direction"] for p in result["points"]] + self.assertEqual(directions.count("forward"), 2) + self.assertEqual(directions.count("reverse"), 2) + + def test_irc_zero_energy_reference_uses_ts_sp_when_levels_match(self): + # opt_level == sp_level → ts_sp.electronic_energy is the reference. + # relative_energy_kj_mol is computed against it. + doc, proj = self._irc_fixture_production_shape() + # Force a matching SP level in the fixture (opt_level is set in + # _fake_output_doc; mirror it here so _level_keys_match returns True). 
+ doc["sp_level"] = dict(doc["opt_level"]) + ts = doc["transition_states"][0] + ts["sp_energy_hartree"] = -303.6 + ts["opt_final_energy_hartree"] = -303.5 + fwd = self._fake_rich_irc_points("forward", base_energy=-303.5779) + rev = self._fake_rich_irc_points("reverse", base_energy=-303.5779) + with self._patch_parse_irc_path_by_index([fwd, rev]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + self.assertAlmostEqual(result["zero_energy_reference_hartree"], -303.6) + # relative_energy_kj_mol = (E - E_ref) * 2625.4996 + for p in result["points"]: + expected = ( + p["electronic_energy_hartree"] - (-303.6) + ) * 2625.4996 + self.assertAlmostEqual(p["relative_energy_kj_mol"], expected, places=4) + + def test_irc_zero_energy_reference_falls_back_to_opt_when_sp_level_differs(self): + # sp_level distinct from opt_level → adapter must NOT use the + # higher-level SP energy as the IRC zero reference; falls back to + # the TS opt's final energy (always at opt level by construction). + doc, proj = self._irc_fixture_production_shape(directions=("forward",)) + doc["sp_level"] = {"method": "ccsd(t)-f12a", "basis": "cc-pvtz-f12", + "software": "molpro"} + ts = doc["transition_states"][0] + ts["sp_energy_hartree"] = -303.99 # high-level, MUST NOT be picked + ts["opt_final_energy_hartree"] = -303.50 # opt-level, picked + fwd = self._fake_rich_irc_points("forward", base_energy=-303.5) + with self._patch_parse_irc_path_by_index([fwd]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + self.assertAlmostEqual(result["zero_energy_reference_hartree"], -303.50) + + def test_irc_zero_energy_reference_omitted_when_no_match_and_no_opt_energy(self): + # No opt_final_energy_hartree on the TS, sp_level differs → null + # reference. Per-point relative_energy_kj_mol is also omitted (the + # spec forbids fabrication). + doc, proj = self._irc_fixture_production_shape(directions=("forward",)) + doc["sp_level"] = {"method": "ccsd(t)-f12a", "basis": "cc-pvtz-f12", + "software": "molpro"} + ts = doc["transition_states"][0] + ts["sp_energy_hartree"] = -303.99 # high-level: ignored + ts["opt_final_energy_hartree"] = None + fwd = self._fake_rich_irc_points("forward", base_energy=-303.5) + with self._patch_parse_irc_path_by_index([fwd]): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + result = next( + c for c in payload["transition_state"]["calculations"] + if c["key"] == "ts_irc" + )["irc_result"] + self.assertNotIn("zero_energy_reference_hartree", result) + for p in result["points"]: + self.assertNotIn("relative_energy_kj_mol", p) + + def test_irc_falls_back_to_geometry_only_when_rich_parser_fails(self): + # parse_irc_path returns None → the adapter falls back to + # parse_irc_traj. Result still carries points + directions, but + # no per-point energies/RC/grads (and no zero reference). 
+        doc, proj = self._irc_fixture_production_shape()
+        with mock.patch("arc.parser.parser.parse_irc_path",
+                        side_effect=lambda log_file_path, raise_error=False: None):
+            with self._patch_parse_irc_by_index([
+                self._fake_irc_points("forward"),
+                self._fake_irc_points("reverse"),
+            ]):
+                _, _, payload = self._submit(output_doc=doc, project_directory=proj)
+        result = next(
+            c for c in payload["transition_state"]["calculations"]
+            if c["key"] == "ts_irc"
+        )["irc_result"]
+        self.assertEqual(result["point_count"], 4)
+        for p in result["points"]:
+            self.assertIn("geometry", p)
+            self.assertNotIn("electronic_energy_hartree", p)
+            self.assertNotIn("reaction_coordinate", p)
+            self.assertNotIn("max_gradient", p)
+
+    def test_irc_path_rich_parser_consumed_end_to_end_on_real_fixture(self):
+        # Integration: stage the real Gaussian IRC fixtures as the TS's
+        # irc_logs and confirm the rich parser flows through the adapter
+        # without any ESS-stub patching. This is the strongest signal that
+        # the parse_irc_path → adapter → payload chain works on production
+        # log shapes.
+        proj = tempfile.mkdtemp(prefix="arc-tckdb-irc-real-")
+        self.addCleanup(shutil.rmtree, proj, ignore_errors=True)
+        # Copy fixtures into the project tree so adapter path resolution
+        # produces an absolute path the parsers can read.
+        src_dir = pathlib.Path(__file__).resolve().parents[1] / "testing" / "irc"
+        f_log = pathlib.Path(proj) / "TS0_irc_f.log"
+        r_log = pathlib.Path(proj) / "TS0_irc_r.log"
+        shutil.copy(src_dir / "rxn_1_irc_1.out", f_log)
+        shutil.copy(src_dir / "rxn_1_irc_2.out", r_log)
+        doc = _reaction_output_doc(with_irc=True)
+        ts = doc["transition_states"][0]
+        ts["irc_logs"] = ["TS0_irc_f.log", "TS0_irc_r.log"]
+        ts["irc_log_directions"] = ["forward", "reverse"]
+        # Match the IRC fixtures' Gaussian level so the SP-level path
+        # gets exercised. The actual Hartree values are fixture-dependent,
+        # so we only assert the shape/wiring.
+        doc["sp_level"] = dict(doc["opt_level"])
+        ts["sp_energy_hartree"] = -303.6
+        _, _, payload = self._submit(output_doc=doc, project_directory=proj)
+        result = next(
+            c for c in payload["transition_state"]["calculations"]
+            if c["key"] == "ts_irc"
+        )["irc_result"]
+        # Each fixture has 50 stepped points; both branches → 100 points.
+        self.assertEqual(result["point_count"], 100)
+        self.assertTrue(result["has_forward"])
+        self.assertTrue(result["has_reverse"])
+        self.assertIn("zero_energy_reference_hartree", result)
+        # Spot-check that one forward + one reverse point carry the rich fields.
+        fwd_pt = next(p for p in result["points"] if p.get("direction") == "forward")
+        rev_pt = next(p for p in result["points"] if p.get("direction") == "reverse")
+        for p in (fwd_pt, rev_pt):
+            self.assertIn("electronic_energy_hartree", p)
+            self.assertIn("reaction_coordinate", p)
+            self.assertIn("max_gradient", p)
+            self.assertIn("rms_gradient", p)
+            self.assertIn("relative_energy_kj_mol", p)
+            self.assertIn("geometry", p)
+
+    def test_output_yml_does_not_contain_irc_path(self):
+        # Spec invariant: output.yml must NOT carry the full IRC path.
+        # arc/output.py's _spc_to_dict serializes only irc_logs /
+        # irc_log_directions / irc_converged for IRC — never the
+        # per-point energy/RC/grad arrays. We check the actual set of
+        # emitted dict keys (``d['<key>'] = ...``) against an allowlist
+        # of IRC-related fields and a forbidden list of rich IRC fields.
+        # Guards the boundary so future helpers that touch the ts_record
+        # post-parse can't leak rich IRC data into the human-readable
+        # summary.
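+        # Examples of what the key-extraction regex below captures:
+        #
+        #     d['irc_logs'] = ...           → "irc_logs"
+        #     d["irc_converged"] = ...      → "irc_converged"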
+ from arc.output import _spc_to_dict + import inspect, re + src = inspect.getsource(_spc_to_dict) + emitted_keys = set(re.findall(r"d\[['\"]([^'\"]+)['\"]\]\s*=", src)) + irc_emitted = {k for k in emitted_keys if "irc" in k} + # Allowlist mirrors the existing _spc_to_dict implementation — + # tighten this set if new IRC fields are intentionally added. + self.assertEqual( + irc_emitted, + {"irc_logs", "irc_log_directions", "irc_converged"}, + f"unexpected IRC fields emitted to output.yml: {irc_emitted}", + ) + # Belt-and-suspenders: rich-parser per-point field names must + # never appear as emitted keys. + forbidden_fields = { + "irc_path", "irc_points", "irc_result", + "irc_path_points", "irc_relative_energies", + } + self.assertFalse( + emitted_keys & forbidden_fields, + f"rich IRC fields leaked into output.yml: " + f"{emitted_keys & forbidden_fields}", + ) + + # ---------------- 8: species and TS calc provenance preserved + def test_calc_provenance_input_geometries_threaded(self): + _, _, payload = self._submit() + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + for calc in r0["calculations"]: + # freq + sp explicitly carry the conformer's optimized xyz + self.assertIn("input_geometries", calc) + self.assertEqual(len(calc["input_geometries"]), 1) + + def test_calc_provenance_depends_on_edges(self): + _, _, payload = self._submit() + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + freq_calc = next(c for c in r0["calculations"] if c["key"] == "r0_freq") + sp_calc = next(c for c in r0["calculations"] if c["key"] == "r0_sp") + self.assertEqual( + freq_calc["depends_on"], + [{"parent_calculation_key": "r0_opt", "role": "freq_on"}], + ) + self.assertEqual( + sp_calc["depends_on"], + [{"parent_calculation_key": "r0_opt", "role": "single_point_on"}], + ) + + def test_ts_calc_provenance_depends_on_edges(self): + _, _, payload = self._submit() + ts = payload["transition_state"] + ts_freq = next(c for c in ts["calculations"] if c["key"] == "ts_freq") + ts_sp = next(c for c in ts["calculations"] if c["key"] == "ts_sp") + self.assertEqual( + ts_freq["depends_on"], + [{"parent_calculation_key": "ts_opt", "role": "freq_on"}], + ) + self.assertEqual( + ts_sp["depends_on"], + [{"parent_calculation_key": "ts_opt", "role": "single_point_on"}], + ) + + # ---------------- 9: artifact inclusion when enabled + def test_artifacts_inlined_under_correct_calculations(self): + # Set up artifact upload + actual files on disk + proj = tempfile.mkdtemp(prefix="arc-tckdb-rxn-art-") + self.addCleanup(shutil.rmtree, proj, ignore_errors=True) + opt_log = pathlib.Path(proj) / "r0_opt.log" + opt_log.write_text("dummy log content") + cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_reaction", + artifacts=TCKDBArtifactConfig(upload=True, kinds=("output_log",)), + ) + doc = _reaction_output_doc() + cho = next(s for s in doc["species"] if s["label"] == "CHO") + cho["opt_log"] = "r0_opt.log" + _, _, payload = self._submit(output_doc=doc, cfg=cfg, project_directory=proj) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + primary = r0["conformers"][0]["calculation"] + self.assertIn("artifacts", primary) + self.assertEqual(primary["artifacts"][0]["kind"], "output_log") + self.assertEqual(primary["artifacts"][0]["filename"], "r0_opt.log") + + # ---------------- 10: deterministic idempotency + def 
test_payload_and_idempotency_are_deterministic(self): + outcome1, _, payload1 = self._submit() + outcome2, _, payload2 = self._submit() + self.assertEqual(payload1, payload2) + self.assertEqual(outcome1.idempotency_key, outcome2.idempotency_key) + + def test_idempotency_changes_with_kinetics_change(self): + outcome1, _, _ = self._submit() + rxn = _reaction_record() + rxn["kinetics"]["A"] = 0.5 + outcome2, _, _ = self._submit(reaction=rxn) + self.assertNotEqual(outcome1.idempotency_key, outcome2.idempotency_key) + + # ---------------- 11: sidecar shape (payload_kind + endpoint) + def test_sidecar_payload_kind_and_endpoint(self): + outcome, _, _ = self._submit() + sidecar = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sidecar["payload_kind"], COMPUTED_REACTION_KIND) + self.assertEqual(sidecar["endpoint"], COMPUTED_REACTION_ENDPOINT) + # status updates from "pending" → "uploaded" via the stub client + self.assertEqual(sidecar["status"], "uploaded") + self.assertIn("payload_file", sidecar) + + def test_offline_skipped_status_when_upload_false(self): + cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + upload=False, + project_label="proj-A", + upload_mode="computed_reaction", + ) + outcome, client, _ = self._submit(cfg=cfg) + self.assertEqual(outcome.status, "skipped") + self.assertEqual(client.calls, []) + sidecar = json.loads(outcome.sidecar_path.read_text()) + self.assertEqual(sidecar["payload_kind"], COMPUTED_REACTION_KIND) + self.assertEqual(sidecar["status"], "skipped") + + # ---------------- 12: no DB IDs in payload + def test_payload_contains_no_db_ids(self): + _, _, payload = self._submit() + text = json.dumps(payload) + for token in ("species_entry_id", "calculation_id", "thermo_id", + "conformer_observation_id", "reaction_id"): + self.assertNotIn(token, text, f"unexpected DB-id field {token!r} in payload") + + # ---------------- 13: missing reactant raises with clear error + def test_missing_reactant_label_raises(self): + rxn = _reaction_record() + rxn["reactant_labels"] = ["CHO", "DOES_NOT_EXIST"] + adapter = self._adapter(_StubClient()) + with self.assertRaises(ValueError) as ctx: + adapter._build_computed_reaction_payload( + output_doc=_reaction_output_doc(), reaction_record=rxn, + ) + self.assertIn("DOES_NOT_EXIST", str(ctx.exception)) + + # ---------------- 14: kinetics with no Arrhenius fields → no kinetics list + def test_no_kinetics_omits_kinetics_field(self): + rxn = _reaction_record(with_kinetics=False) + _, _, payload = self._submit(reaction=rxn) + self.assertNotIn("kinetics", payload) + + # ---------------- 15: server-side schema validation (smoke; skipped when unavailable) + def test_payload_validates_against_tckdb_schema(self): + # The adapter targets the live TCKDB ``ComputedReactionUploadRequest`` + # schema. When the backend pydantic schema is importable in the + # current env (it usually isn't from ARC's env), validate the + # payload through it. Otherwise skip — this is a smoke test, not + # a hard requirement. Uses the structured-IRC fixture so the new + # ``irc_result`` / ``output_geometries(role=irc_*)`` / + # ``depends_on(role=irc_start)`` shapes are all exercised by + # ``ComputedReactionUploadRequest.model_validate`` when run. 
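+        # (The sys.path insert below points at a local TCKDB checkout; when
+        # that path, or the backend's pydantic stack, is unavailable, the
+        # test self-skips rather than failing.)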
+ try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + doc, proj = self._irc_fixture_with_logs_on_disk() + with self._patch_parse_irc(): + _, _, payload = self._submit(output_doc=doc, project_directory=proj) + ComputedReactionUploadRequest.model_validate(payload) + + # ---------------- AEC/BAC routing (species + TS) + def _doc_with_corrections(self, *, attach_to_ts=True): + """Reaction output.yml fixture with applied_energy_corrections on + every reactant/product and (optionally) on the TS. Lets each test + focus on the routing assertion without re-assembling the doc.""" + doc = _reaction_output_doc() + for sp in doc["species"]: + sp["applied_energy_corrections"] = [_aec_record(), _pbac_record()] + if attach_to_ts: + ts = doc["transition_states"][0] + ts["applied_energy_corrections"] = [_aec_record(), _mbac_record()] + return doc + + def _species_block(self, payload, key): + return next(s for s in payload["species"] if s["key"] == key) + + def test_reactant_species_aec_bac_in_species_block(self): + # Both reactants carry AEC + BAC; the producer must put them on + # the BundleSpeciesIn (not at top-level, not on the TS). + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + for key in ("r0_CHO", "r1_CH4"): + sp = self._species_block(payload, key) + self.assertIn("applied_energy_corrections", sp) + roles = [e["application_role"] for e in sp["applied_energy_corrections"]] + self.assertEqual(roles, ["aec_total", "bac_total"]) + + def test_product_species_aec_bac_in_species_block(self): + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + for key in ("p0_CH2O", "p1_CH3"): + sp = self._species_block(payload, key) + self.assertIn("applied_energy_corrections", sp) + roles = [e["application_role"] for e in sp["applied_energy_corrections"]] + self.assertEqual(roles, ["aec_total", "bac_total"]) + + def test_ts_aec_bac_in_transition_state_block(self): + # TS-side corrections must land on the TS block (server routes them + # to target_transition_state_entry_id), NOT on a species block and + # NOT on a top-level field. + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + ts = payload["transition_state"] + self.assertIn("applied_energy_corrections", ts) + self.assertEqual( + [e["application_role"] for e in ts["applied_energy_corrections"]], + ["aec_total", "bac_total"], + ) + # Bundle has no top-level applied_energy_corrections field for reactions. + self.assertNotIn("applied_energy_corrections", payload) + + def test_reactant_correction_uses_own_sp_key_r0_sp(self): + # Each species's corrections must anchor to that species's own SP + # calc — never a sibling's, never the TS's. 
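+        # Key-scoping recap from the fixtures: reactants are r{i}_*, products
+        # p{i}_*, and the TS ts_*; so "own SP" for r0_CHO is exactly r0_sp.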
+ _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + sp = self._species_block(payload, "r0_CHO") + for entry in sp["applied_energy_corrections"]: + self.assertEqual(entry["source_calculation_key"], "r0_sp") + + def test_product_correction_uses_own_sp_key_p1_sp(self): + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + sp = self._species_block(payload, "p1_CH3") + for entry in sp["applied_energy_corrections"]: + self.assertEqual(entry["source_calculation_key"], "p1_sp") + + def test_ts_correction_uses_ts_sp_key(self): + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + ts = payload["transition_state"] + for entry in ts["applied_energy_corrections"]: + self.assertEqual(entry["source_calculation_key"], "ts_sp") + + def test_species_corrections_never_use_ts_sp(self): + # Defensive: if a future bug let TS routing leak into the species + # path, the server would 422 (cross-owner reference). Pin it here + # so the regression is caught at the producer. + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + for sp in payload["species"]: + for entry in sp.get("applied_energy_corrections") or []: + self.assertNotEqual(entry.get("source_calculation_key"), "ts_sp") + + def test_ts_corrections_never_use_species_sp_keys(self): + # And the inverse: TS must not pick up a species's SP key. + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + ts = payload["transition_state"] + species_sp_keys = {"r0_sp", "r1_sp", "p0_sp", "p1_sp"} + for entry in ts.get("applied_energy_corrections") or []: + self.assertNotIn( + entry.get("source_calculation_key"), species_sp_keys, + ) + + def test_components_preserved_for_aec_and_pbac(self): + # AEC components (atom × parameter contribution) and PBAC components + # (bond × parameter contribution) survive the producer→bundle + # translation; the producer doesn't dedupe or recompute them. + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + sp = self._species_block(payload, "r0_CHO") + aec = next(e for e in sp["applied_energy_corrections"] + if e["application_role"] == "aec_total") + self.assertEqual(len(aec["components"]), 2) + self.assertEqual([c["key"] for c in aec["components"]], ["C", "H"]) + bac = next(e for e in sp["applied_energy_corrections"] + if e["application_role"] == "bac_total") + self.assertEqual(len(bac["components"]), 1) + + def test_parameter_unit_stripped_from_components(self): + # output.yml carries parameter_unit on each component for clarity; + # TCKDB's AppliedCorrectionComponentPayload rejects unknown fields. + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + for sp in payload["species"]: + for entry in sp.get("applied_energy_corrections") or []: + for c in entry["components"]: + self.assertNotIn("parameter_unit", c) + ts = payload["transition_state"] + for entry in ts.get("applied_energy_corrections") or []: + for c in entry["components"]: + self.assertNotIn("parameter_unit", c) + + def test_mbac_total_only_no_components_on_ts(self): + # Melius BAC is a pairwise atom-pair function with a multiplicity + # correction; per-bond decomposition isn't safe, so the producer + # ships total-only. 
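+        # (A Melius-type correction sums distance-dependent terms over atom
+        # pairs plus a molecule-level spin term, so there is simply no
+        # per-bond split to report; components stays [].)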
+ _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + ts = payload["transition_state"] + bac = next(e for e in ts["applied_energy_corrections"] + if e["application_role"] == "bac_total") + self.assertEqual(bac["scheme"]["kind"], "bac_melius") + self.assertEqual(bac["components"], []) + + def test_existing_payload_unchanged_when_no_corrections(self): + # Backward compat: docs without applied_energy_corrections on + # species or TS produce a bundle with no applied_energy_corrections + # field. Older output.yml consumers continue to work. + _, _, payload = self._submit() # default fixture has no corrections + for sp in payload["species"]: + self.assertNotIn("applied_energy_corrections", sp) + ts = payload["transition_state"] + self.assertNotIn("applied_energy_corrections", ts) + + def test_correction_payload_validates_against_live_schema(self): + # End-to-end pydantic validation when the live schema is reachable. + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, _, payload = self._submit(output_doc=self._doc_with_corrections()) + ComputedReactionUploadRequest.model_validate(payload) + + def test_scheme_atom_and_bond_params_reach_bundle(self): + # Integration: if output.yml carries scheme.atom_params / + # scheme.bond_params, the bundle's BundleSpeciesIn / + # BundleTransitionStateIn applied_energy_corrections preserve + # them. This is the regression test for the original empty + # ``energy_correction_scheme_atom_param`` / + # ``energy_correction_scheme_bond_param`` tables. + doc = self._doc_with_corrections() + for sp in doc["species"]: + for entry in sp["applied_energy_corrections"]: + if entry["scheme"]["kind"] == "atom_energy": + entry["scheme"]["atom_params"] = [ + {"element": "C", "value": -37.84706}, + {"element": "H", "value": -0.50066}, + ] + elif entry["scheme"]["kind"] == "bac_petersson": + entry["scheme"]["bond_params"] = [ + {"bond_key": "C-H", "value": -0.17350}, + ] + ts = doc["transition_states"][0] + for entry in ts["applied_energy_corrections"]: + if entry["scheme"]["kind"] == "atom_energy": + entry["scheme"]["atom_params"] = [ + {"element": "C", "value": -37.84706}, + {"element": "H", "value": -0.50066}, + ] + _, _, payload = self._submit(output_doc=doc) + # Pick one reactant + the TS as the canonical assertion; + # routing tests already cover all 4 species. + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + aec = next(e for e in r0["applied_energy_corrections"] + if e["scheme"]["kind"] == "atom_energy") + self.assertEqual( + aec["scheme"]["atom_params"], + [{"element": "C", "value": -37.84706}, + {"element": "H", "value": -0.50066}], + ) + bac = next(e for e in r0["applied_energy_corrections"] + if e["scheme"]["kind"] == "bac_petersson") + self.assertEqual( + bac["scheme"]["bond_params"], + [{"bond_key": "C-H", "value": -0.17350}], + ) + ts_aec = next( + e for e in payload["transition_state"]["applied_energy_corrections"] + if e["scheme"]["kind"] == "atom_energy" + ) + self.assertEqual(len(ts_aec["scheme"]["atom_params"]), 2) + + def test_scheme_params_payload_validates_against_live_schema(self): + # Live-schema validation specifically for the scheme-params path: + # SchemeAtomParamPayload / SchemeBondParamPayload are accepted on + # EnergyCorrectionSchemeRef and must not be rejected as extras. 
+ try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + doc = self._doc_with_corrections() + for sp in doc["species"]: + for entry in sp["applied_energy_corrections"]: + if entry["scheme"]["kind"] == "atom_energy": + entry["scheme"]["atom_params"] = [ + {"element": "C", "value": -37.84706}, + {"element": "H", "value": -0.50066}, + ] + elif entry["scheme"]["kind"] == "bac_petersson": + entry["scheme"]["bond_params"] = [ + {"bond_key": "C-H", "value": -0.17350}, + ] + _, _, payload = self._submit(output_doc=doc) + ComputedReactionUploadRequest.model_validate(payload) + + # ---------------- frequency-scale-factor on reaction species blocks + def _doc_with_fsf(self, *, value=0.988, + source="J. Phys. Chem. A 2007, 111, 11683", + include_freq_level=True): + """Reaction output doc enriched with FSF metadata at the run level. + + Mirrors :class:`TestComputedSpeciesStatmechFreqScaleFactor`'s + ``_doc_with_fsf`` so the same producer fields drive both modes. + """ + doc = _reaction_output_doc() + doc["freq_scale_factor"] = value + doc["freq_scale_factor_source"] = source + if include_freq_level: + doc["freq_level"] = { + "method": "wb97xd", "basis": "def2-tzvp", "software": "gaussian", + } + return doc + + def test_each_species_block_carries_statmech_with_fsf(self): + # Every reactant + product BundleSpeciesIn must get its own + # statmech.frequency_scale_factor when ARC has FSF metadata. + # Without this, the server's frequency_scale_factor table never + # populates even though the run-level data is in output.yml. + _, _, payload = self._submit(output_doc=self._doc_with_fsf()) + for sp in payload["species"]: + self.assertIn("statmech", sp) + sm = sp["statmech"] + self.assertIn("freq_scale_factor", sm) + fsf = sm["freq_scale_factor"] + self.assertAlmostEqual(fsf["value"], 0.988) + self.assertEqual(fsf["scale_kind"], "fundamental") + self.assertEqual(fsf["level_of_theory"]["method"], "wb97xd") + + def test_species_statmech_emits_scoped_source_calculations(self): + # Schema expansion: ``BundleStatmechIn`` now accepts + # ``source_calculations``. Each reactant/product must reference + # only its own scoped calc keys (r0_*, r1_*, p0_*, p1_*) — the + # workflow's owner-consistency check rejects cross-species and + # TS references. + _, _, payload = self._submit(output_doc=self._doc_with_fsf()) + for sp in payload["species"]: + self.assertIn("statmech", sp) + sources = sp["statmech"].get("source_calculations") or [] + # Default fixture carries opt + freq + sp on every species. + roles = [sc["role"] for sc in sources] + self.assertEqual(sorted(roles), ["freq", "opt", "sp"]) + own_prefix = sp["key"].split("_", 1)[0] + "_" + for sc in sources: + self.assertTrue( + sc["calculation_key"].startswith(own_prefix), + f"{sp['key']} statmech references foreign calc " + f"{sc['calculation_key']!r} (expected prefix " + f"{own_prefix!r})", + ) + self.assertFalse(sc["calculation_key"].startswith("ts_")) + + def test_bare_citation_maps_to_note_not_literature(self): + # Same policy as computed-species: a free-text citation is a + # provenance breadcrumb, not a structured Literature row. The + # producer must never invent a Literature entity from a string. + citation = "J. Phys. Chem. 
A 2007, 111, 11683" + _, _, payload = self._submit( + output_doc=self._doc_with_fsf(source=citation), + ) + for sp in payload["species"]: + fsf = sp["statmech"]["freq_scale_factor"] + self.assertEqual(fsf["note"], citation) + self.assertNotIn("source_literature", fsf) + + def test_missing_fsf_omits_statmech_block(self): + # Strict: when no freq_scale_factor is in the doc, every species + # block omits ``statmech`` entirely. Empty containers create + # useless server-side rows and would be misleading. + _, _, payload = self._submit() # default fixture has no FSF + for sp in payload["species"]: + self.assertNotIn("statmech", sp) + + def test_existing_payload_unchanged_when_fsf_absent(self): + # Backward compat: the same payload that today's fixture + # produces (no FSF) must remain structurally identical to before + # this change. Concrete invariants the rest of the suite relies on: + # species blocks have key/species_entry/conformers/calculations + # but no statmech (and no surprise extra fields). + _, _, payload = self._submit() + for sp in payload["species"]: + self.assertIn("key", sp) + self.assertIn("species_entry", sp) + self.assertIn("conformers", sp) + self.assertIn("calculations", sp) + self.assertNotIn("statmech", sp) + + def test_fsf_payload_validates_against_live_schema(self): + # Live-schema smoke for the FSF path. Skipped when pydantic / + # backend isn't reachable in the active env. + try: + import sys as _sys + _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend") + from app.schemas.workflows.computed_reaction_upload import ( + ComputedReactionUploadRequest, + ) + except Exception: + self.skipTest("TCKDB backend pydantic schema not importable in this env") + _, _, payload = self._submit(output_doc=self._doc_with_fsf()) + ComputedReactionUploadRequest.model_validate(payload) + + # ---------------- 16: endpoint is /uploads/computed-reaction + def test_post_target_endpoint(self): + _, client, _ = self._submit() + self.assertEqual(len(client.calls), 1) + self.assertEqual(client.calls[0]["path"], COMPUTED_REACTION_ENDPOINT) + + +# --------------------------------------------------------------------------- +# Per-species applied_energy_corrections (output.yml -> bundle) +# --------------------------------------------------------------------------- + + +def _aec_record(value=-0.02345, *, with_components=True, parameter_unit="hartree"): + """A representative output.yml AEC entry produced by ARC's correction script.""" + rec = { + "application_role": "aec_total", + "value": value, + "value_unit": "hartree", + "scheme": { + "kind": "atom_energy", + "name": "atom_energy", + "level_of_theory": {"method": "wb97xd3", "basis": "def2tzvp", "software": "qchem"}, + "units": "hartree", + "version": None, + "source_literature": None, + "note": "Per-species AEC computed by Arkane.", + }, + "components": [], + } + if with_components: + rec["components"] = [ + {"component_kind": "atom", "key": "C", "multiplicity": 1, + "parameter_value": -37.84993993, "parameter_unit": parameter_unit, + "contribution_value": -0.0153}, + {"component_kind": "atom", "key": "H", "multiplicity": 4, + "parameter_value": -0.49991749, "parameter_unit": parameter_unit, + "contribution_value": -0.00815}, + ] + return rec + + +def _pbac_record(value=-0.694): + return { + "application_role": "bac_total", + "value": value, + "value_unit": "kcal_mol", + "scheme": { + "kind": "bac_petersson", + "name": "bac_petersson", + "level_of_theory": {"method": "wb97xd3", "basis": "def2tzvp", "software": "qchem"}, + "units": "kcal_mol", 
+ "version": None, + "source_literature": None, + "note": "Per-species BAC computed by Arkane (bac_type=p).", + }, + "components": [ + {"component_kind": "bond", "key": "C-H", "multiplicity": 4, + "parameter_value": -0.1735, "parameter_unit": "kcal_mol", + "contribution_value": -0.694}, + ], + } + + +def _mbac_record(value=-0.056): + return { + "application_role": "bac_total", + "value": value, + "value_unit": "kcal_mol", + "scheme": { + "kind": "bac_melius", + "name": "bac_melius", + "level_of_theory": {"method": "wb97xd3", "basis": "def2tzvp", "software": "qchem"}, + "units": "kcal_mol", + "version": None, + "source_literature": None, + "note": "Per-species BAC computed by Arkane (bac_type=m).", + }, + "components": [], + } + + +class TestBuildAppliedEnergyCorrectionsHelper(unittest.TestCase): + """Direct unit tests for `_build_applied_energy_corrections`.""" + + def setUp(self): + from arc.tckdb.adapter import _build_applied_energy_corrections + self._build = _build_applied_energy_corrections + + def test_aec_passthrough_with_components(self): + out = self._build([_aec_record()], source_calculation_key="sp") + self.assertEqual(len(out), 1) + entry = out[0] + self.assertEqual(entry["application_role"], "aec_total") + self.assertAlmostEqual(entry["value"], -0.02345) + self.assertEqual(entry["value_unit"], "hartree") + self.assertEqual(entry["scheme"]["kind"], "atom_energy") + self.assertEqual(entry["scheme"]["name"], "atom_energy") + self.assertEqual(len(entry["components"]), 2) + # parameter_unit must be stripped — not in TCKDB component schema + for c in entry["components"]: + self.assertNotIn("parameter_unit", c) + self.assertIn("component_kind", c) + self.assertIn("contribution_value", c) + self.assertEqual(entry["source_calculation_key"], "sp") + + def test_pbac_passthrough(self): + out = self._build([_pbac_record()], source_calculation_key="sp") + self.assertEqual(len(out), 1) + entry = out[0] + self.assertEqual(entry["application_role"], "bac_total") + self.assertEqual(entry["scheme"]["kind"], "bac_petersson") + self.assertEqual(entry["value_unit"], "kcal_mol") + self.assertEqual(len(entry["components"]), 1) + + def test_mbac_total_only_no_components(self): + out = self._build([_mbac_record()], source_calculation_key="sp") + self.assertEqual(len(out), 1) + entry = out[0] + self.assertEqual(entry["scheme"]["kind"], "bac_melius") + self.assertEqual(entry["components"], []) + + def test_omits_source_calculation_key_when_none_passed(self): + out = self._build([_aec_record()], source_calculation_key=None) + self.assertEqual(len(out), 1) + self.assertNotIn("source_calculation_key", out[0]) + + def test_drops_components_with_null_parameter_value(self): + rec = _aec_record() + rec["components"][0]["parameter_value"] = None + out = self._build([rec], source_calculation_key="sp") + self.assertEqual(len(out[0]["components"]), 1) # one dropped + self.assertEqual(out[0]["components"][0]["key"], "H") + + def test_skips_record_with_null_value(self): + rec = _aec_record() + rec["value"] = None + out = self._build([rec], source_calculation_key="sp") + self.assertEqual(out, []) + + def test_skips_record_without_scheme(self): + rec = _aec_record() + rec["scheme"] = None + out = self._build([rec], source_calculation_key="sp") + self.assertEqual(out, []) + + def test_empty_input_returns_empty_list(self): + self.assertEqual(self._build([], source_calculation_key="sp"), []) + self.assertEqual(self._build(None, source_calculation_key="sp"), []) + + def test_aec_and_bac_both_emitted(self): + out = 
self._build( + [_aec_record(), _pbac_record()], + source_calculation_key="sp", + ) + roles = [e["application_role"] for e in out] + self.assertEqual(roles, ["aec_total", "bac_total"]) + + def test_strips_software_from_scheme_level_of_theory(self): + """TCKDB ``LevelOfTheoryRef`` does not accept ``software`` — record + software via per-calc software_release elsewhere. The adapter must + project the ARC LoT dict onto method/basis/aux_basis/cabs_basis.""" + out = self._build([_aec_record()], source_calculation_key="sp") + lot = out[0]["scheme"]["level_of_theory"] + self.assertEqual(lot, {"method": "wb97xd3", "basis": "def2tzvp"}) + self.assertNotIn("software", lot) + + def test_scoped_source_calculation_key_passthrough(self): + """Reaction-mode callers pass scoped keys (r0_sp / p1_sp / ts_sp). + The helper must stamp them verbatim — it does not know about modes.""" + for key in ("r0_sp", "p0_sp", "p1_sp", "ts_sp"): + out = self._build([_aec_record()], source_calculation_key=key) + self.assertEqual(out[0]["source_calculation_key"], key) + + def test_scheme_atom_params_pass_through(self): + # output.yml ships the AEC parameter table as scheme.atom_params; + # the adapter must preserve it so TCKDB persists + # energy_correction_scheme_atom_param rows. Without this, the + # applied correction lands but the scheme has no parameters + # backing it. + rec = _aec_record() + rec["scheme"]["atom_params"] = [ + {"element": "C", "value": -37.84706}, + {"element": "H", "value": -0.50066}, + ] + out = self._build([rec], source_calculation_key="sp") + self.assertEqual( + out[0]["scheme"]["atom_params"], + [{"element": "C", "value": -37.84706}, + {"element": "H", "value": -0.50066}], + ) + + def test_scheme_bond_params_pass_through(self): + rec = _pbac_record() + rec["scheme"]["bond_params"] = [ + {"bond_key": "C-H", "value": -0.17350}, + {"bond_key": "C=O", "value": -2.63454}, + ] + out = self._build([rec], source_calculation_key="sp") + self.assertEqual( + out[0]["scheme"]["bond_params"], + [{"bond_key": "C-H", "value": -0.17350}, + {"bond_key": "C=O", "value": -2.63454}], + ) + + def test_scheme_params_absent_means_no_field_in_payload(self): + # Backward compat: schemes without parameter tables continue to + # produce a payload without those fields — TCKDB defaults the + # respective param lists to [] via the schema's default_factory. 
+ out = self._build([_aec_record()], source_calculation_key="sp") + scheme = out[0]["scheme"] + for k in ("atom_params", "bond_params", "component_params"): + self.assertNotIn(k, scheme) + + +class TestComputedSpeciesAppliedCorrectionsBundle(unittest.TestCase): + """End-to-end: output.yml -> computed-species bundle preserves corrections.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-applied-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_species", + ) + + def _adapter(self, client): + return TCKDBAdapter(self.cfg, client_factory=lambda c, k: client) + + def _submit_with_corrections(self, applied): + record = _full_record() + record["applied_energy_corrections"] = applied + client = _StubClient(response=_StubResponse({ + "species_entry_id": 7, + "conformers": [{ + "key": "conf0", + "conformer_group_id": 3, + "conformer_observation_id": 11, + "primary_calculation": {"key": "opt", "calculation_id": 100, "type": "opt", "role": "primary"}, + "additional_calculations": [ + {"key": "freq", "calculation_id": 101, "type": "freq", "role": "additional"}, + {"key": "sp", "calculation_id": 102, "type": "sp", "role": "additional"}, + ], + }], + "thermo": {"thermo_id": 9}, + })) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=_fake_output_doc(), + species_record=record, + ) + return json.loads(outcome.payload_path.read_text()) + + def test_bundle_carries_applied_energy_corrections(self): + payload = self._submit_with_corrections([_aec_record(), _pbac_record()]) + self.assertIn("applied_energy_corrections", payload) + self.assertEqual(len(payload["applied_energy_corrections"]), 2) + roles = [e["application_role"] for e in payload["applied_energy_corrections"]] + self.assertEqual(roles, ["aec_total", "bac_total"]) + + def test_bundle_omits_block_when_empty(self): + payload = self._submit_with_corrections([]) + self.assertNotIn("applied_energy_corrections", payload) + + def test_bundle_omits_block_when_record_lacks_field(self): + # Backwards-compat with output.yml versions that don't emit the field. 
+        record = _full_record()
+        record.pop("applied_energy_corrections", None)
+        client = _StubClient(response=_StubResponse({
+            "species_entry_id": 1,
+            "conformers": [{"key": "conf0", "primary_calculation": {"key": "opt", "calculation_id": 100, "type": "opt", "role": "primary"}, "additional_calculations": []}],
+        }))
+        adapter = self._adapter(client)
+        with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}):
+            outcome = adapter.submit_computed_species_from_output(
+                output_doc=_fake_output_doc(),
+                species_record=record,
+            )
+        payload = json.loads(outcome.payload_path.read_text())
+        self.assertNotIn("applied_energy_corrections", payload)
+
+    def test_payload_validates_against_live_schema(self):
+        """If the TCKDB backend is importable, the produced payload must
+        validate against the live ComputedSpeciesUploadRequest schema —
+        guarding against silent shape drift in the output.yml -> bundle
+        translation."""
+        try:
+            import sys as _sys
+            _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend")
+            from app.schemas.workflows.computed_species_upload import (
+                ComputedSpeciesUploadRequest,
+            )
+        except Exception:
+            self.skipTest("TCKDB backend pydantic schema not importable in this env")
+        payload = self._submit_with_corrections(
+            [_aec_record(), _pbac_record()]
+        )
+        ComputedSpeciesUploadRequest.model_validate(payload)
+
+    def test_mbac_payload_validates_against_live_schema(self):
+        try:
+            import sys as _sys
+            _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend")
+            from app.schemas.workflows.computed_species_upload import (
+                ComputedSpeciesUploadRequest,
+            )
+        except Exception:
+            self.skipTest("TCKDB backend pydantic schema not importable in this env")
+        payload = self._submit_with_corrections(
+            [_aec_record(), _mbac_record()]
+        )
+        ComputedSpeciesUploadRequest.model_validate(payload)
+
+
+class TestCalculationConstraints(unittest.TestCase):
+    """Held-fixed coordinate constraint emission into TCKDB calc payloads.
+
+    Covers the wiring between ARC's parser-shaped constraint dicts and the
+    TCKDB ``CalculationWithResultsPayload.constraints`` field on both the
+    primary opt and additional (freq/sp/scan) calcs of a computed-species
+    bundle. Reaction bundles share the same ``_build_calc_in_bundle``
+    plumbing, so these tests cover the reaction path indirectly.
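+
+    For orientation (mirroring the assertions below rather than quoting a
+    schema), a parser-shaped entry such as
+    ``{"constraint_kind": "bond", "atoms": [1, 2], "target_value": 1.45}``
+    is expected to project onto a payload row like
+    ``{"constraint_index": 1, "constraint_kind": "bond", "atom1_index": 1,
+    "atom2_index": 2, "target_value": 1.45}`` — ``atoms`` fans out into
+    ``atomN_index`` fields, indices are 1-based, and ``target_value=None``
+    is omitted from the row.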
+ """ + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-constraints-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_species", + ) + + def _adapter(self, client): + return TCKDBAdapter( + self.cfg, + project_directory=None, + client_factory=lambda c, k: client, + ) + + def _submit(self, record): + client = _StubClient(response=_StubResponse({ + "species_entry_id": 7, + "conformers": [{ + "key": "conf0", + "conformer_group_id": 3, + "conformer_observation_id": 11, + "primary_calculation": { + "key": "opt", "calculation_id": 100, + "type": "opt", "role": "primary", + }, + "additional_calculations": [ + {"key": "freq", "calculation_id": 101, + "type": "freq", "role": "additional"}, + {"key": "sp", "calculation_id": 102, + "type": "sp", "role": "additional"}, + ], + }], + "thermo": {"thermo_id": 9}, + })) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_species_from_output( + output_doc=_fake_output_doc(), + species_record=record, + ) + return json.loads(outcome.payload_path.read_text()) + + def test_opt_constraints_appear_on_primary_calculation(self): + record = _full_record() + record["opt_constraints"] = [ + {"constraint_kind": "bond", "atoms": [1, 2], "target_value": 1.45}, + {"constraint_kind": "angle", "atoms": [1, 2, 3], "target_value": None}, + ] + payload = self._submit(record) + primary = payload["conformers"][0]["primary_calculation"] + self.assertIn("constraints", primary) + # constraint_index starts at 1 and is deterministic + self.assertEqual(primary["constraints"][0]["constraint_index"], 1) + self.assertEqual(primary["constraints"][1]["constraint_index"], 2) + # bond carries a target_value, angle omits it (target_value=None) + self.assertEqual(primary["constraints"][0]["constraint_kind"], "bond") + self.assertEqual(primary["constraints"][0]["atom1_index"], 1) + self.assertEqual(primary["constraints"][0]["atom2_index"], 2) + self.assertNotIn("atom3_index", primary["constraints"][0]) + self.assertAlmostEqual(primary["constraints"][0]["target_value"], 1.45) + self.assertEqual(primary["constraints"][1]["constraint_kind"], "angle") + self.assertNotIn("target_value", primary["constraints"][1]) + + def test_constraints_omitted_when_record_field_absent(self): + record = _full_record() + # No opt_constraints / freq_constraints / sp_constraints set. 
+ payload = self._submit(record) + primary = payload["conformers"][0]["primary_calculation"] + self.assertNotIn("constraints", primary) + for additional in payload["conformers"][0]["additional_calculations"]: + self.assertNotIn("constraints", additional) + + def test_freq_and_sp_constraints_are_isolated_per_calc(self): + record = _full_record() + record["freq_constraints"] = [ + {"constraint_kind": "dihedral", "atoms": [1, 2, 3, 4], + "target_value": 180.0}, + ] + record["sp_constraints"] = [ + {"constraint_kind": "cartesian_atom", "atoms": [5], + "target_value": None}, + ] + payload = self._submit(record) + by_key = {c["key"]: c for c in payload["conformers"][0]["additional_calculations"]} + self.assertEqual(by_key["freq"]["constraints"][0]["constraint_kind"], "dihedral") + self.assertEqual(by_key["freq"]["constraints"][0]["atom4_index"], 4) + self.assertEqual(by_key["sp"]["constraints"][0]["constraint_kind"], "cartesian_atom") + self.assertEqual(by_key["sp"]["constraints"][0]["atom1_index"], 5) + self.assertNotIn("atom2_index", by_key["sp"]["constraints"][0]) + # No cross-talk back onto the opt calc. + primary = payload["conformers"][0]["primary_calculation"] + self.assertNotIn("constraints", primary) + + def test_scan_calc_constraints_attached_inline_via_additional_calculations(self): + # The scan loop uses ``source_constraints`` from the per-entry + # ``constraints`` field, so a scan can hold frozen coordinates + # alongside its active scan coordinate without duplication. + record = _full_record() + record["additional_calculations"] = [{ + "key": "scan_rotor_0", + "type": "scan", + "scan_result": { + "dimension": 1, + "is_relaxed": True, + "coordinates": [{ + "coordinate_index": 1, + "coordinate_kind": "dihedral", + "atom1_index": 1, "atom2_index": 2, + "atom3_index": 3, "atom4_index": 4, + "step_count": 36, "value_unit": "deg", + }], + "points": [], + }, + "constraints": [ + {"constraint_kind": "bond", "atoms": [5, 6], + "target_value": 1.20}, + ], + }] + payload = self._submit(record) + scans = [c for c in payload["conformers"][0]["additional_calculations"] + if c.get("type") == "scan"] + self.assertEqual(len(scans), 1) + self.assertEqual(scans[0]["constraints"][0]["constraint_kind"], "bond") + self.assertEqual(scans[0]["constraints"][0]["atom1_index"], 5) + # The scanned dihedral lives in scan_result.coordinates[], NOT in + # the constraints list. + self.assertNotIn( + "dihedral", + [c["constraint_kind"] for c in scans[0]["constraints"]], + ) + + def test_invalid_constraints_filtered_silently(self): + # Best-effort contract: a malformed entry must be dropped, not + # explode the whole payload. 
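+        # For reference, the arities this suite treats as well-formed
+        # (collected from the cases above; not an exhaustive server-side
+        # contract): bond -> 2 atoms, angle -> 3, dihedral -> 4,
+        # cartesian_atom -> 1.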
+        record = _full_record()
+        record["opt_constraints"] = [
+            {"constraint_kind": "bond", "atoms": [1], "target_value": None},  # arity wrong
+            {"constraint_kind": "wat", "atoms": [1, 2], "target_value": None},  # bad kind
+            {"constraint_kind": "angle", "atoms": [1, 2, 3], "target_value": None},  # OK
+        ]
+        payload = self._submit(record)
+        primary = payload["conformers"][0]["primary_calculation"]
+        self.assertEqual(len(primary["constraints"]), 1)
+        self.assertEqual(primary["constraints"][0]["constraint_kind"], "angle")
+        self.assertEqual(primary["constraints"][0]["constraint_index"], 1)
+
+
+class TestCalculationConstraintsSerializer(unittest.TestCase):
+    """Direct tests for arc.tckdb.constraints.serialize_constraints."""
+
+    def test_indices_start_at_one_and_are_deterministic(self):
+        from arc.tckdb.constraints import serialize_constraints
+        items = [
+            {"constraint_kind": "bond", "atoms": [1, 2]},
+            {"constraint_kind": "dihedral", "atoms": [3, 4, 5, 6]},
+            {"constraint_kind": "cartesian_atom", "atoms": [7]},
+        ]
+        result = serialize_constraints(items)
+        self.assertEqual([r["constraint_index"] for r in result], [1, 2, 3])
+
+    def test_dataclass_input_accepted(self):
+        from arc.tckdb.constraints import (
+            TCKDBCalculationConstraint,
+            serialize_constraints,
+        )
+        c = TCKDBCalculationConstraint(
+            constraint_kind="bond", atom1_index=1, atom2_index=2,
+            target_value=1.5,
+        )
+        out = serialize_constraints([c])
+        self.assertEqual(out[0]["atom1_index"], 1)
+        self.assertEqual(out[0]["atom2_index"], 2)
+        self.assertAlmostEqual(out[0]["target_value"], 1.5)
+
+    def test_empty_input_yields_empty_list(self):
+        from arc.tckdb.constraints import serialize_constraints
+        self.assertEqual(serialize_constraints([]), [])
+        # Any empty iterable, not just a list, yields [].
+        self.assertEqual(serialize_constraints(()), [])
+
+
+class TestArcArgsToKeywords(unittest.TestCase):
+    """Determinism guarantees for ``_arc_args_to_keywords``.
+
+    The output participates in TCKDB's ``lot_hash``, so two args dicts
+    that differ only in insertion order must serialize identically — a
+    regression here would fragment the LoT row across runs.
+    """
+
+    def setUp(self):
+        from arc.tckdb.adapter import _arc_args_to_keywords
+        self.flatten = _arc_args_to_keywords
+
+    def test_none_returns_none(self):
+        self.assertIsNone(self.flatten(None))
+
+    def test_empty_dict_returns_none(self):
+        self.assertIsNone(self.flatten({}))
+
+    def test_dict_with_only_empty_category_returns_none(self):
+        self.assertIsNone(self.flatten({"keyword": {}}))
+
+    def test_non_dict_returns_none(self):
+        self.assertIsNone(self.flatten("dlpno=tight"))
+        self.assertIsNone(self.flatten(42))
+        self.assertIsNone(self.flatten(["dlpno"]))
+
+    def test_non_mapping_category_value_skipped(self):
+        # Defensive: a list under a category shouldn't crash; it just
+        # doesn't contribute entries.
+ self.assertIsNone(self.flatten({"keyword": ["TightPNO"]})) + + def test_includes_keyword_entries(self): + out = self.flatten({"keyword": {"rijcosx": "RIJCOSX", + "grid": "DEFGRID3"}}) + self.assertEqual( + out, + 'keyword:grid="DEFGRID3"; keyword:rijcosx="RIJCOSX"', + ) + + def test_includes_block_entries(self): + out = self.flatten({"block": {"scf": "MaxIter 500\nDIIS true"}}) + self.assertEqual(out, 'block:scf="MaxIter 500\\nDIIS true"') + + def test_includes_both_keyword_and_block_categories_sorted(self): + out = self.flatten({ + "keyword": {"dlpno_threshold": "TightPNO", "rijcosx": "RIJCOSX", + "grid": "DEFGRID3", "uno": "UNO"}, + "block": {"scf": "MaxIter 500\nDIIS true"}, + }) + self.assertEqual( + out, + 'block:scf="MaxIter 500\\nDIIS true"; ' + 'keyword:dlpno_threshold="TightPNO"; ' + 'keyword:grid="DEFGRID3"; ' + 'keyword:rijcosx="RIJCOSX"; ' + 'keyword:uno="UNO"', + ) + + def test_skips_none_values(self): + out = self.flatten({"keyword": {"a": "x", "b": None, "c": "y"}}) + self.assertEqual(out, 'keyword:a="x"; keyword:c="y"') + + def test_deterministic_independent_of_input_order(self): + a = {"keyword": {"a": 1, "b": 2, "c": 3}, + "block": {"x": "X", "y": "Y"}} + b = {"block": {"y": "Y", "x": "X"}, + "keyword": {"c": 3, "a": 1, "b": 2}} + self.assertEqual(self.flatten(a), self.flatten(b)) + + def test_serializes_nested_list_and_bool_deterministically(self): + # Dict-valued args are JSON-dumped with sort_keys=True so that + # nested structure also dedups across insertion orders. + a = self.flatten({"keyword": { + "iters": [1, 2, 3], + "use_uno": True, + "thresholds": {"e": 1e-9, "d": 1e-6}, + }}) + b = self.flatten({"keyword": { + "thresholds": {"d": 1e-6, "e": 1e-9}, + "use_uno": True, + "iters": [1, 2, 3], + }}) + self.assertEqual(a, b) + self.assertIn("keyword:iters=[1,2,3]", a) + self.assertIn("keyword:use_uno=true", a) + self.assertIn('keyword:thresholds={"d":1e-06,"e":1e-09}', a) + + +class TestArcLevelToTckdbLot(unittest.TestCase): + """Field-name translation from ARC's Level dict to TCKDB's + ``LevelOfTheoryRef``. 
ARC writes ``auxiliary_basis``/``cabs``/ + ``solvation_method``; TCKDB consumes ``aux_basis``/``cabs_basis``/ + ``solvent_model`` — the projection must rename, not pass through.""" + + def setUp(self): + from arc.tckdb.adapter import _arc_level_to_tckdb_lot + self.project = _arc_level_to_tckdb_lot + + def test_none_or_no_method_returns_none(self): + self.assertIsNone(self.project(None)) + self.assertIsNone(self.project({})) + self.assertIsNone(self.project({"basis": "cc-pvtz-f12"})) + + def test_renames_arc_field_names_to_tckdb(self): + out = self.project({ + "method": "DLPNO-CCSD(T)-F12", + "basis": "cc-pVTZ-F12", + "auxiliary_basis": "aug-cc-pVTZ/C", + "cabs": "cc-pVTZ-F12-CABS", + "solvation_method": "smd", + "solvent": "water", + "dispersion": "gd3bj", + "software": "orca", # dropped — lives on software_release + "software_version": "5.0", # dropped — lives on software_release + "method_type": "wavefunction", # no TCKDB counterpart + "year": 2024, # no TCKDB counterpart + }) + self.assertEqual(out, { + "method": "DLPNO-CCSD(T)-F12", + "basis": "cc-pVTZ-F12", + "aux_basis": "aug-cc-pVTZ/C", + "cabs_basis": "cc-pVTZ-F12-CABS", + "solvent_model": "smd", + "solvent": "water", + "dispersion": "gd3bj", + }) + + def test_includes_keywords_when_args_present(self): + out = self.project({ + "method": "dlpno-ccsd(t)-f12", + "basis": "cc-pvtz-f12", + "args": {"keyword": {"dlpno_threshold": "TightPNO"}}, + }) + self.assertEqual(out["keywords"], + 'keyword:dlpno_threshold="TightPNO"') + + def test_omits_keywords_when_args_empty(self): + out = self.project({ + "method": "wb97xd", + "basis": "def2tzvp", + "args": {}, + }) + self.assertNotIn("keywords", out) + + +class TestComputedReactionDependencyEdges(unittest.TestCase): + """``optimized_from`` dependency edges in computed-reaction bundles. + + Exercises two gaps the audit found: + - Part A: reactant/product opt → opt_coarse (parity with the + species-side path, which previously hardcoded depends_on=None). + - Part B: ts_opt → ts_guess (path_search: NEB / GSM) — geometry-only + TS guesses (heuristics, AutoTST, user-supplied) stay edge-less. + + The reaction-bundle flatten step (``_flatten_all_reaction_calcs``) + pops wrapped ``opt_result``/etc. but leaves ``depends_on`` at the + calculation-object level, so these tests assert against the + post-flatten payload that actually gets uploaded. 
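+
+    For orientation, the edge shape asserted throughout (as produced by
+    the fixtures here, not a schema excerpt)::
+
+        "depends_on": [
+            {"parent_calculation_key": "r0_opt_coarse",
+             "role": "optimized_from"},
+        ]
+
+    Chain heads (``opt_coarse``, ``ts_guess``) carry no ``depends_on``.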
+ """ + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-tckdb-rxn-deps-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + self.cfg = TCKDBConfig( + enabled=True, + base_url="http://localhost:8000/api/v1", + payload_dir=self.tmp, + api_key_env="X_TCKDB_API_KEY", + project_label="proj-A", + upload_mode="computed_reaction", + ) + + def _adapter(self, client): + return TCKDBAdapter( + self.cfg, + client_factory=lambda c, k: client, + ) + + def _submit(self, *, output_doc=None, reaction=None): + client = _StubClient(response=_StubResponse({"reaction_id": 42})) + adapter = self._adapter(client) + with mock.patch.dict(os.environ, {"X_TCKDB_API_KEY": "tck_x"}): + outcome = adapter.submit_computed_reaction_from_output( + output_doc=output_doc or _reaction_output_doc(), + reaction_record=reaction or _reaction_record(), + ) + return outcome, client, json.loads(outcome.payload_path.read_text()) + + @staticmethod + def _add_coarse_to_species(doc, label): + spc = next(s for s in doc["species"] if s["label"] == label) + spc["coarse_opt_log"] = f"calcs/.../{label}/coarse/input.log" + spc["coarse_opt_n_steps"] = 7 + spc["coarse_opt_final_energy_hartree"] = -100.05 + spc["coarse_opt_input_xyz"] = "C 9.999 9.999 9.999\nH 8.888 8.888 8.888" + spc["coarse_opt_output_xyz"] = "C 1.111 2.222 3.333\nH 4.444 5.555 6.666" + spc["opt_input_xyz"] = spc["coarse_opt_output_xyz"] + return spc + + # ------------------------------------------------------------------ + # Part A: reaction-side coarse → fine opt parity + # ------------------------------------------------------------------ + + def test_reactant_with_coarse_emits_namespaced_opt_coarse(self): + doc = _reaction_output_doc() + self._add_coarse_to_species(doc, "CHO") + _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + keys = sorted(c["key"] for c in r0["calculations"]) + self.assertIn("r0_opt_coarse", keys) + opt_coarse = next(c for c in r0["calculations"] if c["key"] == "r0_opt_coarse") + # Same calc type ("opt") as the species-side path; only the + # bundle-local key is namespaced. + self.assertEqual(opt_coarse["type"], "opt") + + def test_reactant_fine_opt_depends_on_namespaced_opt_coarse(self): + doc = _reaction_output_doc() + self._add_coarse_to_species(doc, "CHO") + _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + primary = r0["conformers"][0]["calculation"] + self.assertEqual(primary["key"], "r0_opt") + self.assertEqual( + primary["depends_on"], + [{"parent_calculation_key": "r0_opt_coarse", "role": "optimized_from"}], + ) + # opt_coarse is chain head — no upstream calc edge. + opt_coarse = next(c for c in r0["calculations"] if c["key"] == "r0_opt_coarse") + self.assertNotIn("depends_on", opt_coarse) + + def test_product_with_coarse_emits_namespaced_edge(self): + doc = _reaction_output_doc() + self._add_coarse_to_species(doc, "CH3") # product side + _, _, payload = self._submit(output_doc=doc) + p1 = next(s for s in payload["species"] if s["key"] == "p1_CH3") + keys = [c["key"] for c in p1["calculations"]] + self.assertIn("p1_opt_coarse", keys) + primary = p1["conformers"][0]["calculation"] + self.assertEqual( + primary["depends_on"], + [{"parent_calculation_key": "p1_opt_coarse", "role": "optimized_from"}], + ) + + def test_missing_coarse_log_emits_no_opt_coarse_or_edge(self): + doc = _reaction_output_doc() + # No coarse_opt_log at all on the reactant — single-stage opt. 
+ _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + keys = [c["key"] for c in r0["calculations"]] + self.assertNotIn("r0_opt_coarse", keys) + primary = r0["conformers"][0]["calculation"] + self.assertNotIn("depends_on", primary) + + def test_unparseable_coarse_geometry_falls_back_no_edge(self): + doc = _reaction_output_doc() + spc = self._add_coarse_to_species(doc, "CHO") + spc["coarse_opt_output_xyz"] = None # parse failure + # opt_input_xyz reverts to species's truly-initial xyz. + spc["opt_input_xyz"] = "C 0.0 0.0 0.0\nH 1.0 0.0 0.0" + _, _, payload = self._submit(output_doc=doc) + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + keys = [c["key"] for c in r0["calculations"]] + self.assertNotIn("r0_opt_coarse", keys) + primary = r0["conformers"][0]["calculation"] + self.assertNotIn("depends_on", primary) + + def test_reaction_coarse_namespaces_are_unique_across_species(self): + # Both a reactant and a product carry coarse — keys must not collide. + doc = _reaction_output_doc() + self._add_coarse_to_species(doc, "CHO") # → r0_opt_coarse + self._add_coarse_to_species(doc, "CH3") # → p1_opt_coarse + _, _, payload = self._submit(output_doc=doc) + all_keys = [] + for sp in payload["species"]: + all_keys.extend(c["key"] for c in sp["calculations"]) + for conf in sp["conformers"]: + all_keys.append(conf["calculation"]["key"]) + self.assertEqual(len(set(all_keys)), len(all_keys), + msg=f"duplicate calc keys: {all_keys}") + self.assertIn("r0_opt_coarse", all_keys) + self.assertIn("p1_opt_coarse", all_keys) + + # ------------------------------------------------------------------ + # Part B: ts_opt → ts_guess (path_search: NEB / GSM) + # ------------------------------------------------------------------ + + @staticmethod + def _make_neb_ts(doc): + ts = doc["transition_states"][0] + ts["chosen_ts_method"] = "orca_neb" + ts["neb_log"] = "calcs/.../TS0/neb/input.log" + return ts + + @staticmethod + def _make_gsm_ts(doc): + ts = doc["transition_states"][0] + ts["chosen_ts_method"] = "xtb_gsm" + ts["gsm_log"] = "calcs/.../TS0/gsm/stringfile.xyz0000" + return ts + + def test_neb_chosen_ts_emits_path_search_ts_guess_calc(self): + doc = _reaction_output_doc() + self._make_neb_ts(doc) + _, _, payload = self._submit(output_doc=doc) + ts = payload["transition_state"] + keys = sorted(c["key"] for c in ts["calculations"]) + self.assertIn("ts_guess", keys) + ts_guess = next(c for c in ts["calculations"] if c["key"] == "ts_guess") + self.assertEqual(ts_guess["type"], "path_search") + self.assertEqual(ts_guess.get("path_search_result"), {"method": "neb"}) + # Chain head: no parent calculation. + self.assertNotIn("depends_on", ts_guess) + + def test_gsm_chosen_ts_emits_path_search_ts_guess_calc(self): + # Positive GSM path: when the producer exposes a gsm_log on the + # TS record (currently a future field — see audit notes in + # adapter.py), the adapter emits type=path_search with method=gsm. 
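+        # Expected shape, per the assertions below (keys from this
+        # suite's fixtures): {"key": "ts_guess", "type": "path_search",
+        # "path_search_result": {"method": "gsm"}}.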
+ doc = _reaction_output_doc() + self._make_gsm_ts(doc) + _, _, payload = self._submit(output_doc=doc) + ts = payload["transition_state"] + keys = sorted(c["key"] for c in ts["calculations"]) + self.assertIn("ts_guess", keys) + ts_guess = next(c for c in ts["calculations"] if c["key"] == "ts_guess") + self.assertEqual(ts_guess["type"], "path_search") + self.assertEqual(ts_guess.get("path_search_result"), {"method": "gsm"}) + self.assertNotIn("depends_on", ts_guess) + + def test_gsm_method_alias_dash_form_matches(self): + doc = _reaction_output_doc() + ts = self._make_gsm_ts(doc) + ts["chosen_ts_method"] = "xTB-GSM" # producer typing variation + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertIn("ts_guess", keys) + + def test_gsm_chosen_ts_without_gsm_log_emits_no_parent_calc(self): + # Mirror of NEB-without-log: chosen method is GSM but the + # producer didn't expose the log path. Conservative: no parent. + doc = _reaction_output_doc() + ts = doc["transition_states"][0] + ts["chosen_ts_method"] = "xtb_gsm" + ts["gsm_log"] = None + # Even a stray neb_log shouldn't cross-pollinate methods. + ts["neb_log"] = "calcs/.../TS0/neb/input.log" + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertNotIn("ts_guess", keys) + + def test_no_payload_carries_legacy_neb_type(self): + # Regression guard: nothing in the bundle may emit the old + # NEB-specific calculation type or wrapped result key. + doc = _reaction_output_doc() + self._make_neb_ts(doc) + _, _, payload = self._submit(output_doc=doc) + blob = json.dumps(payload) + self.assertNotIn('"type": "neb"', blob) + self.assertNotIn('"neb_result"', blob) + + def test_ts_opt_depends_on_ts_guess_with_optimized_from(self): + doc = _reaction_output_doc() + self._make_neb_ts(doc) + _, _, payload = self._submit(output_doc=doc) + primary = payload["transition_state"]["calculation"] + self.assertEqual(primary["key"], "ts_opt") + self.assertEqual( + primary["depends_on"], + [{"parent_calculation_key": "ts_guess", "role": "optimized_from"}], + ) + + def test_neb_method_match_is_case_insensitive(self): + doc = _reaction_output_doc() + ts = self._make_neb_ts(doc) + ts["chosen_ts_method"] = "ORCA_NEB" # producer typing variation + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertIn("ts_guess", keys) + + def test_missing_neb_log_emits_no_ts_guess_or_edge(self): + # Conservative gate: chosen_ts_method == NEB but no log path means + # we have no provenance to anchor the parent calc — fall back to + # geometry-only ts_opt. 
+ doc = _reaction_output_doc() + ts = doc["transition_states"][0] + ts["chosen_ts_method"] = "orca_neb" + ts["neb_log"] = None + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertNotIn("ts_guess", keys) + primary = payload["transition_state"]["calculation"] + self.assertNotIn("depends_on", primary) + + def test_heuristic_ts_guess_emits_no_parent_calc(self): + doc = _reaction_output_doc() + ts = doc["transition_states"][0] + ts["chosen_ts_method"] = "Heuristics" + ts["neb_log"] = None + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertNotIn("ts_guess", keys) + self.assertNotIn( + "depends_on", payload["transition_state"]["calculation"], + ) + + def test_autotst_ts_guess_emits_no_parent_calc(self): + doc = _reaction_output_doc() + ts = doc["transition_states"][0] + ts["chosen_ts_method"] = "AutoTST" + # Even if the producer left a stray neb_log path, the method + # gate must reject — AutoTST is not a NEB calc. + ts["neb_log"] = "calcs/.../TS0/neb/input.log" + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertNotIn("ts_guess", keys) + + def test_user_guess_emits_no_parent_calc(self): + doc = _reaction_output_doc() + ts = doc["transition_states"][0] + ts["chosen_ts_method"] = "user guess 0" + ts["neb_log"] = None + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertNotIn("ts_guess", keys) + + def test_no_chosen_ts_method_emits_no_parent_calc(self): + # Older runs / records that never set chosen_ts_method. + doc = _reaction_output_doc() + # Default fixture has no chosen_ts_method on the ts. + _, _, payload = self._submit(output_doc=doc) + keys = [c["key"] for c in payload["transition_state"]["calculations"]] + self.assertNotIn("ts_guess", keys) + + def test_dependency_survives_reaction_bundle_flatten(self): + # ``_flatten_all_reaction_calcs`` pops opt_result into flat + # fields; depends_on lives at the calc-object level (not nested + # in opt_result), so the edge must be intact in the final + # post-flatten payload. + doc = _reaction_output_doc() + self._make_neb_ts(doc) + self._add_coarse_to_species(doc, "CHO") + _, _, payload = self._submit(output_doc=doc) + # Sanity: flatten promoted r0_opt's converged-flag to the top level. + r0 = next(s for s in payload["species"] if s["key"] == "r0_CHO") + r0_primary = r0["conformers"][0]["calculation"] + self.assertNotIn("opt_result", r0_primary) + self.assertIn("opt_converged", r0_primary) + # Both edges still present. + self.assertEqual( + r0_primary["depends_on"], + [{"parent_calculation_key": "r0_opt_coarse", "role": "optimized_from"}], + ) + ts_primary = payload["transition_state"]["calculation"] + self.assertEqual( + ts_primary["depends_on"], + [{"parent_calculation_key": "ts_guess", "role": "optimized_from"}], + ) + + def test_neb_payload_validates_against_live_reaction_schema(self): + # End-to-end: with both Part-A and Part-B edges in play, the + # full payload still passes the live pydantic validator. 
+        try:
+            import sys as _sys
+            _sys.path.insert(0, "/home/calvin/code/TCKDB_v2/backend")
+            from app.schemas.workflows.computed_reaction_upload import (
+                ComputedReactionUploadRequest,
+            )
+        except Exception:
+            self.skipTest("TCKDB backend pydantic schema not importable in this env")
+        doc = _reaction_output_doc()
+        self._make_neb_ts(doc)
+        self._add_coarse_to_species(doc, "CHO")
+        _, _, payload = self._submit(output_doc=doc)
+        ComputedReactionUploadRequest.model_validate(payload)
+
+
+class TestTsGuessPathSearchGate(unittest.TestCase):
+    """Direct unit tests for ``_resolve_ts_guess_path_search``."""
+
+    def setUp(self):
+        from arc.tckdb.adapter import _resolve_ts_guess_path_search
+        self.resolve = _resolve_ts_guess_path_search
+
+    def test_orca_neb_resolves_to_neb(self):
+        self.assertEqual(self.resolve("orca_neb"), "neb")
+
+    def test_xtb_gsm_underscore_resolves_to_gsm(self):
+        self.assertEqual(self.resolve("xtb_gsm"), "gsm")
+
+    def test_xtb_gsm_dash_form_resolves_to_gsm(self):
+        self.assertEqual(self.resolve("xTB-GSM"), "gsm")
+        self.assertEqual(self.resolve("xtb-gsm"), "gsm")
+
+    def test_match_is_case_and_whitespace_tolerant(self):
+        self.assertEqual(self.resolve("ORCA_NEB"), "neb")
+        self.assertEqual(self.resolve(" orca_neb "), "neb")
+        self.assertEqual(self.resolve("XTB_GSM"), "gsm")
+
+    def test_geometry_only_methods_reject(self):
+        for m in ("Heuristics", "AutoTST", "KinBot", "GCN",
+                  "user guess 0", "user guess 1"):
+            self.assertIsNone(self.resolve(m), msg=f"unexpected match: {m}")
+
+    def test_non_string_inputs_reject(self):
+        self.assertIsNone(self.resolve(None))
+        self.assertIsNone(self.resolve(42))
+        self.assertIsNone(self.resolve({"method": "orca_neb"}))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/arc/tckdb/cli.py b/arc/tckdb/cli.py
new file mode 100644
index 0000000000..ed194f9576
--- /dev/null
+++ b/arc/tckdb/cli.py
@@ -0,0 +1,175 @@
+"""Standalone CLI for re-running the TCKDB upload sweep against an existing project.
+
+Usage::
+
+    python -m arc.tckdb.cli /path/to/input.yml
+    python -m arc.tckdb.cli input.yml --offline
+    python -m arc.tckdb.cli input.yml --upload-mode computed_reaction
+    python -m arc.tckdb.cli input.yml --project-directory /elsewhere/proj
+
+The CLI reads the same ``tckdb`` block from ``input.yml`` that ARC.py
+parses, and runs the same :func:`arc.tckdb.sweep.run_upload_sweep`
+that ARC runs post-``execute()`` — so output is identical to what
+you'd see at the end of an ARC run, minus any science work.
+
+Project-directory resolution order:
+1. ``--project-directory`` if passed
+2. ``project_directory:`` value inside ``input.yml`` if present
+3. ``dirname(input.yml)`` — matches ``ARC.py``'s default
+
+Why this exists: when an ARC run already wrote ``output/output.yml``
+but the upload sweep didn't fire (or fired in the wrong mode), you
+shouldn't have to re-execute jobs to push payloads. Output.yml is the
+contract; this CLI consumes it directly.
+"""
+
+import argparse
+import os
+import sys
+
+from arc.common import read_yaml_file
+from arc.tckdb.config import (
+    TCKDBConfig,
+    UPLOAD_MODE_COMPUTED_REACTION,
+    UPLOAD_MODE_COMPUTED_SPECIES,
+    VALID_UPLOAD_MODES,
+)
+from arc.tckdb.sweep import run_upload_sweep
+
+
+def parse_args(argv=None):
+    """Parse CLI args. ``argv=None`` lets argparse read sys.argv directly."""
+    parser = argparse.ArgumentParser(
+        prog='python -m arc.tckdb.cli',
+        description=(
+            'Re-run the TCKDB upload sweep against an existing ARC project '
+            "(reads <project_directory>/output/output.yml; doesn't re-execute jobs)."
+ ), + ) + parser.add_argument( + 'input_file', + help='Path to the ARC input.yml whose tckdb: block configures the upload.', + ) + parser.add_argument( + '-p', '--project-directory', + default=None, + help=( + 'Override project directory. Defaults to project_directory in ' + 'input.yml, falling back to dirname(input.yml).' + ), + ) + parser.add_argument( + '--offline', + action='store_true', + help=( + 'Force config.upload=False: write payloads + sidecars to disk ' + 'but make no network calls. Useful for previewing what would ' + 'be uploaded.' + ), + ) + parser.add_argument( + '--upload-mode', + default=None, + choices=sorted(VALID_UPLOAD_MODES), + help=( + 'Override tckdb.upload_mode from input.yml. Useful when the ' + 'original run used the wrong mode (e.g. uploaded conformers ' + "but you wanted reactions) and you don't want to edit the " + 'input file.' + ), + ) + return parser.parse_args(argv) + + +def _resolve_project_directory(args, input_dict): + """Apply the documented resolution order: --project-directory → input.yml → dirname(input).""" + if args.project_directory: + return os.path.abspath(args.project_directory) + from_input = input_dict.get('project_directory') + if from_input: + return os.path.abspath(from_input) + return os.path.abspath(os.path.dirname(args.input_file)) + + +def _build_config(input_dict, *, offline, upload_mode_override): + """Parse tckdb config from input_dict, applying CLI overrides. + + Returns ``None`` when no ``tckdb`` block exists or ``enabled: false`` — + the caller treats this as a hard error (the CLI's whole point is to + upload, so a no-op config is a misuse). + """ + tckdb_raw = dict(input_dict.get('tckdb') or {}) + if not tckdb_raw: + return None + if upload_mode_override: + tckdb_raw['upload_mode'] = upload_mode_override + if offline: + tckdb_raw['upload'] = False + return TCKDBConfig.from_dict(tckdb_raw) + + +def main(argv=None, *, adapter_factory=None): + """CLI entry point. + + ``adapter_factory`` is a test seam: when None, the real + ``TCKDBAdapter`` is constructed from the config. Tests pass a stub + that records calls without touching the network. + + Returns an exit code (0 success, non-zero failure) so callers can + use ``sys.exit(main())`` cleanly. + """ + args = parse_args(argv) + + if not os.path.exists(args.input_file): + print(f'error: input file not found: {args.input_file}', file=sys.stderr) + return 2 + + input_dict = read_yaml_file(path=args.input_file) + if not isinstance(input_dict, dict): + print(f'error: {args.input_file} did not parse as a mapping.', file=sys.stderr) + return 2 + + project_directory = _resolve_project_directory(args, input_dict) + if not os.path.isdir(project_directory): + print( + f'error: project directory does not exist: {project_directory}', + file=sys.stderr, + ) + return 2 + + try: + cfg = _build_config( + input_dict, offline=args.offline, + upload_mode_override=args.upload_mode, + ) + except ValueError as exc: + print(f'error: invalid tckdb config: {exc}', file=sys.stderr) + return 2 + + if cfg is None: + print( + 'error: no tckdb block in input.yml (or enabled: false). ' + 'Add a tckdb: block with enabled: true and base_url: ... 
to use this CLI.', + file=sys.stderr, + ) + return 2 + + print(f'TCKDB CLI: project={project_directory}') + print(f'TCKDB CLI: mode={cfg.upload_mode} base_url={cfg.base_url} upload={cfg.upload}') + + if adapter_factory is None: + from arc.tckdb.adapter import TCKDBAdapter + adapter = TCKDBAdapter(cfg, project_directory=project_directory) + else: + adapter = adapter_factory(cfg, project_directory) + + run_upload_sweep( + adapter=adapter, + project_directory=project_directory, + tckdb_config=cfg, + ) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/arc/tckdb/cli_test.py b/arc/tckdb/cli_test.py new file mode 100644 index 0000000000..e1782fe3ad --- /dev/null +++ b/arc/tckdb/cli_test.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +"""Tests for the standalone TCKDB upload-sweep CLI.""" + +import io +import os +import shutil +import sys +import tempfile +import unittest +from contextlib import redirect_stderr, redirect_stdout +from unittest import mock + +import yaml + +from arc.tckdb.cli import main, parse_args +from arc.tckdb.adapter_test import _reaction_output_doc + + +# --------------------------------------------------------------------------- +# Stubs and fixtures +# --------------------------------------------------------------------------- + + +class _StubAdapter: + """Captures which submit_* methods the sweep called and with what.""" + + def __init__(self): + self.reaction_calls: list[str] = [] + self.species_calls: list[str] = [] + self.conformer_calls: list[str] = [] + + def submit_computed_reaction_from_output(self, *, output_doc, reaction_record): + self.reaction_calls.append(reaction_record.get('label', '')) + return None + + def submit_computed_species_from_output(self, *, output_doc, species_record): + self.species_calls.append(species_record.get('label', '')) + return None + + def submit_from_output(self, *, output_doc, species_record): + self.conformer_calls.append(species_record.get('label', '')) + return None + + +def _make_project(tmp_dir, *, with_output=True, project_dir_in_input=None): + """Lay out a fake ARC project with input.yml + (optionally) output.yml.""" + proj = os.path.join(tmp_dir, 'proj') + os.makedirs(proj) + if with_output: + out_dir = os.path.join(proj, 'output') + os.makedirs(out_dir) + doc = _reaction_output_doc() + for s in doc['species']: + s['converged'] = True + for ts in doc['transition_states']: + ts['converged'] = True + with open(os.path.join(out_dir, 'output.yml'), 'w') as fh: + yaml.safe_dump(doc, fh) + input_path = os.path.join(proj, 'input.yml') + body = { + 'project': 'cli-test', + 'tckdb': { + 'enabled': True, + 'base_url': 'http://localhost:8000/api/v1', + 'upload_mode': 'computed_reaction', + 'upload': False, + }, + } + if project_dir_in_input: + body['project_directory'] = project_dir_in_input + with open(input_path, 'w') as fh: + yaml.safe_dump(body, fh) + return proj, input_path + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestArgParsing(unittest.TestCase): + def test_positional_input_file_required(self): + with self.assertRaises(SystemExit): + parse_args([]) + + def test_default_flags(self): + args = parse_args(['/tmp/foo.yml']) + self.assertEqual(args.input_file, '/tmp/foo.yml') + self.assertIsNone(args.project_directory) + self.assertIsNone(args.upload_mode) + self.assertFalse(args.offline) + + def test_all_overrides(self): + args = parse_args([ + 
'input.yml', '-p', '/some/where', '--offline',
+            '--upload-mode', 'computed_reaction',
+        ])
+        self.assertEqual(args.project_directory, '/some/where')
+        self.assertTrue(args.offline)
+        self.assertEqual(args.upload_mode, 'computed_reaction')
+
+    def test_invalid_upload_mode_rejected(self):
+        with self.assertRaises(SystemExit):
+            parse_args(['input.yml', '--upload-mode', 'nonsense'])
+
+
+class TestCLIDispatch(unittest.TestCase):
+    def setUp(self):
+        self.tmp = tempfile.mkdtemp(prefix='arc-cli-')
+        self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True)
+
+    def _run(self, argv, *, adapter=None):
+        adapter = adapter or _StubAdapter()
+        stdout = io.StringIO()
+        stderr = io.StringIO()
+        with redirect_stdout(stdout), redirect_stderr(stderr):
+            rc = main(argv, adapter_factory=lambda cfg, pdir: adapter)
+        return rc, stdout.getvalue(), stderr.getvalue(), adapter
+
+    # ---------------- 1: happy path
+    def test_dispatches_to_reaction_sweep_in_reaction_mode(self):
+        proj, input_path = _make_project(self.tmp)
+        rc, stdout, _, adapter = self._run([input_path])
+        self.assertEqual(rc, 0)
+        self.assertEqual(adapter.reaction_calls, ['CHO + CH4 <=> CH2O + CH3'])
+        self.assertEqual(adapter.species_calls, [])
+        self.assertEqual(adapter.conformer_calls, [])
+        self.assertIn('computed-reaction bundle', stdout)
+
+    # ---------------- 2: --upload-mode override
+    def test_upload_mode_override_redirects_dispatch(self):
+        proj, input_path = _make_project(self.tmp)
+        rc, _, _, adapter = self._run(
+            [input_path, '--upload-mode', 'computed_species']
+        )
+        self.assertEqual(rc, 0)
+        # Now species path is hit, not reaction
+        self.assertEqual(adapter.reaction_calls, [])
+        self.assertEqual(len(adapter.species_calls), 4)
+        self.assertEqual(adapter.conformer_calls, [])
+
+    # ---------------- 3: --offline forces config.upload=False
+    def test_offline_flag_overrides_config_upload(self):
+        proj, input_path = _make_project(self.tmp)
+        # Bake upload: true into the input.yml so we can prove --offline overrides
+        with open(input_path) as fh:
+            body = yaml.safe_load(fh)
+        body['tckdb']['upload'] = True
+        with open(input_path, 'w') as fh:
+            yaml.safe_dump(body, fh)
+
+        captured_cfgs: list = []
+
+        def _factory(cfg, pdir):
+            captured_cfgs.append(cfg)
+            return _StubAdapter()
+
+        rc = main([input_path, '--offline'], adapter_factory=_factory)
+        self.assertEqual(rc, 0)
+        self.assertEqual(len(captured_cfgs), 1)
+        self.assertFalse(
+            captured_cfgs[0].upload,
+            'config.upload should be False after --offline override',
+        )
+
+    # ---------------- 4: project-directory resolution order
+    def test_project_directory_cli_flag_wins(self):
+        proj, input_path = _make_project(self.tmp)
+        # Make a different project dir with its own output.yml
+        other = os.path.join(self.tmp, 'other')
+        os.makedirs(os.path.join(other, 'output'))
+        doc = _reaction_output_doc()
+        # Give this output a distinguishable reaction label so the stub
+        # proves dispatch read *this* directory, not the default one.
+        doc['reactions'][0]['label'] = 'OTHER_RXN'
+        with open(os.path.join(other, 'output', 'output.yml'), 'w') as fh:
+            yaml.safe_dump(doc, fh)
+        rc, _, _, adapter = self._run([input_path, '-p', other])
+        self.assertEqual(rc, 0)
+        self.assertEqual(adapter.reaction_calls, ['OTHER_RXN'])
+
+    def test_project_directory_from_input_yml(self):
+        proj, input_path = _make_project(self.tmp)
+        # Move output to a sibling dir and point input.yml at it
+        sibling = os.path.join(self.tmp, 'sibling')
+        os.makedirs(sibling)  # parent must exist before the move
+        shutil.move(os.path.join(proj, 'output'), os.path.join(sibling, 'output'))
+        with open(input_path) as fh:
+            body = yaml.safe_load(fh)
body['project_directory'] = sibling + with open(input_path, 'w') as fh: + yaml.safe_dump(body, fh) + rc, stdout, _, adapter = self._run([input_path]) + self.assertEqual(rc, 0) + self.assertEqual(adapter.reaction_calls, ['CHO + CH4 <=> CH2O + CH3']) + + def test_project_directory_falls_back_to_input_dir(self): + proj, input_path = _make_project(self.tmp) + rc, stdout, _, adapter = self._run([input_path]) + self.assertEqual(rc, 0) + self.assertIn(proj, stdout) + + # ---------------- 5: missing files / bad config exit codes + def test_missing_input_file_exits_2(self): + rc, _, stderr, _ = self._run(['/nonexistent/input.yml']) + self.assertEqual(rc, 2) + self.assertIn('input file not found', stderr) + + def test_missing_project_directory_exits_2(self): + proj, input_path = _make_project(self.tmp) + rc, _, stderr, _ = self._run([input_path, '-p', '/nonexistent/proj']) + self.assertEqual(rc, 2) + self.assertIn('project directory does not exist', stderr) + + def test_no_tckdb_block_exits_2(self): + proj, input_path = _make_project(self.tmp) + with open(input_path, 'w') as fh: + yaml.safe_dump({'project': 'no-tckdb-here'}, fh) + rc, _, stderr, _ = self._run([input_path]) + self.assertEqual(rc, 2) + self.assertIn('no tckdb block', stderr) + + def test_disabled_tckdb_exits_2(self): + proj, input_path = _make_project(self.tmp) + with open(input_path) as fh: + body = yaml.safe_load(fh) + body['tckdb']['enabled'] = False + with open(input_path, 'w') as fh: + yaml.safe_dump(body, fh) + rc, _, stderr, _ = self._run([input_path]) + # enabled: false → from_dict returns None, treated same as missing block + self.assertEqual(rc, 2) + + def test_invalid_tckdb_config_exits_2(self): + proj, input_path = _make_project(self.tmp) + with open(input_path) as fh: + body = yaml.safe_load(fh) + # Drop required base_url; from_dict should raise + body['tckdb'].pop('base_url') + with open(input_path, 'w') as fh: + yaml.safe_dump(body, fh) + rc, _, stderr, _ = self._run([input_path]) + self.assertEqual(rc, 2) + self.assertIn('invalid tckdb config', stderr) + + # ---------------- 6: missing output.yml is a soft skip, not an error + def test_missing_output_yml_is_soft_skip(self): + proj, input_path = _make_project(self.tmp, with_output=False) + rc, stdout, _, adapter = self._run([input_path]) + # The sweep itself prints "TCKDB upload skipped" but returns 0 + # — the run-not-completed case shouldn't be a hard failure. + self.assertEqual(rc, 0) + self.assertIn('not found', stdout) + self.assertEqual(adapter.reaction_calls, []) + + +if __name__ == '__main__': + unittest.main() diff --git a/arc/tckdb/config.py b/arc/tckdb/config.py index 05f9b4fdba..3ff2143872 100644 --- a/arc/tckdb/config.py +++ b/arc/tckdb/config.py @@ -4,17 +4,30 @@ (or ``enabled`` is false), :func:`TCKDBConfig.from_dict` returns ``None`` and the adapter is a no-op. -API keys are never read from input files. The config carries only the -*name* of the env var (``api_key_env``); the adapter resolves the key -at upload time. +API keys themselves are never stored in YAML. The config carries only: + +- ``api_key_env`` — the *name* of an env var holding the key + (default: ``TCKDB_API_KEY``). +- ``api_key_file`` — optional path to a plain text file containing only + the raw key. +- ``api_key_env_file`` — optional path to a shell/dotenv-style file + containing a ``TCKDB_API_KEY`` assignment (parsed, never executed). 
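+
+Illustrative ``input.yml`` block (paths hypothetical; only ``enabled``
+and ``base_url`` are required to turn the integration on)::
+
+    tckdb:
+      enabled: true
+      base_url: http://localhost:8000/api/v1
+      api_key_env: TCKDB_API_KEY          # default, shown for clarity
+      api_key_file: ~/.tckdb/key          # optional
+      api_key_env_file: ~/.tckdb/auth.env # optional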
+ +At upload time, :func:`resolve_tckdb_api_key` resolves the key in that +order; an env var always wins so a temporary override on the command +line still works regardless of input.yml. """ import logging import os +import shlex from collections.abc import Mapping from dataclasses import dataclass, field +from pathlib import Path from typing import Any +from arc.exceptions import InputError + logger = logging.getLogger("arc") @@ -26,11 +39,19 @@ # Upload-mode switch. ``conformer`` (default) keeps the existing # /uploads/conformers + per-artifact path. ``computed_species`` builds # one self-contained bundle and posts it to /uploads/computed-species. -# A run can use either; mixing per-species is intentionally not -# supported — pick one mode per ARC run. +# ``computed_reaction`` builds a reaction bundle (reactants, products, +# inline TS, kinetics) and posts it to /uploads/computed-reaction. +# A run picks one mode; mixing per-species is intentionally not +# supported — combining species + reaction uploads in one run is a +# follow-up that needs a wider config change. UPLOAD_MODE_CONFORMER = "conformer" UPLOAD_MODE_COMPUTED_SPECIES = "computed_species" -VALID_UPLOAD_MODES = frozenset({UPLOAD_MODE_CONFORMER, UPLOAD_MODE_COMPUTED_SPECIES}) +UPLOAD_MODE_COMPUTED_REACTION = "computed_reaction" +VALID_UPLOAD_MODES = frozenset({ + UPLOAD_MODE_CONFORMER, + UPLOAD_MODE_COMPUTED_SPECIES, + UPLOAD_MODE_COMPUTED_REACTION, +}) # Mirrors the server-side ArtifactKind enum # (backend/app/db/models/common.py:147 in TCKDB_v2). Keeping the source @@ -120,6 +141,8 @@ class TCKDBConfig: enabled: bool = False base_url: str | None = None api_key_env: str = DEFAULT_API_KEY_ENV + api_key_file: str | None = None + api_key_env_file: str | None = None payload_dir: str = DEFAULT_PAYLOAD_DIR upload: bool = True strict: bool = False @@ -151,10 +174,18 @@ def from_dict(cls, raw: Mapping[str, Any] | None) -> "TCKDBConfig | None": f"tckdb.upload_mode must be one of {sorted(VALID_UPLOAD_MODES)}; " f"got {upload_mode!r}." ) + api_key_file = raw.get("api_key_file") + if api_key_file is not None and not isinstance(api_key_file, str): + raise ValueError("tckdb.api_key_file must be a string path or unset.") + api_key_env_file = raw.get("api_key_env_file") + if api_key_env_file is not None and not isinstance(api_key_env_file, str): + raise ValueError("tckdb.api_key_env_file must be a string path or unset.") return cls( enabled=True, base_url=base_url, api_key_env=str(raw.get("api_key_env", DEFAULT_API_KEY_ENV)), + api_key_file=api_key_file or None, + api_key_env_file=api_key_env_file or None, payload_dir=str(raw.get("payload_dir", DEFAULT_PAYLOAD_DIR)), upload=bool(raw.get("upload", True)), strict=bool(raw.get("strict", False)), @@ -165,5 +196,136 @@ def from_dict(cls, raw: Mapping[str, Any] | None) -> "TCKDBConfig | None": ) def resolve_api_key(self) -> str | None: - """Read the API key from the configured env var. Never logged.""" - return os.environ.get(self.api_key_env) + """Resolve the API key from env or configured local files. + + Delegates to :func:`resolve_tckdb_api_key`. The value is never + logged. Raises :class:`InputError` if a configured file is + missing, empty, or doesn't define the key — misconfigured + ``input.yml`` should fail loud instead of silently producing + zero uploads. 
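+
+        Illustrative (paths hypothetical)::
+
+            cfg = TCKDBConfig.from_dict({
+                'enabled': True,
+                'base_url': 'http://localhost:8000/api/v1',
+                'api_key_file': '~/.tckdb/key',
+            })
+            key = cfg.resolve_api_key()  # env var wins when set; else file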
+        """
+        return resolve_tckdb_api_key(
+            api_key_env=self.api_key_env,
+            api_key_file=self.api_key_file,
+            api_key_env_file=self.api_key_env_file,
+        )
+
+    def describe_api_key_sources(self) -> str:
+        """Human-readable list of resolution sources, for log messages.
+
+        Includes file paths only when configured, so the dominant
+        "env-var-only" case stays terse.
+        """
+        sources = [f"env var '{self.api_key_env}'"]
+        if self.api_key_file:
+            sources.append(f"api_key_file={self.api_key_file}")
+        if self.api_key_env_file:
+            sources.append(f"api_key_env_file={self.api_key_env_file}")
+        return ", ".join(sources)
+
+
+def resolve_tckdb_api_key(
+    *,
+    api_key_env: str = DEFAULT_API_KEY_ENV,
+    api_key_file: str | None = None,
+    api_key_env_file: str | None = None,
+) -> str | None:
+    """Resolve the TCKDB API key from env or configured local files.
+
+    Resolution order:
+
+    1. ``api_key_env`` from the current process environment.
+    2. ``api_key_file`` — plain text file containing only the raw key.
+    3. ``api_key_env_file`` — shell/dotenv-style file containing a
+       ``<api_key_env>=...`` assignment. Parsed, never executed.
+
+    Args:
+        api_key_env: Name of the env var to consult (default
+            ``TCKDB_API_KEY``). Also used as the variable name to look
+            for inside ``api_key_env_file``.
+        api_key_file: Optional path to a plain text file whose entire
+            (whitespace-stripped) contents are the API key.
+        api_key_env_file: Optional path to a small dotenv-style file.
+
+    Returns:
+        The resolved API key (whitespace-stripped), or ``None`` when no
+        source is configured / set.
+
+    Raises:
+        InputError: A file path is configured but missing, unreadable,
+            empty, or doesn't define the variable.
+    """
+    env_key = os.environ.get(api_key_env)
+    if env_key:
+        return env_key.strip()
+
+    if api_key_file:
+        path = Path(api_key_file).expanduser()
+        if not path.is_file():
+            raise InputError(f"tckdb.api_key_file does not exist: {path}")
+        try:
+            content = path.read_text(encoding="utf-8")
+        except OSError as exc:
+            raise InputError(
+                f"tckdb.api_key_file is not readable ({path}): {exc}"
+            ) from exc
+        key = content.strip()
+        if not key:
+            raise InputError(f"tckdb.api_key_file is empty: {path}")
+        return key
+
+    if api_key_env_file:
+        path = Path(api_key_env_file).expanduser()
+        if not path.is_file():
+            raise InputError(f"tckdb.api_key_env_file does not exist: {path}")
+        try:
+            key = _read_tckdb_api_key_from_env_file(path, api_key_env)
+        except OSError as exc:
+            raise InputError(
+                f"tckdb.api_key_env_file is not readable ({path}): {exc}"
+            ) from exc
+        if not key:
+            raise InputError(
+                f"tckdb.api_key_env_file does not define {api_key_env}: {path}"
+            )
+        return key
+
+    return None
+
+
+def _read_tckdb_api_key_from_env_file(path: Path, var_name: str) -> str | None:
+    """Read ``var_name`` from a small shell/dotenv-style env file.
+
+    Recognized assignment shapes (per :func:`shlex.split` in POSIX mode):
+
+    - ``KEY=abc``
+    - ``KEY='abc'``
+    - ``KEY="abc"``
+    - ``export KEY=...``
+
+    Blank lines and ``#``-prefixed comments are ignored. Other variables
+    are skipped. The file is parsed as text — never executed — and POSIX
+    ``shlex`` mode does not perform ``$VAR`` interpolation.
+
+    Returns the first matching value found, or ``None`` if the variable
+    is not assigned (or its value is empty / unparseable).
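+
+    A file like the following (illustrative) resolves to ``tck_abc123``
+    when ``var_name == "TCKDB_API_KEY"``::
+
+        # deployment-local secrets
+        OTHER_VAR=ignored
+        export TCKDB_API_KEY='tck_abc123'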
+ """ + prefix = f"{var_name}=" + for raw_line in path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + if line.startswith("export "): + line = line[len("export "):].lstrip() + if not line.startswith(prefix): + continue + value = line[len(prefix):].strip() + if not value: + return None + try: + tokens = shlex.split(value, posix=True) + except ValueError: + return None + if tokens: + return tokens[0].strip() + return None diff --git a/arc/tckdb/config_test.py b/arc/tckdb/config_test.py index d9abbc35ad..bb4dd919f5 100644 --- a/arc/tckdb/config_test.py +++ b/arc/tckdb/config_test.py @@ -5,9 +5,12 @@ import logging import os +import tempfile import unittest +from pathlib import Path from unittest import mock +from arc.exceptions import InputError from arc.tckdb.config import ( DEFAULT_API_KEY_ENV, DEFAULT_ARTIFACT_KINDS, @@ -18,6 +21,8 @@ TCKDBArtifactConfig, TCKDBConfig, VALID_ARTIFACT_KINDS, + _read_tckdb_api_key_from_env_file, + resolve_tckdb_api_key, ) @@ -205,6 +210,270 @@ def test_implemented_only_kinds_no_warning(self): self.assertNotIn("doesn't yet produce", joined) +class TestResolveTckdbApiKey(unittest.TestCase): + """Resolution of the API key from env vs configured local files.""" + + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.tmp = Path(self._tmp.name) + + def _write(self, name: str, content: str) -> Path: + p = self.tmp / name + p.write_text(content, encoding="utf-8") + return p + + def test_env_var_wins_over_api_key_file(self): + path = self._write("key.txt", "from_file_key") + with mock.patch.dict(os.environ, {"X_T_KEY": "from_env_key"}, clear=False): + got = resolve_tckdb_api_key( + api_key_env="X_T_KEY", + api_key_file=str(path), + ) + self.assertEqual(got, "from_env_key") + + def test_api_key_file_returns_raw_key(self): + path = self._write("key.txt", "tck_abcdef") + os.environ.pop("X_T_KEY", None) + got = resolve_tckdb_api_key(api_key_env="X_T_KEY", api_key_file=str(path)) + self.assertEqual(got, "tck_abcdef") + + def test_api_key_file_strips_trailing_newline(self): + path = self._write("key.txt", "tck_abcdef\n") + os.environ.pop("X_T_KEY", None) + got = resolve_tckdb_api_key(api_key_env="X_T_KEY", api_key_file=str(path)) + self.assertEqual(got, "tck_abcdef") + + def test_api_key_file_strips_surrounding_whitespace(self): + path = self._write("key.txt", " tck_abcdef \n\n") + os.environ.pop("X_T_KEY", None) + got = resolve_tckdb_api_key(api_key_env="X_T_KEY", api_key_file=str(path)) + self.assertEqual(got, "tck_abcdef") + + def test_missing_api_key_file_raises(self): + os.environ.pop("X_T_KEY", None) + with self.assertRaises(InputError) as ctx: + resolve_tckdb_api_key( + api_key_env="X_T_KEY", + api_key_file=str(self.tmp / "does_not_exist"), + ) + self.assertIn("does not exist", str(ctx.exception)) + + def test_empty_api_key_file_raises(self): + path = self._write("key.txt", " \n\n") + os.environ.pop("X_T_KEY", None) + with self.assertRaises(InputError) as ctx: + resolve_tckdb_api_key( + api_key_env="X_T_KEY", + api_key_file=str(path), + ) + self.assertIn("empty", str(ctx.exception)) + + def test_no_sources_returns_none(self): + os.environ.pop("X_T_KEY", None) + self.assertIsNone(resolve_tckdb_api_key(api_key_env="X_T_KEY")) + + def test_user_home_expansion(self): + # Paths starting with ~ should be expanded. 
+ with mock.patch.dict(os.environ, {"HOME": str(self.tmp)}, clear=False): + self._write("key.txt", "tck_home") + os.environ.pop("X_T_KEY", None) + got = resolve_tckdb_api_key( + api_key_env="X_T_KEY", + api_key_file="~/key.txt", + ) + self.assertEqual(got, "tck_home") + + # api_key_env_file paths ------------------------------------------------- + + def test_env_file_supports_unquoted(self): + path = self._write("auth.env", "TCKDB_API_KEY=tck_unq\n") + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertEqual(got, "tck_unq") + + def test_env_file_supports_single_quoted(self): + path = self._write("auth.env", "TCKDB_API_KEY='tck_sq'\n") + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertEqual(got, "tck_sq") + + def test_env_file_supports_double_quoted(self): + path = self._write("auth.env", 'TCKDB_API_KEY="tck_dq"\n') + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertEqual(got, "tck_dq") + + def test_env_file_supports_export_prefix(self): + path = self._write("auth.env", "export TCKDB_API_KEY='tck_exp'\n") + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertEqual(got, "tck_exp") + + def test_env_file_ignores_comments_and_blank_lines(self): + body = ( + "# top comment\n" + "\n" + "OTHER_VAR=irrelevant\n" + " # indented comment\n" + "\n" + "export TCKDB_API_KEY='tck_real'\n" + "TRAILING=stuff\n" + ) + path = self._write("auth.env", body) + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertEqual(got, "tck_real") + + def test_env_file_missing_var_raises(self): + path = self._write("auth.env", "OTHER_VAR=not_the_one\n") + os.environ.pop("TCKDB_API_KEY", None) + with self.assertRaises(InputError) as ctx: + resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertIn("does not define", str(ctx.exception)) + + def test_env_file_missing_path_raises(self): + os.environ.pop("TCKDB_API_KEY", None) + with self.assertRaises(InputError) as ctx: + resolve_tckdb_api_key( + api_key_env_file=str(self.tmp / "no_such_auth.env"), + ) + self.assertIn("does not exist", str(ctx.exception)) + + def test_env_file_does_not_execute_shell(self): + # If the parser were sourcing the file, the $(...) would run + # and the value would equal "SHOULD_NOT_RUN" (echo's output). + # Asserting we get the literal, untransformed token proves the + # subshell never ran. + body = "TCKDB_API_KEY='$(echo SHOULD_NOT_RUN)'\n" + path = self._write("auth.env", body) + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertEqual(got, "$(echo SHOULD_NOT_RUN)") + self.assertNotEqual(got, "SHOULD_NOT_RUN") + + def test_env_file_does_not_interpolate_dollar_vars(self): + # POSIX shlex does not expand $VAR. Confirm we hand back the + # literal "$HOME" rather than its expanded value. 
+ path = self._write("auth.env", "TCKDB_API_KEY='$HOME'\n") + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key(api_key_env_file=str(path)) + self.assertEqual(got, "$HOME") + + def test_env_file_uses_configured_var_name(self): + path = self._write("auth.env", "MY_KEY='tck_custom'\n") + os.environ.pop("MY_KEY", None) + got = resolve_tckdb_api_key( + api_key_env="MY_KEY", + api_key_env_file=str(path), + ) + self.assertEqual(got, "tck_custom") + + # Resolution-order priority --------------------------------------------- + + def test_api_key_file_takes_precedence_over_env_file(self): + kf = self._write("key.txt", "tck_from_keyfile") + ef = self._write("auth.env", "TCKDB_API_KEY='tck_from_envfile'\n") + os.environ.pop("TCKDB_API_KEY", None) + got = resolve_tckdb_api_key( + api_key_file=str(kf), + api_key_env_file=str(ef), + ) + self.assertEqual(got, "tck_from_keyfile") + + +class TestReadEnvFileHelper(unittest.TestCase): + """Direct tests of the env-file parser.""" + + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.path = Path(self._tmp.name) / "auth.env" + + def _write(self, content: str) -> Path: + self.path.write_text(content, encoding="utf-8") + return self.path + + def test_returns_none_when_var_absent(self): + self._write("OTHER=xx\n") + self.assertIsNone(_read_tckdb_api_key_from_env_file(self.path, "TCKDB_API_KEY")) + + def test_first_match_wins(self): + self._write("TCKDB_API_KEY=first\nTCKDB_API_KEY=second\n") + self.assertEqual( + _read_tckdb_api_key_from_env_file(self.path, "TCKDB_API_KEY"), + "first", + ) + + +class TestTCKDBConfigKeyFileFields(unittest.TestCase): + """from_dict + integration with resolve_api_key().""" + + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.tmp = Path(self._tmp.name) + + def test_from_dict_parses_file_paths(self): + cfg = TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "api_key_file": "/etc/tckdb/key", + "api_key_env_file": "/etc/tckdb/auth.env", + }) + self.assertEqual(cfg.api_key_file, "/etc/tckdb/key") + self.assertEqual(cfg.api_key_env_file, "/etc/tckdb/auth.env") + + def test_from_dict_rejects_non_string_paths(self): + with self.assertRaises(ValueError): + TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "api_key_file": ["/a", "/b"], + }) + + def test_resolve_api_key_uses_api_key_file(self): + kf = self.tmp / "key.txt" + kf.write_text("tck_via_cfg\n", encoding="utf-8") + cfg = TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "api_key_env": "DOES_NOT_EXIST_X_X", + "api_key_file": str(kf), + }) + os.environ.pop("DOES_NOT_EXIST_X_X", None) + self.assertEqual(cfg.resolve_api_key(), "tck_via_cfg") + + def test_resolve_api_key_env_var_overrides_config_file(self): + kf = self.tmp / "key.txt" + kf.write_text("from_file\n", encoding="utf-8") + cfg = TCKDBConfig.from_dict({ + "enabled": True, + "base_url": "http://x", + "api_key_env": "X_OVERRIDE_KEY", + "api_key_file": str(kf), + }) + with mock.patch.dict(os.environ, {"X_OVERRIDE_KEY": "from_env"}, clear=False): + self.assertEqual(cfg.resolve_api_key(), "from_env") + + def test_describe_api_key_sources_terse_default(self): + cfg = TCKDBConfig(enabled=True, base_url="http://x", api_key_env="X_KEY") + self.assertEqual(cfg.describe_api_key_sources(), "env var 'X_KEY'") + + def test_describe_api_key_sources_with_files(self): + cfg = TCKDBConfig( + enabled=True, + base_url="http://x", + api_key_env="X_KEY", + 
api_key_file="/etc/tckdb/key", + api_key_env_file="/etc/tckdb/auth.env", + ) + desc = cfg.describe_api_key_sources() + self.assertIn("env var 'X_KEY'", desc) + self.assertIn("api_key_file=/etc/tckdb/key", desc) + self.assertIn("api_key_env_file=/etc/tckdb/auth.env", desc) + + if __name__ == "__main__": import logging unittest.main() diff --git a/arc/tckdb/constraints.py b/arc/tckdb/constraints.py new file mode 100644 index 0000000000..fe52d6b064 --- /dev/null +++ b/arc/tckdb/constraints.py @@ -0,0 +1,195 @@ +"""Held-fixed coordinate constraints for TCKDB calculation payloads. + +Internal representation + serializer for the ``constraints`` field on +``CalculationWithResultsPayload`` (and the bundle-aware variants used +by computed-species / computed-reaction). + +Producers (Gaussian/ORCA parser code) build a list of +:class:`TCKDBCalculationConstraint` instances; the TCKDB adapter calls +:func:`serialize_constraints` to emit the final TCKDB-shaped dict list +with deterministic 1-based ``constraint_index`` values. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Iterable, Mapping + +from arc.common import get_logger + + +logger = get_logger() + + +_VALID_KINDS: frozenset[str] = frozenset({ + 'cartesian_atom', + 'bond', + 'angle', + 'dihedral', + 'improper', +}) + +# Number of atom indices required for each TCKDB constraint kind. +_ATOMS_PER_KIND: dict[str, int] = { + 'cartesian_atom': 1, + 'bond': 2, + 'angle': 3, + 'dihedral': 4, + 'improper': 4, +} + + +@dataclass(frozen=True) +class TCKDBCalculationConstraint: + """A held-fixed coordinate constraint to emit into a TCKDB calculation payload. + + Atom indices are 1-based per TCKDB's convention. ``target_value`` is + optional — emitted only when the producer has a reliable parsed value + from the ESS input deck or log. + """ + + constraint_kind: str + atom1_index: int + atom2_index: int | None = None + atom3_index: int | None = None + atom4_index: int | None = None + target_value: float | None = None + + @classmethod + def from_atoms( + cls, + constraint_kind: str, + atoms: list[int] | tuple[int, ...], + target_value: float | None = None, + ) -> "TCKDBCalculationConstraint": + """Build from a flat ``atoms`` list, padding unused slots with None.""" + padded: list[int | None] = list(atoms) + [None] * (4 - len(atoms)) + return cls( + constraint_kind=constraint_kind, + atom1_index=padded[0], # type: ignore[arg-type] + atom2_index=padded[1], + atom3_index=padded[2], + atom4_index=padded[3], + target_value=target_value, + ) + + +def _validate(c: TCKDBCalculationConstraint) -> bool: + """Return True if ``c`` is internally consistent for TCKDB emission. + + Validates that the kind is recognised, that the right number of atom + slots are filled, and that all filled atom indices are 1-based ints. + Logs a warning and returns False when invalid — the caller drops the + constraint and continues. 
+ """ + if c.constraint_kind not in _VALID_KINDS: + logger.warning("TCKDB constraint: unknown kind %r; dropping", + c.constraint_kind) + return False + expected = _ATOMS_PER_KIND[c.constraint_kind] + indices = [c.atom1_index, c.atom2_index, c.atom3_index, c.atom4_index] + filled = [i for i in indices if i is not None] + if len(filled) != expected: + logger.warning("TCKDB constraint: kind %s expects %d atom indices, " + "got %d; dropping", c.constraint_kind, expected, len(filled)) + return False + for idx in filled: + if not isinstance(idx, int) or idx < 1: + logger.warning("TCKDB constraint: non-positive or non-integer " + "atom index %r; dropping", idx) + return False + return True + + +def serialize_constraints( + constraints: Iterable[TCKDBCalculationConstraint | Mapping[str, Any]], +) -> list[dict[str, Any]]: + """Serialize an iterable of constraints into TCKDB payload shape. + + Accepts either :class:`TCKDBCalculationConstraint` instances or the + parser-shaped dicts ``{'constraint_kind', 'atoms', 'target_value'}`` + that the Gaussian/ORCA parsers emit. Mixed input is fine. + + Output shape per element:: + + { + 'constraint_index': int, # 1-based, deterministic + 'constraint_kind': str, + 'atom1_index': int, + 'atom2_index': int | omitted, + 'atom3_index': int | omitted, + 'atom4_index': int | omitted, + 'target_value': float | omitted, + } + + Returns ``[]`` when the input is empty or every entry is invalid. + """ + out: list[dict[str, Any]] = [] + next_index = 1 + for raw in constraints: + c = _coerce(raw) + if c is None: + continue + if not _validate(c): + continue + entry: dict[str, Any] = { + 'constraint_index': next_index, + 'constraint_kind': c.constraint_kind, + 'atom1_index': c.atom1_index, + } + if c.atom2_index is not None: + entry['atom2_index'] = c.atom2_index + if c.atom3_index is not None: + entry['atom3_index'] = c.atom3_index + if c.atom4_index is not None: + entry['atom4_index'] = c.atom4_index + if c.target_value is not None: + entry['target_value'] = float(c.target_value) + out.append(entry) + next_index += 1 + return out + + +def _coerce( + raw: TCKDBCalculationConstraint | Mapping[str, Any], +) -> TCKDBCalculationConstraint | None: + """Coerce a parser-dict OR existing dataclass instance into the dataclass. + + Returns None and logs a warning when the input is shaped wrong (e.g., + parser dict missing 'atoms' or 'constraint_kind'). The caller skips. 
+ """ + if isinstance(raw, TCKDBCalculationConstraint): + return raw + if not isinstance(raw, Mapping): + logger.warning("TCKDB constraint: expected dataclass or mapping, " + "got %s; dropping", type(raw).__name__) + return None + kind = raw.get('constraint_kind') + atoms = raw.get('atoms') + if not isinstance(kind, str): + logger.warning("TCKDB constraint: missing or non-string " + "'constraint_kind' in %r; dropping", raw) + return None + if not isinstance(atoms, (list, tuple)) or not atoms: + logger.warning("TCKDB constraint: missing or empty 'atoms' in %r; " + "dropping", raw) + return None + try: + atom_ints = [int(a) for a in atoms] + except (TypeError, ValueError): + logger.warning("TCKDB constraint: non-integer atom index in %r; " + "dropping", raw) + return None + target_value = raw.get('target_value') + if target_value is not None: + try: + target_value = float(target_value) + except (TypeError, ValueError): + logger.warning("TCKDB constraint: non-numeric target_value %r; " + "treating as absent", target_value) + target_value = None + return TCKDBCalculationConstraint.from_atoms( + constraint_kind=kind, + atoms=atom_ints, + target_value=target_value, + ) diff --git a/arc/tckdb/payload_writer.py b/arc/tckdb/payload_writer.py index 42615cb50b..54d23c892e 100644 --- a/arc/tckdb/payload_writer.py +++ b/arc/tckdb/payload_writer.py @@ -132,6 +132,7 @@ class PayloadWriter: SUBDIR = "conformer_calculation" ARTIFACT_SUBDIR = "calculation_artifacts" COMPUTED_SPECIES_SUBDIR = "computed_species" + COMPUTED_REACTION_SUBDIR = "computed_reaction" PAYLOAD_SUFFIX = ".payload.json" SIDECAR_SUFFIX = ".meta.json" ARTIFACT_SIDECAR_SUFFIX = ".artifact.meta.json" diff --git a/arc/tckdb/sweep.py b/arc/tckdb/sweep.py new file mode 100644 index 0000000000..2af6717bc6 --- /dev/null +++ b/arc/tckdb/sweep.py @@ -0,0 +1,316 @@ +"""End-of-run TCKDB upload sweep. + +Reads ``/output/output.yml`` and dispatches per +``TCKDBConfig.upload_mode``: + +- ``conformer``: per-species ``/uploads/conformers`` POST + per-artifact + POSTs to ``/calculations/{id}/artifacts``. +- ``computed_species``: one ``/uploads/computed-species`` bundle per + species, with artifacts inlined under each calc. +- ``computed_reaction``: one ``/uploads/computed-reaction`` bundle per + reaction. Species + TS + kinetics all ship in the reaction bundle; + the per-species sweep is *not* run. + +Lives in its own module so both the post-``execute()`` hook in +``ARC.py`` and the standalone CLI (``arc/tckdb/cli.py``) can call the +same code path. Functions take ``project_directory`` directly rather +than the live ARC object — output.yml is the contract. +""" + +import os + +from arc.common import read_yaml_file +from arc.tckdb.config import ( + IMPLEMENTED_ARTIFACT_KINDS, + UPLOAD_MODE_COMPUTED_REACTION, + UPLOAD_MODE_COMPUTED_SPECIES, +) + + +def run_upload_sweep(*, adapter, project_directory, tckdb_config): + """Top-level dispatch: load output.yml, route per upload_mode, print summary. + + Returns ``None``. Side effects: writes payloads/sidecars under + ``payload_dir``, optionally POSTs to TCKDB, and prints a one-line + summary per upload kind. + """ + output_path = os.path.join(project_directory, 'output', 'output.yml') + if not os.path.exists(output_path): + # Most common cause: the run was interrupted before + # write_output_yml ran. Skip cleanly rather than scrape live + # objects — the replay path expects output.yml as the contract. 
+ print(f'TCKDB upload skipped: {output_path} not found (run did not complete?)') + return + + output_doc = read_yaml_file(path=output_path) + + if tckdb_config.upload_mode == UPLOAD_MODE_COMPUTED_REACTION: + # Reaction mode is its own iteration shape: one POST per + # reaction, species + TS + kinetics carried inline. Species + # records reach TCKDB through whichever reaction(s) reference + # them; standalone species uploads need a different mode. + _run_reaction_sweep(adapter=adapter, output_doc=output_doc) + return + + _run_species_sweep( + adapter=adapter, output_doc=output_doc, tckdb_config=tckdb_config, + ) + + +def _run_species_sweep(*, adapter, output_doc, tckdb_config): + """Per-species iteration shared by ``conformer`` and ``computed_species`` modes.""" + species_records = list(output_doc.get('species') or []) + ts_records = list(output_doc.get('transition_states') or []) + # The species-only modes cover minima only; TS records are deferred + # to a future TS-specific adapter method targeting + # /uploads/transition-states (different schema, no SMILES + # requirement). Reaction mode handles TSes inline elsewhere. + n_ts_deferred = sum(1 for r in ts_records if r.get('converged')) + + is_bundle_mode = tckdb_config.upload_mode == UPLOAD_MODE_COMPUTED_SPECIES + + counts = {'uploaded': 0, 'skipped': 0, 'failed': 0} + artifact_counts = {'uploaded': 0, 'skipped': 0, 'failed': 0} + failures = [] + artifact_failures = [] + n_attempted = 0 + for record in species_records: + label = record.get('label') or '' + if not record.get('converged'): + continue + n_attempted += 1 + try: + if is_bundle_mode: + # Single bundle carries species_entry + conformer + + # opt/freq/sp + (optional) thermo + inlined artifacts. + outcome = adapter.submit_computed_species_from_output( + output_doc=output_doc, species_record=record, + ) + else: + outcome = adapter.submit_from_output( + output_doc=output_doc, species_record=record, + ) + except Exception as exc: + counts['failed'] += 1 + failures.append((label, f'{type(exc).__name__}: {exc}')) + continue + if outcome is None: + continue + counts[outcome.status] = counts.get(outcome.status, 0) + 1 + if outcome.status == 'failed': + failures.append((label, outcome.error or 'unknown error')) + elif ( + outcome.status == 'uploaded' + and not is_bundle_mode + and tckdb_config.artifacts.upload + ): + # Artifact sweep is conformer-mode only — the bundle path + # carries artifacts inline under each calc. + _sweep_artifacts_for_species( + adapter=adapter, + output_doc=output_doc, + species_record=record, + outcome=outcome, + counts=artifact_counts, + failures=artifact_failures, + kinds=_implementable_kinds_from_config(tckdb_config), + ) + + mode_label = 'computed-species bundle' if is_bundle_mode else 'conformer/calculation' + print(f'TCKDB v0 ({mode_label}, {n_attempted} converged species):') + print(f' uploaded: {counts["uploaded"]} skipped: {counts["skipped"]} failed: {counts["failed"]}') + if not is_bundle_mode and tckdb_config.artifacts.upload: + # Bundle mode rolls artifacts into the same upload, so a + # standalone artifact summary line would be misleading. 
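+ # Note: artifact counters tally per-(calc, kind) attempts; a
+ # missing local file registers as 'skipped' rather than 'failed'.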
+ print(
+ f' artifacts: uploaded {artifact_counts["uploaded"]} '
+ f'skipped {artifact_counts["skipped"]} failed {artifact_counts["failed"]}'
+ )
+ if n_ts_deferred:
+ print(f' ({n_ts_deferred} converged TS deferred — TS-specific adapter not yet implemented)')
+ for label, err in failures:
+ print(f' failed: {label} — {err}')
+ for label, kind, err in artifact_failures:
+ print(f' failed artifact: {label} ({kind}) — {err}')
+
+
+def _run_reaction_sweep(*, adapter, output_doc):
+ """One ``/uploads/computed-reaction`` POST per ``output_doc['reactions']`` entry.
+
+ Edge cases:
+ - no ``reactant_labels`` / ``product_labels`` → reaction
+ builder raises ValueError; we count + log the failure.
+ - ts_label points at a TS without a parsed xyz → same.
+ - reaction has no kinetics block → not skipped: still
+ uploaded (provenance value), with empty kinetics.
+ """
+ reaction_records = list(output_doc.get('reactions') or [])
+ n_attempted = len(reaction_records)
+ counts = {'uploaded': 0, 'skipped': 0, 'failed': 0}
+ failures: list[tuple[str, str]] = []
+
+ for record in reaction_records:
+ label = record.get('label') or ''
+ try:
+ outcome = adapter.submit_computed_reaction_from_output(
+ output_doc=output_doc, reaction_record=record,
+ )
+ except Exception as exc:
+ counts['failed'] += 1
+ failures.append((label, f'{type(exc).__name__}: {exc}'))
+ continue
+ if outcome is None:
+ continue
+ counts[outcome.status] = counts.get(outcome.status, 0) + 1
+ if outcome.status == 'failed':
+ failures.append((label, outcome.error or 'unknown error'))
+
+ print(f'TCKDB v0 (computed-reaction bundle, {n_attempted} reactions):')
+ print(f' uploaded: {counts["uploaded"]} skipped: {counts["skipped"]} failed: {counts["failed"]}')
+ if not reaction_records:
+ # Common cause: ARC ran species jobs but kinetics fitting
+ # didn't produce any reactions in output.yml. Surface this so
+ # the user knows why nothing was uploaded.
+ print(' (no reactions in output.yml — kinetics fit may not have run)')
+ for label, err in failures:
+ print(f' failed: {label} — {err}')
+
+
+_CALC_TYPE_TO_LOG_KEY = {
+ 'opt': 'opt_log',
+ 'freq': 'freq_log',
+ 'sp': 'sp_log',
+}
+
+# Companion mapping for input-deck paths, emitted by ``arc/output.py``
+# alongside the log paths. Per-job, with per-job software → per-job
+# filename, and only set when the deck file is on disk.
+_CALC_TYPE_TO_INPUT_KEY = {
+ 'opt': 'opt_input',
+ 'freq': 'freq_input',
+ 'sp': 'sp_input',
+}
+
+
+def _implementable_kinds_from_config(tckdb_config):
+ """Intersect user-configured kinds with ARC's IMPLEMENTED_ARTIFACT_KINDS.
+
+ The config-parse step warns about valid-but-not-implemented kinds;
+ this filter is the runtime side of the same gate, so the sweep
+ silently skips them rather than calling the adapter (which would
+ skip with a defensive log message anyway).
+ """
+ return tuple(k for k in tckdb_config.artifacts.kinds if k in IMPLEMENTED_ARTIFACT_KINDS)
+
+
+def _resolve_artifact_path(*, kind, calc_type, species_record, output_doc):
+ """Resolve the local file path to upload for a (kind, calc_type) pair.
+
+ Returns ``None`` if there's nothing to upload for this combination
+ (e.g. unsupported calc type, file not on disk, engine unknown).
+
+ For ``output_log``, the path is keyed off the species_record's
+ log fields (``opt_log`` / ``freq_log`` / ``sp_log``).
+
+ For ``input``, we prefer the per-job ``*_input`` path recorded in
+ the species record; for older output.yml files that predate it, the
+ input deck (``input.gjf``, ``ZMAT``, ``input.in``, etc.) is always
+ written as a sibling of the output log, so we derive its name from
+ ``arc.imports.settings['input_filenames']`` keyed on the engine in
+ ``output_doc['opt_level']['software']``.
+ """
+ log_key = _CALC_TYPE_TO_LOG_KEY.get(str(calc_type).lower())
+ if log_key is None:
+ return None
+ log_path = species_record.get(log_key)
+ if not log_path:
+ return None
+ if kind == 'output_log':
+ return log_path
+ if kind == 'input':
+ # Prefer the path emitted directly by ``arc/output.py``: it's
+ # per-job (so a Gaussian opt + Molpro sp run picks the right
+ # deck per calc), and existence on disk has already been
+ # verified at output-write time.
+ input_field = _CALC_TYPE_TO_INPUT_KEY.get(str(calc_type).lower())
+ if input_field:
+ recorded = species_record.get(input_field)
+ if recorded:
+ return recorded
+ # Back-compat: older output.yml files predating the
+ # ``_input`` schema extension. Derive from the opt-level
+ # software via settings['input_filenames']. Same logic as before
+ # — kept so old runs can still upload input decks via the
+ # primitive endpoint.
+ from arc.imports import settings as _arc_settings
+ opt_level = output_doc.get('opt_level') or {}
+ engine = (opt_level.get('software') or '').lower() if isinstance(opt_level, dict) else ''
+ input_filenames = _arc_settings.get('input_filenames', {})
+ input_name = input_filenames.get(engine)
+ if not input_name:
+ return None
+ return os.path.join(os.path.dirname(log_path), input_name)
+ return None
+
+
+def _sweep_artifacts_for_species(
+ *,
+ adapter,
+ output_doc,
+ species_record,
+ outcome,
+ counts,
+ failures,
+ kinds,
+):
+ """For one converged species' conformer upload, push artifacts to each calc.
+
+ Iterates the calc refs returned by the conformer upload (primary +
+ additional) and, for each, iterates the configured kinds. Resolves
+ the right local file path per (kind, calc_type) and dispatches to
+ ``adapter.submit_artifacts_for_calculation``. Updates ``counts`` and
+ ``failures`` in place.
+ """
+ label = species_record.get('label') or ''
+ refs = []
+ if outcome.primary_calculation:
+ refs.append(outcome.primary_calculation)
+ refs.extend(outcome.additional_calculations or [])
+ if not refs:
+ # Older server response without calc refs — skip artifact upload
+ # for this species rather than guess at IDs.
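+ # Counters stay untouched here, so the printed artifact summary
+ # reflects only species whose upload response included calc refs.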
+ return + for ref in refs: + calc_id = ref.get('calculation_id') + calc_type = ref.get('type') + if calc_id is None or calc_type is None: + continue + for kind in kinds: + file_path = _resolve_artifact_path( + kind=kind, + calc_type=calc_type, + species_record=species_record, + output_doc=output_doc, + ) + if file_path is None: + counts['skipped'] = counts.get('skipped', 0) + 1 + continue + try: + art_outcome = adapter.submit_artifacts_for_calculation( + output_doc=output_doc, + species_record=species_record, + calculation_id=int(calc_id), + calculation_type=str(calc_type), + file_path=file_path, + kind=kind, + ) + except Exception as exc: + counts['failed'] = counts.get('failed', 0) + 1 + failures.append((label, kind, f'{type(exc).__name__}: {exc}')) + continue + if art_outcome is None: + continue + counts[art_outcome.status] = counts.get(art_outcome.status, 0) + 1 + if art_outcome.status == 'failed': + failures.append((label, art_outcome.kind, art_outcome.error or 'unknown error')) + + +__all__ = ['run_upload_sweep'] diff --git a/docs/output_yml_schema.md b/docs/output_yml_schema.md index cae4c2a4db..4da71907a6 100644 --- a/docs/output_yml_schema.md +++ b/docs/output_yml_schema.md @@ -42,7 +42,7 @@ output.yml │ ├── ess_versions? │ ├── thermo? │ │ ├── h298_kj_mol, s298_j_mol_k, tmin_k, tmax_k -│ │ ├── cp_data?: [{temperature_k, cp_j_mol_k}, ...] +│ │ ├── thermo_points?: [{temperature_k, cp_j_mol_k, h_kj_mol, s_j_mol_k, g_kj_mol}, ...] │ │ ├── nasa_low?: {tmin_k, tmax_k, coeffs} │ │ └── nasa_high?: {tmin_k, tmax_k, coeffs} │ └── statmech? @@ -55,7 +55,7 @@ output.yml ├── transition_states: [] │ └── (all species fields, plus:) │ ├── chosen_ts_method?, successful_ts_methods? -│ ├── neb_log?, irc_logs: [], irc_converged? +│ ├── neb_log?, gsm_log?, irc_logs: [], irc_converged? │ └── rxn_label │ └── reactions: [] @@ -160,16 +160,19 @@ All paths are relative to the project directory. | `s298_j_mol_k` | `float` | Standard entropy at 298 K (J/(mol K)) | | `tmin_k` | `float` | Minimum temperature (K) | | `tmax_k` | `float` | Maximum temperature (K) | -| `cp_data` | `list?` | Tabulated heat capacity (see below) | +| `thermo_points` | `list?` | Tabulated per-temperature thermochemistry (see below) | | `nasa_low` | `dict?` | Low-temperature NASA polynomial | | `nasa_high` | `dict?` | High-temperature NASA polynomial | -**`cp_data`** entries: +**`thermo_points`** entries (one per evaluation temperature; `temperature_k` is required, all others are optional but emitted by default when produced via `arc/scripts/save_arkane_thermo.py`): | Field | Type | Description | |---|---|---| | `temperature_k` | `float` | Temperature (K) | -| `cp_j_mol_k` | `float` | Heat capacity at constant pressure (J/(mol K)) | +| `cp_j_mol_k` | `float?` | Heat capacity at constant pressure (J/(mol K)) | +| `h_kj_mol` | `float?` | Enthalpy at this temperature (kJ/mol) | +| `s_j_mol_k` | `float?` | Entropy at this temperature (J/(mol K)) | +| `g_kj_mol` | `float?` | Gibbs free energy at this temperature (kJ/mol) | **`nasa_low` / `nasa_high`**: @@ -218,7 +221,8 @@ All paths are relative to the project directory. 
| `imag_freq_cm1` | `float?` | Imaginary frequency (cm-1) | | `chosen_ts_method` | `str?` | The TS search method that was selected | | `successful_ts_methods` | `list[str]?` | All TS methods that succeeded | -| `neb_log` | `str?` | Run-relative path to NEB log | +| `neb_log` | `str?` | Run-relative path to NEB log (set when chosen TS method is `orca_neb`) | +| `gsm_log` | `str?` | Run-relative path to GSM stringfile (set when chosen TS method is `xtb_gsm`) | | `irc_logs` | `list[str]` | Run-relative paths to IRC logs | | `irc_converged` | `bool?` | Whether IRC converged (`null` if IRC was not requested) | | `rxn_label` | `str` | Reaction label this TS belongs to | From e026fa268750798a017f665fafd05e393e4ffc35 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Fri, 8 May 2026 16:46:43 +0300 Subject: [PATCH 10/12] More updates --- .gitignore | 6 +- arc/job/adapters/gaussian.py | 8 +- arc/job/adapters/scripts/xtb_gsm/ograd | 18 + arc/job/adapters/ts/xtb_gsm.py | 7 + arc/job/adapters/ts/xtbgsm_test.py | 39 + arc/job/trsh.py | 145 ++ arc/job/trsh_test.py | 113 + arc/output.py | 58 + arc/output_test.py | 189 ++ arc/reaction/reaction_test.py | 38 + arc/species/species.py | 32 + arc/species/species_test.py | 41 +- arc/tckdb/adapter.py | 503 ++++- arc/tckdb/adapter_test.py | 881 +++++++- arc/tckdb/cli_test.py | 4 +- arc/tckdb/config.py | 13 + arc/tckdb/config_test.py | 20 + arc/tckdb/payload_writer.py | 68 +- arc/tckdb/payload_writer_test.py | 126 ++ arc/tckdb/sweep.py | 79 +- .../calcs/Species/spc1/opt_a1313/input.gjf | 12 + .../calcs/Species/spc1/opt_a1313/submit.sh | 56 + .../conf_opt_a1313/input.gjf | 12 + .../conf_opt_a1313/submit.sh | 56 + .../scan_a1313/input.gjf | 20 + docs/gaussian.md | 114 + docs/gaussian_imp.md | 677 ++++++ docs/tckdb-integration.md | 153 +- leng_gauss.md | 1867 +++++++++++++++++ wang_gauss.md | 1265 +++++++++++ 30 files changed, 6569 insertions(+), 51 deletions(-) create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a1313/input.gjf create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1/opt_a1313/submit.sh create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a1313/input.gjf create mode 100644 arc/testing/test_JobAdapter/calcs/Species/spc1_and_2_others/conf_opt_a1313/submit.sh create mode 100644 arc/testing/test_JobAdapter_scan/calcs/Species/methanol_and_5_others/scan_a1313/input.gjf create mode 100644 docs/gaussian.md create mode 100644 docs/gaussian_imp.md create mode 100644 leng_gauss.md create mode 100644 wang_gauss.md diff --git a/.gitignore b/.gitignore index bee2d5e0ec..a87e6b816e 100644 --- a/.gitignore +++ b/.gitignore @@ -69,5 +69,9 @@ build/* *.log *.xml -# AI Agent files +# AI Agent files and folders AGENTS.md +.claude/* +.vexb/* + +ARC.egg-info/* diff --git a/arc/job/adapters/gaussian.py b/arc/job/adapters/gaussian.py index 314e796641..71e7ca2456 100644 --- a/arc/job/adapters/gaussian.py +++ b/arc/job/adapters/gaussian.py @@ -306,10 +306,14 @@ def write_input_file(self) -> None: if input_dict['trsh']: input_dict['trsh'] += ' ' input_dict['trsh'] += 'scf=(tight,direct)' + # 'no_tight' is set by trsh_keyword_loose_disp when a previous attempt hit + # MaxOptCycles with forces converged but displacement criteria unreachable. 
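+ # Illustrative effect (exact keyword set varies by run): with 'no_tight'
+ # present, the non-TS branch emits e.g. opt=(maxstep=5,maxcycle=N)
+ # instead of opt=(tight,maxstep=5,maxcycle=N).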
+ drop_tight = 'no_tight' in self.ess_trsh_methods
+ fine_opt = [] if drop_tight else ['tight']
 if self.is_ts:
- keywords.extend(['tight', 'maxstep=5'])
+ keywords.extend(fine_opt + ['maxstep=5'])
 else:
- keywords.extend(['tight', 'maxstep=5', f'maxcycle={max_c}'])
+ keywords.extend(fine_opt + ['maxstep=5', f'maxcycle={max_c}'])
 input_dict['job_type_1'] = "opt" if self.level.method_type not in ['dft', 'composite', 'wavefunction']\
 else f"opt=({', '.join(key for key in keywords)})"
diff --git a/arc/job/adapters/scripts/xtb_gsm/ograd b/arc/job/adapters/scripts/xtb_gsm/ograd
index d208a7502f..f5c01f4f3a 100644
--- a/arc/job/adapters/scripts/xtb_gsm/ograd
+++ b/arc/job/adapters/scripts/xtb_gsm/ograd
@@ -24,3 +24,21 @@ tm2orca.py $basename
 rm xtbrestart
 cd ..
+# ── Per-node provenance preservation (TCKDB path_search_result.points) ──
+# tm2orca.py renames the xTB-generated Turbomole-format ``energy``
+# and ``gradient`` files (xTB writes its --grad output in Turbomole's
+# on-disk text format; the calculation provenance is xTB, not Turbomole)
+# to ``${basename}.energy`` and ``${basename}.gradient``
+# inside scratch/. The GSM binary then consumes the ORCA-shaped
+# ``${basename}.engrad`` and may overwrite or remove the per-node files on
+# subsequent calls. Copy them (plus the captured xtb stdout) into a
+# stable side-effect directory at the run root so the TCKDB adapter's
+# parser can recover per-node electronic energies and gradient metrics
+# later. The copies are not consumed by GSM — the original scratch/
+# files stay in place unchanged for the algorithm.
+node_label="$1"
+preserve_dir="gsm_node_outputs"
+mkdir -p "$preserve_dir"
+[ -f "scratch/${basename}.energy" ] && cp -p "scratch/${basename}.energy" "$preserve_dir/${node_label}.energy"
+[ -f "scratch/${basename}.gradient" ] && cp -p "scratch/${basename}.gradient" "$preserve_dir/${node_label}.gradient"
+[ -f "scratch/${ofile}.xtbout" ] && cp -p "scratch/${ofile}.xtbout" "$preserve_dir/${node_label}.xtbout"
diff --git a/arc/job/adapters/ts/xtb_gsm.py b/arc/job/adapters/ts/xtb_gsm.py
index baa06819cf..57d7c71a73 100644
--- a/arc/job/adapters/ts/xtb_gsm.py
+++ b/arc/job/adapters/ts/xtb_gsm.py
@@ -306,6 +306,13 @@ def set_additional_file_paths(self) -> None:
 self.tm2orca_path = os.path.join(self.local_path, 'tm2orca.py')
 self.scratch_initial0000_path = os.path.join(self.local_path, 'scratch', 'initial0000.xyz')
 self.stringfile_path = os.path.join(self.local_path, 'stringfile.xyz0000')
+ # Side-effect directory written by the patched ``ograd`` wrapper.
+ # Holds per-node ``