From c55375503e361a6c711cb0e8cecca70118e16f35 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Sun, 26 Apr 2026 16:25:55 +0300 Subject: [PATCH 1/3] trsh: drop checkfile on SCF/Unconverged/BasisSet failures Previously trsh_keyword_checkfile only fired on the 'CheckFile' keyword, so a job that died with unconverged SCF (L502) had its partial check.chk inherited by the rerun. The next job's gaussian adapter then auto-emitted guess=read against MOs from a non-converged iteration, typically failing with L401 ("Basis set data is not on the checkpoint file") and wasting a full Gaussian invocation before self-healing back to guess=mix. Extend the guard to also drop the chk on 'SCF', 'Unconverged', and 'BasisSet' keywords. MaxOptCycles, InternalCoordinateError, DiskSpace, OptOrientation and other non-wavefunction failures remain unaffected, so the warm-start cycle savings on those paths are preserved. Adds a regression test that exercises a fresh ess_trsh_methods list for each drop-keyword (the existing test_trsh_ess_job masked the bug because its sub-cases mutated a shared list) and pins the keep-keyword behavior so the warm-start path can't regress silently. --- arc/job/trsh.py | 12 ++++++++++-- arc/job/trsh_test.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/arc/job/trsh.py b/arc/job/trsh.py index a1ec9a8eab..de6b3264e7 100644 --- a/arc/job/trsh.py +++ b/arc/job/trsh.py @@ -1754,9 +1754,17 @@ def scan_quality_check(label: str, def trsh_keyword_checkfile(job_status, ess_trsh_methods, couldnt_trsh) -> tuple[bool, list, bool]: """ - Check if the job requires removal of checkfile + Check if the job requires removal of checkfile. + + Drops the checkfile when the prior job either could not read it + ('CheckFile' from L301/L401), produced a non-converged wavefunction + ('SCF' from L502, 'Unconverged' from L508), or reported a failed + basis projection ('BasisSet' from L401). Reusing MOs from a + non-converged or basis-incompatible chk re-seeds the same failure. """ - if 'CheckFile' in job_status.get('keywords', '') and 'checkfile=None' not in ess_trsh_methods: + keywords = job_status.get('keywords', []) or [] + bad_wavefunction = any(k in keywords for k in ('CheckFile', 'SCF', 'Unconverged', 'BasisSet')) + if bad_wavefunction and 'checkfile=None' not in ess_trsh_methods: ess_trsh_methods.append('checkfile=None') couldnt_trsh = False return True, ess_trsh_methods, couldnt_trsh diff --git a/arc/job/trsh_test.py b/arc/job/trsh_test.py index d974874e9c..b6d6cd457f 100644 --- a/arc/job/trsh_test.py +++ b/arc/job/trsh_test.py @@ -796,6 +796,25 @@ def test_trsh_ess_job(self): num_heavy_atoms, cpu_cores, ess_trsh_methods, is_h=True, is_monoatomic=True) + def test_trsh_keyword_checkfile_drops_on_bad_wavefunction(self): + """SCF/Unconverged failures must drop the checkfile so the rerun uses guess=mix.""" + for kw in ('CheckFile', 'SCF', 'Unconverged', 'BasisSet'): + ess_trsh_methods = [] + remove, ess_trsh_methods, couldnt = trsh.trsh_keyword_checkfile( + {'keywords': [kw]}, ess_trsh_methods, couldnt_trsh=True, + ) + self.assertTrue(remove, f'{kw} should drop the checkfile') + self.assertIn('checkfile=None', ess_trsh_methods) + self.assertFalse(couldnt) + + # Failures unrelated to wavefunction quality must keep the checkfile. + for kw in ('MaxOptCycles', 'InternalCoordinateError', 'DiskSpace', 'OptOrientation'): + remove, methods, _ = trsh.trsh_keyword_checkfile( + {'keywords': [kw]}, ess_trsh_methods=[], couldnt_trsh=True, + ) + self.assertFalse(remove, f'{kw} must not drop the checkfile') + self.assertNotIn('checkfile=None', methods) + def test_determine_job_log_memory_issues(self): """Test the determine_job_log_memory_issues() function.""" job_log_path_1 = os.path.join(ARC_TESTING_PATH, 'job_log', 'no_issues.log') From e1070d10e557448bd89a702af3d804f358d5373d Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Sun, 26 Apr 2026 17:43:01 +0300 Subject: [PATCH 2/3] trsh: branch the chk-drop log message by triggering keyword After widening trsh_keyword_checkfile to fire on SCF/Unconverged/BasisSet in addition to CheckFile, the hardcoded log line "that failed with 'Basis set data is not on the checkpoint file' by removing the checkfile." falsely claimed a chk-data failure for unrelated wavefunction failures. Pick the explanatory phrase from job_status['keywords'] so the log reflects the actual failure mode (chk-read error, basis projection, unconverged SCF, or generic unconverged wavefunction). --- arc/job/trsh.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arc/job/trsh.py b/arc/job/trsh.py index de6b3264e7..7addc66310 100644 --- a/arc/job/trsh.py +++ b/arc/job/trsh.py @@ -912,7 +912,18 @@ def trsh_ess_job(label: str, # - Changing Computational Parameters remove_checkfile, ess_trsh_methods, couldnt_trsh = trsh_keyword_checkfile(job_status, ess_trsh_methods, couldnt_trsh) if remove_checkfile: - logger_info.append('that failed with "Basis set data is not on the checkpoint file" by removing the checkfile.') + chk_drop_keywords = job_status.get('keywords', []) or [] + if 'CheckFile' in chk_drop_keywords: + chk_drop_reason = '"Basis set data is not on the checkpoint file"' + elif 'BasisSet' in chk_drop_keywords: + chk_drop_reason = 'a failed basis-set projection from the prior checkpoint' + elif 'SCF' in chk_drop_keywords: + chk_drop_reason = 'an unconverged SCF in the prior job' + elif 'Unconverged' in chk_drop_keywords: + chk_drop_reason = 'an unconverged wavefunction in the prior job' + else: + chk_drop_reason = 'a checkpoint-related issue' + logger_info.append(f'that failed with {chk_drop_reason} by removing the checkfile.') # Check if InternalCoordinateError is in the keyword or opt=(cartesian) ess_trsh_methods, trsh_keyword, couldnt_trsh = trsh_keyword_cartesian(job_status, ess_trsh_methods, job_type, trsh_keyword,couldnt_trsh) From 582c012875453b9ca80a3e85d655589ca9e2311e Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Sun, 26 Apr 2026 17:43:35 +0300 Subject: [PATCH 3/3] trsh: detect Gaussian L601 (RdWrB1) chk/rwf collision Concurrent Gaussian jobs sharing a checkpoint or read-write file path can trigger L601 with an RdWrB1 garbage-pointer error. ARC currently has no l601.exe handler in determine_ess_status, so these failures fall through to the generic 'Unknown' bucket and bypass the chk-drop logic even though the chk is exactly what's at fault. Route L601 to the 'CheckFile' keyword so the existing chk-drop plumbing clears the chk before the rerun, and add a small fixture + regression test covering the new detection path. --- arc/job/trsh.py | 7 +++++++ arc/job/trsh_test.py | 10 ++++++++++ arc/testing/trsh/gaussian/l601.out | 10 ++++++++++ 3 files changed, 27 insertions(+) create mode 100644 arc/testing/trsh/gaussian/l601.out diff --git a/arc/job/trsh.py b/arc/job/trsh.py index 7addc66310..93c9dd9a05 100644 --- a/arc/job/trsh.py +++ b/arc/job/trsh.py @@ -127,6 +127,13 @@ def determine_ess_status(output_path: str, keywords = ['GL301'] elif 'l401.exe' in line: keywords = ['GL401'] + elif 'l601.exe' in line: + # L601 with RdWrB1 typically signals a checkpoint/read-write file + # collision (e.g., concurrent jobs sharing a chk path) — Gaussian + # cannot read the chk safely, so the next attempt must rebuild it. + keywords = ['CheckFile', 'GL601'] + error = ('Gaussian L601 read-write error, often from a chk/rwf ' + 'collision between concurrent jobs.') elif 'l502.exe' in line: # Check if Inaccurate quadrature in CalDSu inacc_quad = False diff --git a/arc/job/trsh_test.py b/arc/job/trsh_test.py index b6d6cd457f..56c0713e47 100644 --- a/arc/job/trsh_test.py +++ b/arc/job/trsh_test.py @@ -97,6 +97,16 @@ def test_determine_ess_status(self): self.assertIn("Error termination via Lnk1e", line) self.assertIn("g09/l401.exe", line) + path = os.path.join(self.base_path["gaussian"], "l601.out") + status, keywords, error, line = trsh.determine_ess_status( + output_path=path, species_label="Zr2O4H", job_type="opt" + ) + self.assertEqual(status, "errored") + self.assertEqual(keywords, ["CheckFile", "GL601"]) + self.assertIn("L601", error) + self.assertIn("Error termination via Lnk1e", line) + self.assertIn("l601.exe", line) + path = os.path.join(self.base_path["gaussian"], "l9999.out") status, keywords, error, line = trsh.determine_ess_status( output_path=path, species_label="Zr2O4H", job_type="opt" diff --git a/arc/testing/trsh/gaussian/l601.out b/arc/testing/trsh/gaussian/l601.out new file mode 100644 index 0000000000..5238fb668e --- /dev/null +++ b/arc/testing/trsh/gaussian/l601.out @@ -0,0 +1,10 @@ + Entering Gaussian System, Link 0=g16 + Initial command: + /usr/local/g16/l1.exe "/scratch/Gau-l601.inp" -scrdir="/scratch/" + ---------------- + # opt freq m062x + ---------------- + Charge = 0 Multiplicity = 1 + RdWrB1: read garbage pointers; chk/rwf collision detected. + Error termination via Lnk1e in /usr/local/g16/l601.exe at Wed Apr 22 18:23:01 2026. + Job cpu time: 0 days 0 hours 0 minutes 3.2 seconds.