diff --git a/arc/job/trsh.py b/arc/job/trsh.py index a1ec9a8eab..93c9dd9a05 100644 --- a/arc/job/trsh.py +++ b/arc/job/trsh.py @@ -127,6 +127,13 @@ def determine_ess_status(output_path: str, keywords = ['GL301'] elif 'l401.exe' in line: keywords = ['GL401'] + elif 'l601.exe' in line: + # L601 with RdWrB1 typically signals a checkpoint/read-write file + # collision (e.g., concurrent jobs sharing a chk path) — Gaussian + # cannot read the chk safely, so the next attempt must rebuild it. + keywords = ['CheckFile', 'GL601'] + error = ('Gaussian L601 read-write error, often from a chk/rwf ' + 'collision between concurrent jobs.') elif 'l502.exe' in line: # Check if Inaccurate quadrature in CalDSu inacc_quad = False @@ -912,7 +919,20 @@ def trsh_ess_job(label: str, # - Changing Computational Parameters remove_checkfile, ess_trsh_methods, couldnt_trsh = trsh_keyword_checkfile(job_status, ess_trsh_methods, couldnt_trsh) if remove_checkfile: - logger_info.append('that failed with "Basis set data is not on the checkpoint file" by removing the checkfile.') + chk_drop_keywords = job_status.get('keywords', []) or [] + if 'GL601' in chk_drop_keywords: # L601 also carries 'CheckFile'; test it first so the log names the real cause + chk_drop_reason = 'a Gaussian L601 read-write error (possible chk/rwf collision)' + elif 'CheckFile' in chk_drop_keywords: + chk_drop_reason = '"Basis set data is not on the checkpoint file"' + elif 'BasisSet' in chk_drop_keywords: + chk_drop_reason = 'a failed basis-set projection from the prior checkpoint' + elif 'SCF' in chk_drop_keywords: + chk_drop_reason = 'an unconverged SCF in the prior job' + elif 'Unconverged' in chk_drop_keywords: + chk_drop_reason = 'an unconverged wavefunction in the prior job' + else: + chk_drop_reason = 'a checkpoint-related issue' + logger_info.append(f'that failed with {chk_drop_reason} by removing the checkfile.') # Check if InternalCoordinateError is in the keyword or opt=(cartesian) ess_trsh_methods, trsh_keyword, couldnt_trsh = trsh_keyword_cartesian(job_status, ess_trsh_methods, job_type, trsh_keyword,couldnt_trsh) @@ -1754,9 +1774,17 @@ def scan_quality_check(label: str, def trsh_keyword_checkfile(job_status, 
ess_trsh_methods, couldnt_trsh) -> tuple[bool, list, bool]: """ - Check if the job requires removal of checkfile + Check if the job requires removal of checkfile. + + Drops the checkfile when the prior job either could not read it + ('CheckFile' from L301/L401/L601), produced a non-converged wavefunction + ('SCF' from L502, 'Unconverged' from L508), or reported a failed + basis projection ('BasisSet' from L401). Reusing MOs from a + non-converged or basis-incompatible chk re-seeds the same failure. """ - if 'CheckFile' in job_status.get('keywords', '') and 'checkfile=None' not in ess_trsh_methods: + keywords = job_status.get('keywords', []) or [] + bad_wavefunction = any(k in keywords for k in ('CheckFile', 'SCF', 'Unconverged', 'BasisSet')) + if bad_wavefunction and 'checkfile=None' not in ess_trsh_methods: ess_trsh_methods.append('checkfile=None') couldnt_trsh = False return True, ess_trsh_methods, couldnt_trsh diff --git a/arc/job/trsh_test.py b/arc/job/trsh_test.py index d974874e9c..56c0713e47 100644 --- a/arc/job/trsh_test.py +++ b/arc/job/trsh_test.py @@ -97,6 +97,16 @@ def test_determine_ess_status(self): self.assertIn("Error termination via Lnk1e", line) self.assertIn("g09/l401.exe", line) + path = os.path.join(self.base_path["gaussian"], "l601.out") + status, keywords, error, line = trsh.determine_ess_status( + output_path=path, species_label="Zr2O4H", job_type="opt" + ) + self.assertEqual(status, "errored") + self.assertEqual(keywords, ["CheckFile", "GL601"]) + self.assertIn("L601", error) + self.assertIn("Error termination via Lnk1e", line) + self.assertIn("l601.exe", line) + path = os.path.join(self.base_path["gaussian"], "l9999.out") status, keywords, error, line = trsh.determine_ess_status( output_path=path, species_label="Zr2O4H", job_type="opt" @@ -796,6 +806,25 @@ def test_trsh_ess_job(self): num_heavy_atoms, cpu_cores, ess_trsh_methods, is_h=True, is_monoatomic=True) + def test_trsh_keyword_checkfile_drops_on_bad_wavefunction(self): + 
"""CheckFile/SCF/Unconverged/BasisSet failures must drop the checkfile; unrelated failures must keep it.""" + for kw in ('CheckFile', 'SCF', 'Unconverged', 'BasisSet'): + ess_trsh_methods = [] + remove, ess_trsh_methods, couldnt = trsh.trsh_keyword_checkfile( + {'keywords': [kw]}, ess_trsh_methods, couldnt_trsh=True, + ) + self.assertTrue(remove, f'{kw} should drop the checkfile') + self.assertIn('checkfile=None', ess_trsh_methods) + self.assertFalse(couldnt) + + # Failures unrelated to wavefunction quality must keep the checkfile. + for kw in ('MaxOptCycles', 'InternalCoordinateError', 'DiskSpace', 'OptOrientation'): + remove, methods, _ = trsh.trsh_keyword_checkfile( + {'keywords': [kw]}, ess_trsh_methods=[], couldnt_trsh=True, + ) + self.assertFalse(remove, f'{kw} must not drop the checkfile') + self.assertNotIn('checkfile=None', methods) + def test_determine_job_log_memory_issues(self): """Test the determine_job_log_memory_issues() function.""" job_log_path_1 = os.path.join(ARC_TESTING_PATH, 'job_log', 'no_issues.log') diff --git a/arc/testing/trsh/gaussian/l601.out b/arc/testing/trsh/gaussian/l601.out new file mode 100644 index 0000000000..5238fb668e --- /dev/null +++ b/arc/testing/trsh/gaussian/l601.out @@ -0,0 +1,10 @@ + Entering Gaussian System, Link 0=g16 + Initial command: + /usr/local/g16/l1.exe "/scratch/Gau-l601.inp" -scrdir="/scratch/" + ---------------- + # opt freq m062x + ---------------- + Charge = 0 Multiplicity = 1 + RdWrB1: read garbage pointers; chk/rwf collision detected. + Error termination via Lnk1e in /usr/local/g16/l601.exe at Wed Apr 22 18:23:01 2026. + Job cpu time: 0 days 0 hours 0 minutes 3.2 seconds.