Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions arc/job/trsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ def determine_ess_status(output_path: str,
keywords = ['GL301']
elif 'l401.exe' in line:
keywords = ['GL401']
elif 'l601.exe' in line:
# L601 with RdWrB1 typically signals a checkpoint/read-write file
# collision (e.g., concurrent jobs sharing a chk path) — Gaussian
# cannot read the chk safely, so the next attempt must rebuild it.
keywords = ['CheckFile', 'GL601']
error = ('Gaussian L601 read-write error, often from a chk/rwf '
'collision between concurrent jobs.')
elif 'l502.exe' in line:
# Check if Inaccurate quadrature in CalDSu
inacc_quad = False
Expand Down Expand Up @@ -912,7 +919,18 @@ def trsh_ess_job(label: str,
# - Changing Computational Parameters
remove_checkfile, ess_trsh_methods, couldnt_trsh = trsh_keyword_checkfile(job_status, ess_trsh_methods, couldnt_trsh)
if remove_checkfile:
logger_info.append('that failed with "Basis set data is not on the checkpoint file" by removing the checkfile.')
chk_drop_keywords = job_status.get('keywords', []) or []
if 'CheckFile' in chk_drop_keywords:
chk_drop_reason = '"Basis set data is not on the checkpoint file"'
elif 'BasisSet' in chk_drop_keywords:
chk_drop_reason = 'a failed basis-set projection from the prior checkpoint'
elif 'SCF' in chk_drop_keywords:
chk_drop_reason = 'an unconverged SCF in the prior job'
elif 'Unconverged' in chk_drop_keywords:
chk_drop_reason = 'an unconverged wavefunction in the prior job'
else:
chk_drop_reason = 'a checkpoint-related issue'
logger_info.append(f'that failed with {chk_drop_reason} by removing the checkfile.')

# Check if InternalCoordinateError is in the keyword or opt=(cartesian)
ess_trsh_methods, trsh_keyword, couldnt_trsh = trsh_keyword_cartesian(job_status, ess_trsh_methods, job_type, trsh_keyword,couldnt_trsh)
Expand Down Expand Up @@ -1754,9 +1772,17 @@ def scan_quality_check(label: str,

def trsh_keyword_checkfile(job_status, ess_trsh_methods, couldnt_trsh) -> tuple[bool, list, bool]:
"""
Check if the job requires removal of checkfile
Check if the job requires removal of checkfile.

Drops the checkfile when the prior job either could not read it
('CheckFile' from L301/L401/L601), produced a non-converged wavefunction
('SCF' from L502, 'Unconverged' from L508), or reported a failed
basis projection ('BasisSet' from L401). Reusing MOs from a
Comment on lines +1777 to +1780
non-converged or basis-incompatible chk re-seeds the same failure.
"""
if 'CheckFile' in job_status.get('keywords', '') and 'checkfile=None' not in ess_trsh_methods:
keywords = job_status.get('keywords', []) or []
bad_wavefunction = any(k in keywords for k in ('CheckFile', 'SCF', 'Unconverged', 'BasisSet'))
if bad_wavefunction and 'checkfile=None' not in ess_trsh_methods:
ess_trsh_methods.append('checkfile=None')
couldnt_trsh = False
return True, ess_trsh_methods, couldnt_trsh
Expand Down
29 changes: 29 additions & 0 deletions arc/job/trsh_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,16 @@ def test_determine_ess_status(self):
self.assertIn("Error termination via Lnk1e", line)
self.assertIn("g09/l401.exe", line)

path = os.path.join(self.base_path["gaussian"], "l601.out")
status, keywords, error, line = trsh.determine_ess_status(
output_path=path, species_label="Zr2O4H", job_type="opt"
)
self.assertEqual(status, "errored")
self.assertEqual(keywords, ["CheckFile", "GL601"])
self.assertIn("L601", error)
self.assertIn("Error termination via Lnk1e", line)
self.assertIn("l601.exe", line)

path = os.path.join(self.base_path["gaussian"], "l9999.out")
status, keywords, error, line = trsh.determine_ess_status(
output_path=path, species_label="Zr2O4H", job_type="opt"
Expand Down Expand Up @@ -796,6 +806,25 @@ def test_trsh_ess_job(self):
num_heavy_atoms, cpu_cores, ess_trsh_methods,
is_h=True, is_monoatomic=True)

def test_trsh_keyword_checkfile_drops_on_bad_wavefunction(self):
    """
    Test which job status keywords make trsh_keyword_checkfile() request removal of the checkfile.

    Each of 'CheckFile', 'SCF', 'Unconverged', and 'BasisSet' is expected to trigger removal:
    the returned flag is True, 'checkfile=None' is recorded in the troubleshooting methods,
    and the returned ``couldnt_trsh`` value is False. Keywords unrelated to these failures
    are expected to leave the checkfile in place.
    """
    dropping_keywords = ('CheckFile', 'SCF', 'Unconverged', 'BasisSet')
    keeping_keywords = ('MaxOptCycles', 'InternalCoordinateError', 'DiskSpace', 'OptOrientation')

    # Every keyword is checked against a fresh, empty list of troubleshooting methods.
    for kw in dropping_keywords:
        job_status = {'keywords': [kw]}
        remove, applied_methods, couldnt = trsh.trsh_keyword_checkfile(
            job_status, [], couldnt_trsh=True,
        )
        self.assertTrue(remove, f'{kw} should drop the checkfile')
        self.assertIn('checkfile=None', applied_methods)
        self.assertFalse(couldnt)

    # Failures unrelated to the listed keywords must not request checkfile removal.
    for kw in keeping_keywords:
        job_status = {'keywords': [kw]}
        remove, applied_methods, _ = trsh.trsh_keyword_checkfile(
            job_status, ess_trsh_methods=[], couldnt_trsh=True,
        )
        self.assertFalse(remove, f'{kw} must not drop the checkfile')
        self.assertNotIn('checkfile=None', applied_methods)

def test_determine_job_log_memory_issues(self):
"""Test the determine_job_log_memory_issues() function."""
job_log_path_1 = os.path.join(ARC_TESTING_PATH, 'job_log', 'no_issues.log')
Expand Down
10 changes: 10 additions & 0 deletions arc/testing/trsh/gaussian/l601.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Entering Gaussian System, Link 0=g16
Initial command:
/usr/local/g16/l1.exe "/scratch/Gau-l601.inp" -scrdir="/scratch/"
----------------
# opt freq m062x
----------------
Charge = 0 Multiplicity = 1
RdWrB1: read garbage pointers; chk/rwf collision detected.
Error termination via Lnk1e in /usr/local/g16/l601.exe at Wed Apr 22 18:23:01 2026.
Job cpu time: 0 days 0 hours 0 minutes 3.2 seconds.
Loading