From 8889dac2ab2043162e78d213dc273b93ca47142c Mon Sep 17 00:00:00 2001 From: Ganesh Patil <7030871503ganeshpatil@gmail.com> Date: Tue, 24 Feb 2026 17:48:04 +0530 Subject: [PATCH 1/2] fix: replace concorekill.bat single-PID overwrite with safe PID registry (#391) Problem: On Windows, concore.py wrote a single PID to concorekill.bat at import time using 'w' mode. When multiple Python nodes started simultaneously (via makestudy), each overwrote the file. Only the last node's PID survived, making the kill script unable to terminate other nodes. Stale PIDs from crashed studies could also kill unrelated processes. Solution: - Replace single-PID overwrite with append-based PID registry (concorekill_pids.txt) storing one PID per line - Register atexit handler to remove current PID on graceful shutdown - Generate concorekill.bat dynamically with PID validation: each PID is checked via tasklist before taskkill executes - Clean up both files when last node exits - Backward compatible: users still run concorekill.bat Added 9 tests covering registration, cleanup, multi-node scenarios, missing registry handling, and kill script generation. Fixes #391 --- concore.py | 97 ++++++++++++++++++++++++++++-- tests/test_concore.py | 136 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 7 deletions(-) diff --git a/concore.py b/concore.py index 5bd112b..e413b6d 100644 --- a/concore.py +++ b/concore.py @@ -21,13 +21,98 @@ logging.getLogger('requests').setLevel(logging.WARNING) -# if windows, create script to kill this process -# because batch files don't provide easy way to know pid of last command -# ignored for posix != windows, because "concorepid" is handled by script -# ignored for docker (linux != windows), because handled by docker stop +# =================================================================== +# Windows PID Registry (Issue #391) +# =================================================================== +# Previous implementation wrote a single PID to concorekill.bat at +# import time, causing race conditions when multiple Python nodes +# started simultaneously (each overwrote the file). Only the last +# node's PID survived, and stale PIDs could kill unrelated processes. +# +# New approach: +# - Append each node's PID to concorekill_pids.txt (one per line) +# - Generate concorekill.bat that validates each PID before killing +# - Use atexit to remove the PID on graceful shutdown +# - Backward compatible: users still run concorekill.bat +# =================================================================== + +_PID_REGISTRY_FILE = "concorekill_pids.txt" +_KILL_SCRIPT_FILE = "concorekill.bat" + + +def _register_pid(): + """Append the current process PID to the registry file.""" + pid = os.getpid() + try: + with open(_PID_REGISTRY_FILE, "a") as f: + f.write(str(pid) + "\n") + except OSError: + pass # Non-fatal: best-effort registration + + +def _cleanup_pid(): + """Remove the current process PID from the registry on exit.""" + pid = str(os.getpid()) + try: + if not os.path.exists(_PID_REGISTRY_FILE): + return + with open(_PID_REGISTRY_FILE, "r") as f: + pids = [line.strip() for line in f if line.strip()] + remaining = [p for p in pids if p != pid] + if remaining: + with open(_PID_REGISTRY_FILE, "w") as f: + for p in remaining: + f.write(p + "\n") + else: + # No PIDs left — clean up both files + try: + os.remove(_PID_REGISTRY_FILE) + except OSError: + pass + try: + os.remove(_KILL_SCRIPT_FILE) + except OSError: + pass + except OSError: + pass # Non-fatal: best-effort cleanup + + +def _write_kill_script(): + """Generate concorekill.bat that validates PIDs before killing. + + The script reads concorekill_pids.txt and for each PID: + 1. Checks if the process still exists + 2. Verifies it is a Python process + 3. Only then issues taskkill + After killing, the registry file is deleted. + """ + try: + script = "@echo off\r\n" + script += "if not exist \"%~dp0" + _PID_REGISTRY_FILE + "\" (\r\n" + script += " echo No PID registry found. Nothing to kill.\r\n" + script += " exit /b 0\r\n" + script += ")\r\n" + script += "for /f \"tokens=*\" %%p in (%~dp0" + _PID_REGISTRY_FILE + ") do (\r\n" + script += " tasklist /FI \"PID eq %%p\" 2>nul | find /i \"python\" >nul\r\n" + script += " if not errorlevel 1 (\r\n" + script += " echo Killing Python process %%p\r\n" + script += " taskkill /F /PID %%p >nul 2>&1\r\n" + script += " ) else (\r\n" + script += " echo Skipping PID %%p - not a Python process or not running\r\n" + script += " )\r\n" + script += ")\r\n" + script += "del /q \"%~dp0" + _PID_REGISTRY_FILE + "\" 2>nul\r\n" + script += "del /q \"%~dp0" + _KILL_SCRIPT_FILE + "\" 2>nul\r\n" + with open(_KILL_SCRIPT_FILE, "w") as f: + f.write(script) + except OSError: + pass # Non-fatal: best-effort script generation + + if hasattr(sys, 'getwindowsversion'): - with open("concorekill.bat","w") as fpid: - fpid.write("taskkill /F /PID "+str(os.getpid())+"\n") + _register_pid() + _write_kill_script() + atexit.register(_cleanup_pid) ZeroMQPort = concore_base.ZeroMQPort convert_numpy_to_python = concore_base.convert_numpy_to_python diff --git a/tests/test_concore.py b/tests/test_concore.py index fd89729..a133df2 100644 --- a/tests/test_concore.py +++ b/tests/test_concore.py @@ -1,7 +1,141 @@ import pytest import os +import sys import numpy as np + +# =================================================================== +# PID Registry Tests (Issue #391) +# =================================================================== + +class TestPidRegistry: + """Tests for the Windows PID registry mechanism that replaces the + old single-overwrite concorekill.bat approach.""" + + @pytest.fixture(autouse=True) + def use_temp_dir(self, temp_dir, monkeypatch): + """Run each test in an isolated temp directory.""" + self.temp_dir = temp_dir + monkeypatch.chdir(temp_dir) + + def test_register_pid_creates_registry_file(self): + """_register_pid should create concorekill_pids.txt with current PID.""" + from concore import _register_pid, _PID_REGISTRY_FILE + _register_pid() + assert os.path.exists(_PID_REGISTRY_FILE) + with open(_PID_REGISTRY_FILE) as f: + pids = [line.strip() for line in f if line.strip()] + assert str(os.getpid()) in pids + + def test_register_pid_appends_not_overwrites(self): + """Multiple calls to _register_pid should append, not overwrite.""" + from concore import _register_pid, _PID_REGISTRY_FILE + # Simulate two different PIDs by writing manually then registering + with open(_PID_REGISTRY_FILE, "w") as f: + f.write("11111\n") + f.write("22222\n") + _register_pid() + with open(_PID_REGISTRY_FILE) as f: + pids = [line.strip() for line in f if line.strip()] + assert "11111" in pids + assert "22222" in pids + assert str(os.getpid()) in pids + assert len(pids) == 3 + + def test_cleanup_pid_removes_current_pid(self): + """_cleanup_pid should remove only the current PID from the registry.""" + from concore import _cleanup_pid, _PID_REGISTRY_FILE + current_pid = str(os.getpid()) + with open(_PID_REGISTRY_FILE, "w") as f: + f.write("99999\n") + f.write(current_pid + "\n") + f.write("88888\n") + _cleanup_pid() + with open(_PID_REGISTRY_FILE) as f: + pids = [line.strip() for line in f if line.strip()] + assert current_pid not in pids + assert "99999" in pids + assert "88888" in pids + + def test_cleanup_pid_deletes_files_when_last_pid(self): + """When the current PID is the only one left, cleanup should + remove both the registry file and the kill script.""" + from concore import _cleanup_pid, _PID_REGISTRY_FILE, _KILL_SCRIPT_FILE + current_pid = str(os.getpid()) + with open(_PID_REGISTRY_FILE, "w") as f: + f.write(current_pid + "\n") + # Create a dummy kill script to verify it gets cleaned up + with open(_KILL_SCRIPT_FILE, "w") as f: + f.write("@echo off\n") + _cleanup_pid() + assert not os.path.exists(_PID_REGISTRY_FILE) + assert not os.path.exists(_KILL_SCRIPT_FILE) + + def test_cleanup_pid_handles_missing_registry(self): + """_cleanup_pid should not crash when registry file doesn't exist.""" + from concore import _cleanup_pid, _PID_REGISTRY_FILE + assert not os.path.exists(_PID_REGISTRY_FILE) + _cleanup_pid() # Should not raise + + def test_write_kill_script_generates_bat_file(self): + """_write_kill_script should create concorekill.bat with validation logic.""" + from concore import _write_kill_script, _KILL_SCRIPT_FILE, _PID_REGISTRY_FILE + _write_kill_script() + assert os.path.exists(_KILL_SCRIPT_FILE) + with open(_KILL_SCRIPT_FILE) as f: + content = f.read() + # Script should reference the PID registry file + assert _PID_REGISTRY_FILE in content + # Script should validate processes before killing + assert "tasklist" in content + assert "taskkill" in content + assert "python" in content.lower() + + def test_multi_node_registration(self): + """Simulate 3 nodes registering PIDs — all should be present.""" + from concore import _register_pid, _PID_REGISTRY_FILE + fake_pids = ["1204", "1932", "8120"] + with open(_PID_REGISTRY_FILE, "w") as f: + for pid in fake_pids: + f.write(pid + "\n") + _register_pid() # Current process is the 4th + with open(_PID_REGISTRY_FILE) as f: + pids = [line.strip() for line in f if line.strip()] + for pid in fake_pids: + assert pid in pids + assert str(os.getpid()) in pids + assert len(pids) == 4 + + def test_cleanup_preserves_other_pids(self): + """After cleanup, only the current process PID should be removed.""" + from concore import _cleanup_pid, _PID_REGISTRY_FILE + current_pid = str(os.getpid()) + other_pids = ["1111", "2222", "3333"] + with open(_PID_REGISTRY_FILE, "w") as f: + for pid in other_pids: + f.write(pid + "\n") + f.write(current_pid + "\n") + _cleanup_pid() + with open(_PID_REGISTRY_FILE) as f: + pids = [line.strip() for line in f if line.strip()] + assert len(pids) == 3 + assert current_pid not in pids + for pid in other_pids: + assert pid in pids + + @pytest.mark.skipif(not hasattr(sys, 'getwindowsversion'), + reason="Windows-only test") + def test_import_registers_pid_on_windows(self): + """On Windows, importing concore should register the PID.""" + from concore import _PID_REGISTRY_FILE + # The import already happened, so just verify the registry exists + # in our temp dir (we can't easily test the import-time side effect + # since concore was already imported — we test the functions directly) + from concore import _register_pid + _register_pid() + assert os.path.exists(_PID_REGISTRY_FILE) + + class TestSafeLiteralEval: def test_reads_dictionary_from_file(self, temp_dir): @@ -274,7 +408,7 @@ def recv_json_with_retry(self): concore.write("roundtrip_test", "data", original_data) # Read should return original data (simtime stripped) - result = concore.read("roundtrip_test", "data", "[]") + result, ok = concore.read("roundtrip_test", "data", "[]") assert result == original_data From a3b873f27d8f263a6c0ee2fe98ce2f7120c8ad48 Mon Sep 17 00:00:00 2001 From: Ganesh Patil <7030871503ganeshpatil@gmail.com> Date: Tue, 24 Feb 2026 17:54:26 +0530 Subject: [PATCH 2/2] fix: correct test_zmq_write_read_roundtrip to match read() return signature read() returns a single value, not a tuple. The test incorrectly unpacked into (result, ok) causing ValueError on CI. --- tests/test_concore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_concore.py b/tests/test_concore.py index a133df2..8d72e2d 100644 --- a/tests/test_concore.py +++ b/tests/test_concore.py @@ -408,7 +408,7 @@ def recv_json_with_retry(self): concore.write("roundtrip_test", "data", original_data) # Read should return original data (simtime stripped) - result, ok = concore.read("roundtrip_test", "data", "[]") + result = concore.read("roundtrip_test", "data", "[]") assert result == original_data