From 41efc3acef40749ff50af7e227c22ab34eb6712a Mon Sep 17 00:00:00 2001 From: NinaCai Date: Wed, 13 May 2026 02:30:14 +0000 Subject: [PATCH 1/2] kill subprocesses when server process is killed --- .../hitl_agent/server_utils/cpu_server.py | 17 +++++++++------- .../server_utils/server_manager_mixin.py | 7 +++++++ MaxKernel/hitl_agent/server_utils/setup.sh | 4 +++- .../hitl_agent/server_utils/tpu_server.py | 17 +++++++++------- .../subagents/autotuning/autotune_tool.py | 20 +++++++++++++++++-- 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/MaxKernel/hitl_agent/server_utils/cpu_server.py b/MaxKernel/hitl_agent/server_utils/cpu_server.py index 4a12590..63fe40c 100644 --- a/MaxKernel/hitl_agent/server_utils/cpu_server.py +++ b/MaxKernel/hitl_agent/server_utils/cpu_server.py @@ -97,7 +97,7 @@ async def compilation_test(request: CodeRequest): request.code = code_content # Create a temporary file to store the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(request.code) temp_file_path = temp_file.name @@ -180,7 +180,7 @@ async def correctness_test(request: CodeRequest): request.code = code_content # Create a temporary file to store the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(request.code) temp_file_path = temp_file.name @@ -262,7 +262,7 @@ async def performance_test(request: CodeRequest): request.code = code_content # Create a temporary file to store the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(request.code) temp_file_path = temp_file.name @@ -340,11 +340,12 @@ async def autotune(request: AutotuneRequest): # Execute the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", 
delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(code_content) temp_file_path = temp_file.name + process = None try: process = await asyncio.create_subprocess_exec( sys.executable, @@ -383,8 +384,9 @@ async def autotune(request: AutotuneRequest): except asyncio.TimeoutError: logging.warning(f"Config {cfg} timed out") - process.kill() - await process.wait() + if process: + process.kill() + await process.wait() except Exception as e: logging.error(f"Error running config {cfg}: {e}") finally: @@ -392,6 +394,7 @@ async def autotune(request: AutotuneRequest): os.unlink(temp_file_path) except OSError: pass + await asyncio.sleep(2) if best_cfg is None: return CodeResponse( @@ -442,7 +445,7 @@ async def profile(request: CodeRequest): request.code = code_content # Create a temporary directory to store the code and any generated files - temp_dir = tempfile.mkdtemp() + temp_dir = tempfile.mkdtemp(prefix="hitl_eval_") logging.info("temp_dir: " + str(temp_dir)) # Create a temporary file to store the code within temp_dir diff --git a/MaxKernel/hitl_agent/server_utils/server_manager_mixin.py b/MaxKernel/hitl_agent/server_utils/server_manager_mixin.py index aa45b18..bad6a55 100644 --- a/MaxKernel/hitl_agent/server_utils/server_manager_mixin.py +++ b/MaxKernel/hitl_agent/server_utils/server_manager_mixin.py @@ -270,3 +270,10 @@ async def _cleanup_servers(self): process_name = f"{server_type}_server.py" self._stop_server_sync(process_name) await asyncio.sleep(0.5) # Brief pause between stops + + # Clean up dangling evaluation subprocesses + try: + logging.info("Cleaning up dangling evaluation subprocesses...") + subprocess.run(["pkill", "-f", "/tmp/hitl_eval_.*\\.py"], check=False) + except Exception as e: + logging.warning(f"Failed to clean up subprocesses: {e}") diff --git a/MaxKernel/hitl_agent/server_utils/setup.sh b/MaxKernel/hitl_agent/server_utils/setup.sh index 134b271..0bd510b 100644 --- 
a/MaxKernel/hitl_agent/server_utils/setup.sh +++ b/MaxKernel/hitl_agent/server_utils/setup.sh @@ -26,7 +26,9 @@ elif [ "$1" = "--end" ]; then pkill -f "tpu_server.py" pkill -f "cpu_server.py" pkill -f "eval_server.py" - + # Kill any dangling evaluation subprocesses + pkill -f "/tmp/hitl_eval_.*\.py" + echo "Server(s) stopped successfully" else echo "Usage: $0 --start-tpu|--start-cpu|--start-eval|--end" diff --git a/MaxKernel/hitl_agent/server_utils/tpu_server.py b/MaxKernel/hitl_agent/server_utils/tpu_server.py index 1027995..3211f0f 100644 --- a/MaxKernel/hitl_agent/server_utils/tpu_server.py +++ b/MaxKernel/hitl_agent/server_utils/tpu_server.py @@ -86,7 +86,7 @@ async def compilation_test(request: CodeRequest): request.code = code_content # Create a temporary file to store the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(request.code) temp_file_path = temp_file.name @@ -168,7 +168,7 @@ async def correctness_test(request: CodeRequest): request.code = code_content # Create a temporary file to store the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(request.code) temp_file_path = temp_file.name @@ -249,7 +249,7 @@ async def performance_test(request: CodeRequest): request.code = code_content # Create a temporary file to store the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(request.code) temp_file_path = temp_file.name @@ -327,11 +327,12 @@ async def autotune(request: AutotuneRequest): # Execute the code with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False + mode="w", suffix=".py", prefix="hitl_eval_", delete=False ) as temp_file: temp_file.write(code_content) temp_file_path = 
temp_file.name + process = None try: process = await asyncio.create_subprocess_exec( sys.executable, @@ -383,8 +384,9 @@ async def autotune(request: AutotuneRequest): except asyncio.TimeoutError: logging.warning(f"Config {cfg} timed out") - process.kill() - await process.wait() + if process: + process.kill() + await process.wait() all_results.append({"cfg": cfg, "status": "timeout"}) except Exception as e: logging.error(f"Error running config {cfg}: {e}") @@ -397,6 +399,7 @@ async def autotune(request: AutotuneRequest): os.unlink(temp_file_path) except OSError: pass + await asyncio.sleep(2) if best_cfg is None: return CodeResponse( @@ -448,7 +451,7 @@ async def profile(request: CodeRequest): request.code = code_content # Create a temporary directory to store the code and any generated files - temp_dir = tempfile.mkdtemp() + temp_dir = tempfile.mkdtemp(prefix="hitl_eval_") logging.info("temp_dir: " + str(temp_dir)) # Create a temporary file to store the code within temp_dir diff --git a/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py b/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py index 70601bd..d2e0af4 100644 --- a/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py +++ b/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py @@ -2,11 +2,12 @@ import json import logging +import subprocess from typing import Any import requests -from hitl_agent.constants import EVAL_SERVER_PORT +from hitl_agent.constants import EVAL_SERVER_PORT, AUTOTUNE_TIMEOUT def autotune_kernel( @@ -48,7 +49,7 @@ def autotune_kernel( "timeout": 300, "backend_type": backend, }, - timeout=3600, # 1 hour timeout for the whole autotune request + timeout=AUTOTUNE_TIMEOUT, # timeout for the whole autotune request ) if response.status_code == 200: @@ -104,5 +105,20 @@ def autotune_kernel( f"Could not connect to server at {url}. Make sure it is running." ), } + except requests.exceptions.Timeout: + logging.warning( + "Autotune timed out on client side. 
Cleaning up dangling subprocesses on TPU server..." + ) + try: + subprocess.run(["pkill", "-9", "-f", "tpu_server.py"], check=False) + subprocess.run(["pkill", "-f", "/tmp/hitl_eval_.*\\.py"], check=False) + logging.info("Killed dangling evaluations and tpu_server.py") + except Exception as cleanup_error: + logging.error(f"Failed to run cleanup commands: {cleanup_error}") + + return { + "status": "error", + "message": f"Autotune request timed out after {AUTOTUNE_TIMEOUT} seconds. Dangling processes were killed.", + } except Exception as e: return {"status": "error", "message": str(e)} From 464b758e4a8e3caf9a3851071a774a0454de00be Mon Sep 17 00:00:00 2001 From: NinaCai Date: Wed, 13 May 2026 15:56:20 +0000 Subject: [PATCH 2/2] stop both tpu and cpu servers --- MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py b/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py index d2e0af4..0da7522 100644 --- a/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py +++ b/MaxKernel/hitl_agent/subagents/autotuning/autotune_tool.py @@ -107,10 +107,11 @@ def autotune_kernel( } except requests.exceptions.Timeout: logging.warning( - "Autotune timed out on client side. Cleaning up dangling subprocesses on TPU server..." + "Autotune timed out on client side. Cleaning up dangling subprocesses on server..." ) try: subprocess.run(["pkill", "-9", "-f", "tpu_server.py"], check=False) + subprocess.run(["pkill", "-9", "-f", "cpu_server.py"], check=False) subprocess.run(["pkill", "-f", "/tmp/hitl_eval_.*\\.py"], check=False) - logging.info("Killed dangling evaluations and tpu_server.py") + logging.info("Killed dangling evaluations, tpu_server.py and cpu_server.py") except Exception as cleanup_error: