From f909d86d416abd4b1af0a4ecbaa6533adc3371f6 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 12 Feb 2026 07:11:25 +0000 Subject: [PATCH] Don't terminate unreachable SSH instances Fixes: https://github.com/dstackai/dstack/issues/2531 --- .../background/tasks/process_instances.py | 4 +-- .../tasks/test_process_instances.py | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 184287b31..da47cf16e 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -778,7 +778,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non ) return - if instance.termination_deadline is None: + if not is_ssh_instance(instance) and instance.termination_deadline is None: instance.termination_deadline = get_current_datetime() + TERMINATION_DEADLINE_OFFSET if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None: @@ -792,7 +792,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non switch_instance_status(session, instance, InstanceStatus.TERMINATING) elif instance.status.is_available(): deadline = instance.termination_deadline - if get_current_datetime() > deadline: + if deadline is not None and get_current_datetime() > deadline: instance.termination_reason = InstanceTerminationReason.UNREACHABLE switch_instance_status(session, instance, InstanceStatus.TERMINATING) diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py index 8691f3e7e..8d94ee059 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/tasks/test_process_instances.py @@ -198,6 +198,7 @@ async def test_check_shim_start_termination_deadline(self, test_db, session: Asy session=session, project=project, status=InstanceStatus.IDLE, + unreachable=False, ) health_status = "SSH connection fail" with patch( @@ -210,11 +211,39 @@ async def test_check_shim_start_termination_deadline(self, test_db, session: Asy assert instance is not None assert instance.status == InstanceStatus.IDLE + assert instance.unreachable assert instance.termination_deadline is not None assert instance.termination_deadline.replace( tzinfo=dt.timezone.utc ) > get_current_datetime() + dt.timedelta(minutes=19) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_check_shim_does_not_start_termination_deadline_with_ssh_instance( + self, test_db, session: AsyncSession + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + remote_connection_info=get_remote_connection_info(), + ) + health_status = "SSH connection fail" + with patch( + "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + ) as healthcheck: + healthcheck.return_value = InstanceCheck(reachable=False, message=health_status) + await process_instances() + + await session.refresh(instance) + + assert instance is not None + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable + assert instance.termination_deadline is None + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_check_shim_stop_termination_deadline(self, test_db, session: AsyncSession):