From e7851c64bc4e8d9b265aaba6e72b686bbeff339d Mon Sep 17 00:00:00 2001 From: "haoyu.ding" Date: Fri, 15 May 2026 17:16:27 +0800 Subject: [PATCH] [kvm]: extend reconnect echo timeout after libvirtd restart Resolves: ZSTAC-84691 Change-Id: I6668736975707575756d7665647867686e6b6b76 --- .../src/main/java/org/zstack/kvm/KVMHost.java | 19 +++- .../java/org/zstack/kvm/KVMHostUtils.java | 30 +++++++ .../host/LibvirtRestartEchoTimeoutCase.groovy | 86 +++++++++++++++++++ .../org/zstack/test/kvm/KVMHostUtilsTest.java | 64 ++++++++++++++ 4 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 test/src/test/groovy/org/zstack/test/integration/kvm/host/LibvirtRestartEchoTimeoutCase.groovy diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java index 48370a2a144..464ee1536a0 100755 --- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java @@ -5316,6 +5316,15 @@ private boolean noStorageAccessible(){ return inaccessiblePsCount == attachedPsCount && attachedPsCount > 0; } + /** Returns the kvmagent echo timeout in milliseconds for a reconnect that follows a libvirtd restart: counts this host's VMs via KVMHostUtils, derives the timeout from that count, and logs the result at info level. */ @Transactional(readOnly = true) + private long getLibvirtRestartedEchoTimeout() { + long vmCount = KVMHostUtils.countVmsForLibvirtRestartEchoTimeout(self.getUuid()); + long timeout = KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(vmCount); + logger.info(String.format("extend kvmagent echo timeout after libvirtd restart on host[uuid:%s, vmCount:%s] to %sms", + self.getUuid(), vmCount, timeout)); + return timeout; + } + private void updateHostOsInformation(String distro, String release, String version) { final KVMHostVO kvmHostVO = getSelf(); kvmHostVO.setOsDistribution(distro); @@ -5586,6 +5595,7 @@ public void connectHook(final ConnectHostInfo info, final Completion complete) { chain.allowWatch(); chain.then(new ShareFlow() { boolean deployed = false; + boolean libvirtRestarted = false; @Override public void setup() { flow(new NoRollbackFlow() { @@ -5949,6 +5959,8 @@ 
public void success(Boolean run) { deployed = run; } if (deployed) { + /* Record whether this deploy ran with init or restartLibvirtd set to "true"; the echo flow reads the flag to pick its timeout. */ libvirtRestarted = KVMHostUtils.shouldRestartLibvirtdDuringDeploy( + deployArguments.getInit(), deployArguments.getRestartLibvirtd()); // update host agent version when open grayScaleUpgrade upgradeChecker.updateAgentVersion(self.getUuid(), AnsibleConstant.KVM_AGENT_NAME, dbf.getDbVersion(), dbf.getDbVersion()); } @@ -6023,6 +6035,9 @@ public boolean skip(Map data) { @Override public void run(final FlowTrigger trigger, Map data) { + /* Use the VM-count-based timeout only when the flag is set; otherwise fall back to the configured REST_FACADE_ECHO_TIMEOUT converted from seconds to milliseconds. */ final long echoTimeout = libvirtRestarted + ? getLibvirtRestartedEchoTimeout() + : TimeUnit.SECONDS.toMillis(CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT); restf.echo(echoPath, new Completion(trigger) { @Override public void success() { @@ -6054,7 +6069,7 @@ public void success() { public void fail(ErrorCode errorCode) { trigger.fail(errorCode); } - }); + }, TimeUnit.SECONDS.toMillis(1), echoTimeout); } @Override @@ -6066,7 +6081,7 @@ public void fail(ErrorCode errorCode) { trigger.fail(errorCode); } } - }); + }, TimeUnit.SECONDS.toMillis(1), echoTimeout); } }); diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java index 53db539044b..82f040f15bb 100644 --- a/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/KVMHostUtils.java @@ -9,6 +9,9 @@ import org.zstack.header.tag.SystemTagVO; import org.zstack.header.tag.SystemTagVO_; import org.zstack.header.tag.TagType; +import org.zstack.header.vm.VmInstanceState; +import org.zstack.header.vm.VmInstanceVO; +import org.zstack.header.vm.VmInstanceVO_; import org.zstack.utils.CollectionUtils; import org.zstack.utils.TagUtils; import org.zstack.utils.logging.CLogger; @@ -22,6 +25,7 @@ import java.util.List; import java.util.Locale; import java.util.Set; +import java.util.concurrent.TimeUnit; /** * Created by GuoYi on 4/16/20. 
@@ -33,6 +37,9 @@ public class KVMHostUtils { // to keep check-flow and deploy-flow IP lists identical. public static final Set EXCLUDED_INTERNAL_IPS = Collections.unmodifiableSet( new LinkedHashSet<>(Collections.singletonList("169.254.64.1"))); + /* VM count at or below which the configured echo timeout is used unchanged. */ public static final long LIBVIRT_RESTART_ECHO_TIMEOUT_VM_THRESHOLD = 100; + /* Seconds added to the echo timeout for each VM above the threshold. */ public static final long LIBVIRT_RESTART_ECHO_TIMEOUT_PER_VM_SECONDS = 1; + /* Upper bound, in seconds, that the added seconds may raise the timeout to; a configured timeout above this bound is kept as is. */ public static final long LIBVIRT_RESTART_ECHO_TIMEOUT_MAX_SECONDS = 180; // Collect host IPv4 addresses; mirrors host_plugin.fact() filters and // applies EXCLUDED_INTERNAL_IPS so check and deploy share one source. @@ -147,6 +154,29 @@ public static boolean shouldForceTlsRedeploy(boolean needDeployTlsCert, return allowRestartLibvirtd || isNewAdded; } + /** Returns true when either argument equals "true" ignoring letter case; null arguments count as false. */ public static boolean shouldRestartLibvirtdDuringDeploy(String init, String restartLibvirtd) { + return "true".equalsIgnoreCase(init) || "true".equalsIgnoreCase(restartLibvirtd); + } + + /** Counts VmInstanceVO rows whose hostUuid equals the given host and whose state is not Stopped. */ public static long countVmsForLibvirtRestartEchoTimeout(String hostUuid) { + return Q.New(VmInstanceVO.class) + .eq(VmInstanceVO_.hostUuid, hostUuid) + .notEq(VmInstanceVO_.state, VmInstanceState.Stopped) + .count(); + } + + /** Returns the echo timeout in milliseconds for a host with vmCount VMs. At or below the VM threshold the configured REST_FACADE_ECHO_TIMEOUT is returned; above it, PER_VM_SECONDS is added per extra VM, with the total capped at MAX_SECONDS. A configured timeout larger than the cap is never reduced. */ public static long calculateLibvirtRestartEchoTimeoutMillis(long vmCount) { + long defaultTimeoutSeconds = CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT; + if (vmCount <= LIBVIRT_RESTART_ECHO_TIMEOUT_VM_THRESHOLD) { + return TimeUnit.SECONDS.toMillis(defaultTimeoutSeconds); + } + + /* Clamp at zero so a configured default above the cap yields no extra time instead of a negative adjustment. */ long maxExtraSeconds = Math.max(0, LIBVIRT_RESTART_ECHO_TIMEOUT_MAX_SECONDS - defaultTimeoutSeconds); + long extraVmCount = vmCount - LIBVIRT_RESTART_ECHO_TIMEOUT_VM_THRESHOLD; + long extraSeconds = Math.min(extraVmCount * LIBVIRT_RESTART_ECHO_TIMEOUT_PER_VM_SECONDS, maxExtraSeconds); + return TimeUnit.SECONDS.toMillis(defaultTimeoutSeconds + extraSeconds); + } + public static boolean shouldContinueReconnectOnAnsibleFailure(boolean isNewAdded, ErrorCode errorCode) { return !isNewAdded && isLibvirtSocketMaskSystemdTimeout(errorCode); } diff --git 
a/test/src/test/groovy/org/zstack/test/integration/kvm/host/LibvirtRestartEchoTimeoutCase.groovy b/test/src/test/groovy/org/zstack/test/integration/kvm/host/LibvirtRestartEchoTimeoutCase.groovy new file mode 100644 index 00000000000..b67eced4a13 --- /dev/null +++ b/test/src/test/groovy/org/zstack/test/integration/kvm/host/LibvirtRestartEchoTimeoutCase.groovy @@ -0,0 +1,86 @@ +package org.zstack.test.integration.kvm.host + +import org.zstack.core.CoreGlobalProperty +import org.zstack.core.db.Q +import org.zstack.core.db.SQL +import org.zstack.header.vm.VmInstanceState +import org.zstack.header.vm.VmInstanceVO +import org.zstack.header.vm.VmInstanceVO_ +import org.zstack.kvm.KVMHostUtils +import org.zstack.sdk.HostInventory +import org.zstack.sdk.VmInstanceInventory +import org.zstack.test.integration.kvm.Env +import org.zstack.test.integration.kvm.KvmTest +import org.zstack.testlib.EnvSpec +import org.zstack.testlib.SubCase + +import java.util.concurrent.TimeUnit + +/** Integration sub-case covering the libvirt-restart echo timeout calculation and the VM count query in KVMHostUtils. */ class LibvirtRestartEchoTimeoutCase extends SubCase { + EnvSpec env + + @Override + void setup() { + useSpring(KvmTest.springSpec) + } + + @Override + void environment() { + env = Env.oneVmBasicEnv() + } + + @Override + void test() { + env.create { + testCalculateLibvirtRestartEchoTimeout() + testCountVmsForLibvirtRestartEchoTimeoutExcludesStoppedVm() + } + } + + @Override + void clean() { + env.delete() + } + + /** Checks the threshold, one-over-threshold and capped values with the default set to 60s, then that a 300s default is returned unchanged; the global property is restored afterwards. */ void testCalculateLibvirtRestartEchoTimeout() { + int oldTimeout = CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT + try { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = 60 + + assert KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(100) == TimeUnit.SECONDS.toMillis(60) + assert KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(101) == TimeUnit.SECONDS.toMillis(61) + assert KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(10000) == TimeUnit.SECONDS.toMillis(180) + + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = 300 + assert KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(10000) 
== TimeUnit.SECONDS.toMillis(300) + } finally { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = oldTimeout + } + } + + /** Sets the env VM's state to Stopped directly in the database and expects the count to drop by one; the original state is written back afterwards. */ void testCountVmsForLibvirtRestartEchoTimeoutExcludesStoppedVm() { + HostInventory host = env.inventoryByName("kvm") as HostInventory + VmInstanceInventory vm = env.inventoryByName("vm") as VmInstanceInventory + + long originalCount = KVMHostUtils.countVmsForLibvirtRestartEchoTimeout(host.uuid) + assert originalCount > 0 + + VmInstanceState originalState = Q.New(VmInstanceVO.class) + .select(VmInstanceVO_.state) + .eq(VmInstanceVO_.uuid, vm.uuid) + .findValue() + try { + SQL.New(VmInstanceVO.class) + .eq(VmInstanceVO_.uuid, vm.uuid) + .set(VmInstanceVO_.state, VmInstanceState.Stopped) + .update() + + assert KVMHostUtils.countVmsForLibvirtRestartEchoTimeout(host.uuid) == originalCount - 1 + } finally { + SQL.New(VmInstanceVO.class) + .eq(VmInstanceVO_.uuid, vm.uuid) + .set(VmInstanceVO_.state, originalState) + .update() + } + } +} diff --git a/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java b/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java index f3c1222a010..0ae331da3d5 100644 --- a/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java +++ b/test/src/test/java/org/zstack/test/kvm/KVMHostUtilsTest.java @@ -2,10 +2,12 @@ import org.junit.Assert; import org.junit.Test; +import org.zstack.core.CoreGlobalProperty; import org.zstack.header.errorcode.ErrorCode; import org.zstack.kvm.KVMHostUtils; import java.util.Collections; +import java.util.concurrent.TimeUnit; public class KVMHostUtilsTest { @@ -189,6 +191,68 @@ public void shouldForceTlsRedeploy_reconnectWithoutRestartSkips() { Assert.assertFalse(KVMHostUtils.shouldForceTlsRedeploy(true, false, false)); } + /** null and "false" arguments yield false; "true" in either argument, in any letter case, yields true. */ @Test + public void shouldRestartLibvirtdDuringDeploy_initOrRestartLibvirtdTriggers() { + Assert.assertFalse(KVMHostUtils.shouldRestartLibvirtdDuringDeploy(null, null)); + Assert.assertFalse(KVMHostUtils.shouldRestartLibvirtdDuringDeploy("false", "false")); + 
Assert.assertTrue(KVMHostUtils.shouldRestartLibvirtdDuringDeploy("true", "false")); + Assert.assertTrue(KVMHostUtils.shouldRestartLibvirtdDuringDeploy("false", "true")); + Assert.assertTrue(KVMHostUtils.shouldRestartLibvirtdDuringDeploy("TrUe", null)); + Assert.assertTrue(KVMHostUtils.shouldRestartLibvirtdDuringDeploy(null, "TrUe")); + } + + /** Exactly 100 VMs with a 60s default keeps the 60s timeout. */ @Test + public void calculateLibvirtRestartEchoTimeout_keepsDefaultAtThreshold() { + int oldTimeout = CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT; + try { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = 60; + Assert.assertEquals(TimeUnit.SECONDS.toMillis(60), + KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(100)); + } finally { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = oldTimeout; + } + } + + /** 101 VMs with a 60s default yields 61s. */ @Test + public void calculateLibvirtRestartEchoTimeout_addsOneSecondAfterThreshold() { + int oldTimeout = CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT; + try { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = 60; + Assert.assertEquals(TimeUnit.SECONDS.toMillis(61), + KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(101)); + } finally { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = oldTimeout; + } + } + + /** 10000 VMs with a 60s default is capped at 180s. */ @Test + public void calculateLibvirtRestartEchoTimeout_capsAt180Seconds() { + int oldTimeout = CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT; + try { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = 60; + Assert.assertEquals(TimeUnit.SECONDS.toMillis(180), + KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(10000)); + } finally { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = oldTimeout; + } + } + + /** A configured default equal to or above the 180s cap is returned unchanged. */ @Test + public void calculateLibvirtRestartEchoTimeout_doesNotReduceConfiguredTimeoutAboveCap() { + int oldTimeout = CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT; + try { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = 180; + Assert.assertEquals(TimeUnit.SECONDS.toMillis(180), + KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(10000)); + + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = 300; + 
Assert.assertEquals(TimeUnit.SECONDS.toMillis(300), + KVMHostUtils.calculateLibvirtRestartEchoTimeoutMillis(10000)); + } finally { + CoreGlobalProperty.REST_FACADE_ECHO_TIMEOUT = oldTimeout; + } + } + @Test public void zstac77120_continueReconnectOnLibvirtSocketMaskSystemdTimeout() { ErrorCode error = new ErrorCode();