Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions api/src/main/java/com/cloud/agent/api/Command.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public static enum OnError {
@LogLevel(Log4jLevel.Trace)
protected Map<String, String> contextMap = new HashMap<String, String>();
private int wait; //in second
private boolean bypassHostMaintenance = false;

protected Command() {
this.wait = 0;
Expand Down Expand Up @@ -74,6 +75,14 @@ public boolean allowCaching() {
return true;
}

/**
 * Reports whether this command has been flagged to bypass the host maintenance check.
 * The flag defaults to {@code false} and is only changed through
 * {@link #setBypassHostMaintenance(boolean)}.
 *
 * @return the current value of the bypass-host-maintenance flag
 */
public boolean isBypassHostMaintenance() {
    return this.bypassHostMaintenance;
}

/**
 * Sets the flag that marks this command as allowed to bypass the host maintenance check.
 *
 * @param bypassHostMaintenance {@code true} to flag the command as bypassing host
 *                              maintenance, {@code false} to clear the flag
 */
public void setBypassHostMaintenance(final boolean bypassHostMaintenance) {
    this.bypassHostMaintenance = bypassHostMaintenance;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,7 @@
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import com.cloud.agent.api.ModifySshKeysCommand;
import com.cloud.agent.api.ModifyStoragePoolCommand;
import org.apache.cloudstack.agent.lb.SetupMSListCommand;
import com.cloud.agent.api.RollingMaintenanceCommand;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.log4j.Logger;

Expand All @@ -48,10 +45,13 @@
import com.cloud.agent.api.Command;
import com.cloud.agent.api.MaintainCommand;
import com.cloud.agent.api.MigrateCommand;
import com.cloud.agent.api.ModifySshKeysCommand;
import com.cloud.agent.api.ModifyStoragePoolCommand;
import com.cloud.agent.api.ModifyTargetsCommand;
import com.cloud.agent.api.PingTestCommand;
import com.cloud.agent.api.PvlanSetupCommand;
import com.cloud.agent.api.ReadyCommand;
import com.cloud.agent.api.RollingMaintenanceCommand;
import com.cloud.agent.api.SetupCommand;
import com.cloud.agent.api.ShutdownCommand;
import com.cloud.agent.api.StartCommand;
Expand Down Expand Up @@ -167,7 +167,7 @@ protected void checkAvailability(final Command[] cmds) throws AgentUnavailableEx

if (_maintenance) {
for (final Command cmd : cmds) {
if (Arrays.binarySearch(s_commandsAllowedInMaintenanceMode, cmd.getClass().toString()) < 0) {
if (Arrays.binarySearch(s_commandsAllowedInMaintenanceMode, cmd.getClass().toString()) < 0 && !cmd.isBypassHostMaintenance()) {
throw new AgentUnavailableException("Unable to send " + cmd.getClass().toString() + " because agent " + _name + " is in maintenance mode", _id);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,11 @@ public void advanceExpunge(final String vmUuid) throws ResourceUnavailableExcept
advanceExpunge(vm);
}

/**
 * Decides whether commands sent while expunging the given VM may be flagged to
 * bypass the host maintenance check.
 *
 * @param vm the VM being expunged
 * @return {@code true} only when the VM type is SecondaryStorageVm or ConsoleProxy
 */
private boolean expungeCommandCanBypassHostMaintenance(VirtualMachine vm) {
    if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
        return true;
    }
    return VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
}

protected void advanceExpunge(VMInstanceVO vm) throws ResourceUnavailableException, OperationTimedoutException, ConcurrentOperationException {
if (vm == null || vm.getRemoved() != null) {
if (s_logger.isDebugEnabled()) {
Expand Down Expand Up @@ -565,6 +570,7 @@ protected void advanceExpunge(VMInstanceVO vm) throws ResourceUnavailableExcepti
final Commands cmds = new Commands(Command.OnError.Stop);

for (final Command volumeExpungeCommand : volumeExpungeCommands) {
volumeExpungeCommand.setBypassHostMaintenance(expungeCommandCanBypassHostMaintenance(vm));
cmds.addCommand(volumeExpungeCommand);
}

Expand Down Expand Up @@ -606,10 +612,12 @@ protected void advanceExpunge(VMInstanceVO vm) throws ResourceUnavailableExcepti
if (hostId != null) {
final Commands cmds = new Commands(Command.OnError.Stop);
for (final Command command : finalizeExpungeCommands) {
command.setBypassHostMaintenance(expungeCommandCanBypassHostMaintenance(vm));
cmds.addCommand(command);
}
if (nicExpungeCommands != null) {
for (final Command command : nicExpungeCommands) {
command.setBypassHostMaintenance(expungeCommandCanBypassHostMaintenance(vm));
cmds.addCommand(command);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
*/
package org.apache.cloudstack.storage.datastore.driver;

import static com.cloud.utils.NumbersUtil.toHumanReadableSize;

import java.util.HashMap;
import java.util.Map;
import java.util.UUID;

import javax.inject.Inject;

import org.apache.log4j.Logger;

import org.apache.cloudstack.engine.subsystem.api.storage.ChapInfo;
import org.apache.cloudstack.engine.subsystem.api.storage.CopyCommandResult;
import org.apache.cloudstack.engine.subsystem.api.storage.CreateCmdResult;
Expand Down Expand Up @@ -53,6 +53,7 @@
import org.apache.cloudstack.storage.to.SnapshotObjectTO;
import org.apache.cloudstack.storage.to.TemplateObjectTO;
import org.apache.cloudstack.storage.volume.VolumeObject;
import org.apache.log4j.Logger;

import com.cloud.agent.api.Answer;
import com.cloud.agent.api.storage.ResizeVolumeAnswer;
Expand All @@ -70,16 +71,17 @@
import com.cloud.storage.Storage;
import com.cloud.storage.StorageManager;
import com.cloud.storage.StoragePool;
import com.cloud.storage.Volume;
import com.cloud.storage.dao.DiskOfferingDao;
import com.cloud.storage.dao.SnapshotDao;
import com.cloud.storage.dao.VMTemplateDao;
import com.cloud.storage.dao.VolumeDao;
import com.cloud.storage.snapshot.SnapshotManager;
import com.cloud.template.TemplateManager;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.dao.VMInstanceDao;

import static com.cloud.utils.NumbersUtil.toHumanReadableSize;

public class CloudStackPrimaryDataStoreDriverImpl implements PrimaryDataStoreDriver {
@Override
public Map<String, String> getCapabilities() {
Expand Down Expand Up @@ -211,10 +213,22 @@ public void createAsync(DataStore dataStore, DataObject data, AsyncCompletionCal
}
}

/**
 * Decides whether a command built for the given data object may be flagged to
 * bypass the host maintenance check.
 *
 * Only volumes qualify, and only when the volume is attached to an instance that
 * can be loaded and whose type is SecondaryStorageVm or ConsoleProxy.
 *
 * @param data the data object the command operates on
 * @return {@code true} when the object is a volume of a secondary storage VM or
 *         console proxy VM, {@code false} in every other case
 */
private boolean commandCanBypassHostMaintenance(DataObject data) {
    if (!DataObjectType.VOLUME.equals(data.getType())) {
        return false;
    }

    final Volume volume = (Volume)data;
    if (volume.getInstanceId() == null) {
        // Volume is not attached to any instance.
        return false;
    }

    final VMInstanceVO vm = vmDao.findById(volume.getInstanceId());
    if (vm == null) {
        return false;
    }
    if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
        return true;
    }
    return VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shwstppr Is bypass host maintenance applicable for volumes in managed storage as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sureshanaparti yes, bypasshostmaintenance flag could be true for volumes in managed stores as well if the volume is of cpvm or ssvm

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok @shwstppr, how do other drivers set this parameter?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sureshanaparti other drivers don't seem to be using DeleteCommand.

~/lab/shapeblue/cloudstack|fix-uncleared-sys-vms-3719⚡ 
⇒  grep "new DeleteCommand" -R .
./engine/storage/image/src/main/java/org/apache/cloudstack/storage/image/TemplateServiceImpl.java:                            DeleteCommand dtCommand = new DeleteCommand(tmplTO);
./engine/storage/volume/src/main/java/org/apache/cloudstack/storage/volume/VolumeServiceImpl.java:                        DeleteCommand dtCommand = new DeleteCommand(tmplTO);
./engine/storage/src/main/java/org/apache/cloudstack/storage/image/BaseImageStoreDriverImpl.java:            DeleteCommand cmd = new DeleteCommand(data.getTO());
./server/src/main/java/com/cloud/vm/UserVmManagerImpl.java:                    cmd = new DeleteCommand(volumeInfo.getTO());
./plugins/storage/volume/default/src/main/java/org/apache/cloudstack/storage/datastore/driver/CloudStackPrimaryDataStoreDriverImpl.java:        DeleteCommand cmd = new DeleteCommand(data.getTO());
./plugins/storage/volume/sample/src/main/java/org/apache/cloudstack/storage/datastore/driver/SamplePrimaryDataStoreDriverImpl.java:         * DeleteCommand cmd = new DeleteCommand(vo.getUri());
./plugins/hypervisors/vmware/src/main/java/com/cloud/storage/resource/VmwareStorageSubsystemCommandHandler.java:                        DeleteCommand deleteCommand = new DeleteCommand(template);
./plugins/hypervisors/vmware/src/main/java/com/cloud/storage/resource/VmwareStorageSubsystemCommandHandler.java:                    DeleteCommand deleteCommand = new DeleteCommand(newSnapshot);
./plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/guru/VMwareGuru.java:                    DeleteCommand cmd = new DeleteCommand(volumeInfo.getTO());
./plugins/hypervisors/ovm3/src/test/java/com/cloud/hypervisor/ovm3/resources/Ovm3StorageProcessorTest.java:        DeleteCommand delete = new DeleteCommand(vol);
./plugins/hypervisors/ovm3/src/test/java/com/cloud/hypervisor/ovm3/resources/Ovm3StorageProcessorTest.java:        delete = new DeleteCommand(template);
./plugins/hypervisors/ovm3/src/test/java/com/cloud/hypervisor/ovm3/resources/Ovm3StorageProcessorTest.java:        delete = new DeleteCommand(snap);
./services/secondary-storage/server/src/main/java/org/apache/cloudstack/storage/resource/NfsSecondaryStorageResource.java:            DeleteCommand deleteCommand = new DeleteCommand(newTemplate)

PS: SamplePrimaryDataStoreDriverImpl code is commented

@Override
public void deleteAsync(DataStore dataStore, DataObject data, AsyncCompletionCallback<CommandResult> callback) {
DeleteCommand cmd = new DeleteCommand(data.getTO());

cmd.setBypassHostMaintenance(commandCanBypassHostMaintenance(data));
CommandResult result = new CommandResult();
try {
EndPoint ep = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,13 @@ private void allocCapacity(long dataCenterId) {
}

public boolean isZoneReady(Map<Long, ZoneHostInfo> zoneHostInfoMap, long dataCenterId) {
List <HostVO> hosts = _hostDao.listByDataCenterId(dataCenterId);
if (CollectionUtils.isEmpty(hosts)) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Zone " + dataCenterId + " has no host available which is enabled and in Up state");
}
return false;
}
Comment on lines +1009 to +1015
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the exact same code as in SecondaryStorageManagerImpl.isZoneReady(); can you factor it out and reuse it?

ZoneHostInfo zoneHostInfo = zoneHostInfoMap.get(dataCenterId);
if (zoneHostInfo != null && isZoneHostReady(zoneHostInfo)) {
VMTemplateVO template = _templateDao.findSystemVMReadyTemplate(dataCenterId, HypervisorType.Any);
Expand Down
102 changes: 63 additions & 39 deletions server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,32 @@
// under the License.
package com.cloud.ha;

import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import javax.inject.Inject;
import javax.naming.ConfigurationException;

import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationService;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;

import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
import com.cloud.cluster.ClusterManagerListener;
import com.cloud.consoleproxy.ConsoleProxyManager;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.DataCenterVO;
import com.cloud.dc.HostPodVO;
Expand Down Expand Up @@ -46,37 +69,16 @@
import com.cloud.storage.StorageManager;
import com.cloud.storage.dao.GuestOSCategoryDao;
import com.cloud.storage.dao.GuestOSDao;
import com.cloud.storage.secondary.SecondaryStorageVmManager;
import com.cloud.user.AccountManager;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.concurrency.NamedThreadFactory;
import com.cloud.utils.exception.CloudRuntimeException;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VirtualMachine.State;
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VirtualMachineProfile;
import com.cloud.vm.dao.VMInstanceDao;
import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationService;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;

import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import javax.inject.Inject;
import javax.naming.ConfigurationException;

/**
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
Expand Down Expand Up @@ -125,9 +127,12 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
HostPodDao _podDao;
@Inject
ClusterDetailsDao _clusterDetailsDao;

@Inject
ServiceOfferingDao _serviceOfferingDao;
@Inject
private ConsoleProxyManager consoleProxyManager;
@Inject
private SecondaryStorageVmManager secondaryStorageVmManager;

long _serverId;

Expand Down Expand Up @@ -680,31 +685,51 @@ public void cancelDestroy(VMInstanceVO vm, Long hostId) {
_haDao.delete(vm.getId(), WorkType.Destroy);
}

/**
 * Stops the VM through the VM manager, but only when the supplied state is Running.
 * For any other state (including {@code null}) nothing is done.
 *
 * @param vm    the VM to stop
 * @param state the state that decides whether a stop is issued
 * @throws OperationTimedoutException   propagated from the stop call
 * @throws ResourceUnavailableException propagated from the stop call
 */
private void stopVMWithCleanup(VirtualMachine vm, VirtualMachine.State state) throws OperationTimedoutException, ResourceUnavailableException {
    if (!VirtualMachine.State.Running.equals(state)) {
        return;
    }
    // NOTE(review): the second argument is presumably the "cleanup" flag of advanceStop - confirm.
    _itMgr.advanceStop(vm.getUuid(), true);
}

/**
 * Destroys the given VM, choosing the manager that handles its type:
 * console proxy VMs go through the console proxy manager, secondary storage VMs
 * through the secondary storage VM manager, and everything else through the
 * generic VM manager.
 *
 * @param vm      the VM to destroy
 * @param expunge passed to the generic VM manager's destroy call; not used for
 *                console proxy or secondary storage VMs
 * @throws OperationTimedoutException declared for the underlying destroy calls
 * @throws AgentUnavailableException  declared for the underlying destroy calls
 */
private void destroyVM(VirtualMachine vm, boolean expunge) throws OperationTimedoutException, AgentUnavailableException {
    s_logger.info("Destroying " + vm.toString());

    if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
        consoleProxyManager.destroyProxy(vm.getId());
        return;
    }
    if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
        secondaryStorageVmManager.destroySecStorageVm(vm.getId());
        return;
    }
    _itMgr.destroy(vm.getUuid(), expunge);
}

protected Long destroyVM(final HaWorkVO work) {
final VirtualMachine vm = _itMgr.findById(work.getInstanceId());
s_logger.info("Destroying " + vm.toString());
if (vm == null) {
s_logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
return null;
}
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
boolean expungeSystemVM = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sureshanaparti I think the name expunge is better here, as it is used irrespective of VM type in the destroyVM method, though it holds the value true only for SSVM and CPVM.

|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
s_logger.info("VM " + vm.getUuid() + " already in " + vm.getState() + " state. Throwing away " + work);
return null;
}
try {
if (vm.getState() != State.Destroyed) {
s_logger.info("VM is no longer in Destroyed state " + vm.toString());
return null;
}

if (vm.getHostId() != null) {
_itMgr.destroy(vm.getUuid(), false);
s_logger.info("Successfully destroy " + vm);
stopVMWithCleanup(vm, work.getPreviousState());
if (!VirtualMachine.State.Expunging.equals(work.getPreviousState())) {
destroyVM(vm, expunge);
return null;
} else {
if (s_logger.isDebugEnabled()) {
s_logger.debug(vm + " has already been stopped");
}
return null;
s_logger.info("VM " + vm.getUuid() + " still in " + vm.getState() + " state.");
}
} catch (final AgentUnavailableException e) {
s_logger.debug("Agnet is not available" + e.getMessage());
s_logger.debug("Agent is not available" + e.getMessage());
} catch (OperationTimedoutException e) {
s_logger.debug("operation timed out: " + e.getMessage());
} catch (ConcurrentOperationException e) {
s_logger.debug("concurrent operation: " + e.getMessage());
} catch (ResourceUnavailableException e) {
s_logger.debug("Resource unavailable: " + e.getMessage());
}

return (System.currentTimeMillis() >> 10) + _stopRetryInterval;
Expand Down Expand Up @@ -793,9 +818,8 @@ private long getRescheduleTime(WorkType workType) {
case Stop:
case CheckStop:
case ForceStop:
return ((System.currentTimeMillis() >> 10) + _stopRetryInterval);
case Destroy:
return ((System.currentTimeMillis() >> 10) + _restartRetryInterval);
return ((System.currentTimeMillis() >> 10) + _stopRetryInterval);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shwstppr any reason to change to stop retry interval? if you want to keep it, move the case stmt after 'ForceStop'.

Alternatively, you can use a different config (maybe "destroy.retry.interval") for the destroy operation, and set its value to the current value of "restart.retry.interval" on upgrade.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sureshanaparti clubbed Destroy case with other cases.
Used _stopRetryInterval because it uses value of config - stop.retry.interval. The description for that config is - The time in seconds between retries to stop or destroy a VM.

https://github.com/apache/cloudstack/blob/4.15/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java#L50-L51

}
return 0;
}
Expand Down
25 changes: 17 additions & 8 deletions server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
// under the License.
package com.cloud.resource;

import static com.cloud.configuration.ConfigurationManagerImpl.SET_HOST_DOWN_TO_MAINTENANCE;

import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
Expand Down Expand Up @@ -180,9 +182,6 @@
import com.cloud.vm.dao.VMInstanceDao;
import com.google.gson.Gson;


import static com.cloud.configuration.ConfigurationManagerImpl.SET_HOST_DOWN_TO_MAINTENANCE;

@Component
public class ResourceManagerImpl extends ManagerBase implements ResourceManager, ResourceService, Manager {
private static final Logger s_logger = Logger.getLogger(ResourceManagerImpl.class);
Expand Down Expand Up @@ -1229,6 +1228,19 @@ public boolean resourceStateTransitTo(final Host host, final ResourceState.Event
return _hostDao.updateResourceState(currentState, event, nextState, host);
}

/**
 * Handles a VM that will not be migrated off a host entering maintenance, e.g.
 * because this is the last host in the cluster or because the VM uses a vGPU
 * (migration is not supported for vGPU VMs).
 *
 * Secondary storage VMs and console proxy VMs are scheduled for destruction;
 * every other VM is scheduled for a forced stop.
 *
 * @param host the host entering maintenance
 * @param vm   the VM on that host which is not being migrated
 */
private void handleVmForLastHostOrWithVGpu(final HostVO host, final VMInstanceVO vm) {
    final boolean isSsvmOrCpvm = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
            || VirtualMachine.Type.ConsoleProxy.equals(vm.getType());

    if (!isSsvmOrCpvm) {
        // Any other VM is force-stopped instead of migrated.
        s_logger.error(String.format("Maintenance: No hosts available for migrations. Scheduling shutdown for VM %s instead of migration.", vm.getUuid()));
        _haMgr.scheduleStop(vm, host.getId(), WorkType.ForceStop);
        return;
    }

    // SSVM/CPVM: destroy rather than stop.
    final String destroyMessage = String.format("Maintenance: VM is of type %s. Destroying VM %s (ID: %s) immediately instead of migration.", vm.getType().toString(), vm.getInstanceName(), vm.getUuid());
    s_logger.error(destroyMessage);
    _haMgr.scheduleDestroy(vm, host.getId());
}

private boolean doMaintain(final long hostId) {
final HostVO host = _hostDao.findById(hostId);
s_logger.info("Maintenance: attempting maintenance of host " + host.getUuid());
Expand Down Expand Up @@ -1266,10 +1278,7 @@ private boolean doMaintain(final long hostId) {
for (final VMInstanceVO vm : vms) {
if (hosts == null || hosts.isEmpty() || !answer.getMigrate()
|| _serviceOfferingDetailsDao.findDetail(vm.getServiceOfferingId(), GPU.Keys.vgpuType.toString()) != null) {
// Migration is not supported for VGPU Vms so stop them.
// for the last host in this cluster, stop all the VMs
s_logger.error("Maintenance: No hosts available for migrations. Scheduling shutdown instead of migrations.");
_haMgr.scheduleStop(vm, hostId, WorkType.ForceStop);
handleVmForLastHostOrWithVGpu(host, vm);
} else if (HypervisorType.LXC.equals(host.getHypervisorType()) && VirtualMachine.Type.User.equals(vm.getType())){
//Migration is not supported for LXC Vms. Schedule restart instead.
_haMgr.scheduleRestart(vm, false);
Expand Down Expand Up @@ -1417,7 +1426,7 @@ protected boolean setHostIntoPrepareForMaintenanceAfterErrorsFixed(HostVO host)
* on a host. We need to track the various VM states on each run and accordingly transit to the
* appropriate state.
*
* We change states as follws -
* We change states as follows -
* 1. If there are no VMs in running, migrating, starting, stopping, error, unknown states we can move
* to maintenance state. Note that there cannot be incoming migrations as the API Call prepare for
* maintenance checks incoming migrations before starting.
Expand Down
Loading