Skip to content

Commit 957b905

Browse files
committed
CLOUDSTACK-9782: Improve scheduling of jobs
- Removed three bg thread tasks, uses FSM event-trigger based scheduling
- On successful recovery, kicks VM HA
- Improves overall HA scheduling and task submission, lower DB access

Signed-off-by: Rohit Yadav <rohit.yadav@shapeblue.com>
1 parent 681875d commit 957b905

13 files changed

Lines changed: 206 additions & 216 deletions

File tree

api/src/org/apache/cloudstack/ha/HAConfig.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,10 @@ enum Event {
4747
ActivityCheckFailureUnderThresholdRatio,
4848
PowerCycle,
4949
Recovered,
50+
RetryRecovery,
5051
RecoveryWaitPeriodTimeout,
5152
RecoveryOperationThresholdExceeded,
53+
RetryFencing,
5254
Fenced;
5355

5456
public Long getServerId() {
@@ -123,6 +125,7 @@ public String getDescription() {
123125

124126
FSM.addTransition(Recovering, Event.Disabled, Disabled);
125127
FSM.addTransition(Recovering, Event.Ineligible, Ineligible);
128+
FSM.addTransition(Recovering, Event.RetryRecovery, Recovering);
126129
FSM.addTransition(Recovering, Event.Recovered, Recovered);
127130
FSM.addTransition(Recovering, Event.RecoveryOperationThresholdExceeded, Fencing);
128131

@@ -132,6 +135,7 @@ public String getDescription() {
132135

133136
FSM.addTransition(Fencing, Event.Disabled, Disabled);
134137
FSM.addTransition(Fencing, Event.Ineligible, Ineligible);
138+
FSM.addTransition(Fencing, Event.RetryFencing, Fencing);
135139
FSM.addTransition(Fencing, Event.Fenced, Fenced);
136140

137141
FSM.addTransition(Fenced, Event.Disabled, Disabled);

plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ public Boolean checkingHB() {
5454
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
5555
String result = cmd.execute(parser);
5656
s_logger.debug("KVMHAChecker pool: " + pool._poolIp);
57-
s_logger.debug("KVMHAChecker reture: " + result);
57+
s_logger.debug("KVMHAChecker result: " + result);
5858
s_logger.debug("KVMHAChecker parser: " + parser.getLine());
5959
if (result == null && parser.getLine().contains("> DEAD <")) {
6060
s_logger.debug("read heartbeat failed: ");

plugins/hypervisors/simulator/src/org/apache/cloudstack/ha/SimulatorHAProvider.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ public HAResource.ResourceSubType resourceSubType() {
7272

7373
@Override
7474
public boolean isEligible(final Host host) {
75+
if (host == null) {
76+
return false;
77+
}
7578
final SimulatorHAState haState = hostHAStateMap.get(host.getId());
7679
return !isInMaintenanceMode(host) && !isDisabled(host) && haState != null
7780
&& Hypervisor.HypervisorType.Simulator.equals(host.getHypervisorType());

plugins/outofbandmanagement-drivers/nested-cloudstack/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
<dependency>
4141
<groupId>br.com.autonomiccs</groupId>
4242
<artifactId>apache-cloudstack-java-client</artifactId>
43-
<version>1.0.4</version>
43+
<version>1.0.5</version>
4444
</dependency>
4545
</dependencies>
4646
</project>

server/src/org/apache/cloudstack/ha/HAManagerImpl.java

Lines changed: 133 additions & 176 deletions
Large diffs are not rendered by default.

server/src/org/apache/cloudstack/ha/HAResourceCounter.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ public long getRecoveryCounter() {
4141
}
4242

4343
public synchronized void incrActivityCounter(final boolean isFailure) {
44-
lastActivityCheckTimestamp = System.currentTimeMillis();
4544
activityCheckCounter.incrementAndGet();
4645
if (isFailure) {
4746
activityCheckFailureCounter.incrementAndGet();
@@ -71,8 +70,12 @@ public boolean hasActivityThresholdExceeded(final double failureRatio) {
7170
return activityCheckFailureCounter.get() > (activityCheckCounter.get() * failureRatio);
7271
}
7372

74-
public boolean canPerformActivityCheck(final Long activityCheckInterval) {
75-
return lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000);
73+
public synchronized boolean canPerformActivityCheck(final Long activityCheckInterval) {
74+
if (lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000)) {
75+
lastActivityCheckTimestamp = System.currentTimeMillis();
76+
return true;
77+
}
78+
return false;
7679
}
7780

7881
public boolean canRecheckActivity(final Long maxDegradedPeriod) {
@@ -121,7 +124,7 @@ public void setFenceFuture(final Future<Boolean> future) {
121124
fenceFuture = future;
122125
}
123126

124-
public boolean lastFencingCompleted() {
127+
public boolean canAttemptFencing() {
125128
return fenceFuture == null || fenceFuture.isDone();
126129
}
127130

server/src/org/apache/cloudstack/ha/provider/HAProvider.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,11 @@
1717

1818
package org.apache.cloudstack.ha.provider;
1919

20-
import com.cloud.utils.component.Adapter;
21-
2220
import org.apache.cloudstack.ha.HAConfig;
21+
import org.apache.cloudstack.ha.HAResource;
2322
import org.joda.time.DateTime;
2423

25-
import org.apache.cloudstack.ha.HAResource;
24+
import com.cloud.utils.component.Adapter;
2625

2726
public interface HAProvider<R extends HAResource> extends Adapter {
2827

@@ -57,7 +56,9 @@ enum HAProviderConfig {
5756

5857
boolean fence(R r) throws HAFenceException;
5958

60-
void setFenced(R r);
59+
void fenceSubResources(R r);
60+
61+
void enableMaintenance(R r);
6162

6263
void sendAlert(R r, HAConfig.HAState nextState);
6364

server/src/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ public boolean isInMaintenanceMode(final Host host) {
7171
}
7272

7373
@Override
74-
public void setFenced(final Host r) {
74+
public void fenceSubResources(final Host r) {
7575
if (r.getState() != Status.Down) {
7676
try {
7777
LOG.debug("Trying to disconnect the host without investigation and scheduling HA for the VMs on host id=" + r.getId());
@@ -80,11 +80,15 @@ public void setFenced(final Host r) {
8080
} catch (Exception e) {
8181
LOG.error("Failed to disconnect host and schedule HA restart of VMs after fencing the host: ", e);
8282
}
83-
try {
84-
resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
85-
} catch (NoTransitionException e) {
86-
LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
87-
}
83+
}
84+
}
85+
86+
@Override
87+
public void enableMaintenance(final Host r) {
88+
try {
89+
resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
90+
} catch (NoTransitionException e) {
91+
LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
8892
}
8993
}
9094

server/src/org/apache/cloudstack/ha/task/ActivityCheckTask.java

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717

1818
package org.apache.cloudstack.ha.task;
1919

20+
import java.util.concurrent.ExecutorService;
21+
22+
import javax.inject.Inject;
23+
2024
import org.apache.cloudstack.ha.HAConfig;
2125
import org.apache.cloudstack.ha.HAManager;
2226
import org.apache.cloudstack.ha.HAResource;
@@ -25,11 +29,7 @@
2529
import org.apache.cloudstack.ha.provider.HAProvider;
2630
import org.apache.cloudstack.ha.provider.HAProvider.HAProviderConfig;
2731
import org.apache.log4j.Logger;
28-
29-
import javax.inject.Inject;
30-
3132
import org.joda.time.DateTime;
32-
import java.util.concurrent.ExecutorService;
3333

3434
public class ActivityCheckTask extends BaseHATask {
3535

@@ -38,22 +38,24 @@ public class ActivityCheckTask extends BaseHATask {
3838
@Inject
3939
private HAManager haManager;
4040

41-
private final long disconnectTime;
41+
private long disconnectTime;
42+
private long maxActivityChecks;
43+
private double activityCheckFailureRatio;
4244

4345
public ActivityCheckTask(final HAResource resource, final HAProvider<HAResource> haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
4446
final ExecutorService executor, final long disconnectTime) {
4547
super(resource, haProvider, haConfig, haProviderConfig, executor);
4648
this.disconnectTime = disconnectTime;
49+
this.maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
50+
this.activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
4751
}
4852

4953
public boolean performAction() throws HACheckerException {
5054
return getHaProvider().hasActivity(getResource(), new DateTime(disconnectTime));
5155
}
5256

53-
public void processResult(boolean result, Throwable t) {
57+
public synchronized void processResult(boolean result, Throwable t) {
5458
final HAConfig haConfig = getHaConfig();
55-
final HAProvider<HAResource> haProvider = getHaProvider();
56-
final HAResource resource = getResource();
5759
final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
5860

5961
if (t != null && t instanceof HACheckerException) {
@@ -64,18 +66,17 @@ public void processResult(boolean result, Throwable t) {
6466

6567
counter.incrActivityCounter(!result);
6668

67-
long maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
6869
if (counter.getActivityCheckCounter() < maxActivityChecks) {
6970
haManager.transitionHAState(HAConfig.Event.TooFewActivityCheckSamples, haConfig);
7071
return;
7172
}
7273

73-
double activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
7474
if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) {
7575
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio, haConfig);
7676
} else {
77-
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig);
78-
counter.markResourceDegraded();
77+
if (haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig)) {
78+
counter.markResourceDegraded();
79+
}
7980
}
8081
counter.resetActivityCounter();
8182
}

server/src/org/apache/cloudstack/ha/task/BaseHATask.java

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,21 @@
1717

1818
package org.apache.cloudstack.ha.task;
1919

20+
import java.util.concurrent.Callable;
21+
import java.util.concurrent.ExecutionException;
22+
import java.util.concurrent.ExecutorService;
23+
import java.util.concurrent.Future;
24+
import java.util.concurrent.TimeUnit;
25+
import java.util.concurrent.TimeoutException;
26+
2027
import org.apache.cloudstack.ha.HAConfig;
2128
import org.apache.cloudstack.ha.HAResource;
2229
import org.apache.cloudstack.ha.provider.HACheckerException;
2330
import org.apache.cloudstack.ha.provider.HAFenceException;
2431
import org.apache.cloudstack.ha.provider.HAProvider;
2532
import org.apache.cloudstack.ha.provider.HARecoveryException;
2633
import org.apache.log4j.Logger;
27-
28-
import java.util.concurrent.Callable;
29-
import java.util.concurrent.ExecutionException;
30-
import java.util.concurrent.ExecutorService;
31-
import java.util.concurrent.Future;
32-
import java.util.concurrent.TimeUnit;
33-
import java.util.concurrent.TimeoutException;
34+
import org.joda.time.DateTime;
3435

3536
public abstract class BaseHATask implements Callable<Boolean> {
3637
public static final Logger LOG = Logger.getLogger(BaseHATask.class);
@@ -40,6 +41,7 @@ public abstract class BaseHATask implements Callable<Boolean> {
4041
private final HAConfig haConfig;
4142
private final ExecutorService executor;
4243
private Long timeout;
44+
private DateTime created;
4345

4446
public BaseHATask(final HAResource resource, final HAProvider<HAResource> haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
4547
final ExecutorService executor) {
@@ -48,6 +50,7 @@ public BaseHATask(final HAResource resource, final HAProvider<HAResource> haProv
4850
this.haConfig = haConfig;
4951
this.executor = executor;
5052
this.timeout = (Long)haProvider.getConfigValue(haProviderConfig, resource);
53+
this.created = new DateTime();
5154
}
5255

5356
public HAProvider<HAResource> getHaProvider() {
@@ -74,6 +77,9 @@ public boolean performAction() throws HACheckerException, HAFenceException, HARe
7477

7578
@Override
7679
public Boolean call() {
80+
if (new DateTime().minusHours(1).isAfter(getCreated())) {
81+
return false;
82+
}
7783
final Future<Boolean> future = executor.submit(new Callable<Boolean>() {
7884
@Override
7985
public Boolean call() throws HACheckerException, HAFenceException, HARecoveryException {
@@ -99,4 +105,7 @@ public Boolean call() throws HACheckerException, HAFenceException, HARecoveryExc
99105
return result;
100106
}
101107

108+
public DateTime getCreated() {
109+
return created;
110+
}
102111
}

0 commit comments

Comments
 (0)