Skip to content

Commit 9feec2a

Browse files
committed
[executor] Further tweak kill timeouts
1 parent 29328c7 commit 9feec2a

1 file changed

Lines changed: 27 additions & 16 deletions

File tree

executor/executable/controllabletask.go

Lines changed: 27 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,9 @@ import (
4646
)
4747

4848
const(
49-
KILL_TIMEOUT = 2*time.Second
50-
KILL_TRANSITION_TIMEOUT = 3*time.Second
49+
SIGTERM_TIMEOUT = 1*time.Second
50+
SIGINT_TIMEOUT = 3*time.Second
51+
KILL_TRANSITION_TIMEOUT = 1*time.Second
5152
TRANSITION_TIMEOUT = 10*time.Second
5253
)
5354

@@ -449,7 +450,8 @@ func (t *ControllableTask) Kill() error {
449450
select {
450451
case commitResponse = <-commitDone:
451452
case <-time.After(KILL_TRANSITION_TIMEOUT):
452-
log.Error("deadline exceeded")
453+
log.WithField("task", t.ti.TaskID.Value).
454+
Warn("teardown transition sequence timed out")
453455
}
454456
// on timeout we should break
455457
if commitResponse == nil {
@@ -458,22 +460,28 @@ func (t *ControllableTask) Kill() error {
458460

459461
log.WithField("newState", commitResponse.newState).
460462
WithError(commitResponse.transitionError).
463+
WithField("task", t.ti.TaskID.Value).
461464
Debug("transition committed")
462465
if commitResponse.transitionError != nil || len(cmd.Event) == 0 {
463-
log.WithError(commitResponse.transitionError).Error("cannot gracefully end task")
466+
log.WithError(commitResponse.transitionError).
467+
WithField("task", t.ti.TaskID.Value).
468+
Warn("teardown transition sequence error")
464469
break
465470
}
466471
reachedState = commitResponse.newState
467472
}
468473

469-
log.Debug("end transition loop done")
474+
log.WithField("task", t.ti.TaskID.Value).
475+
Debug("teardown transition sequence done")
470476
pid = int(response.GetPid())
471477
if pid == 0 {
472478
// t.knownPid must be valid because GetState was sure to have been successful in the past
473479
pid = t.knownPid
474480
}
475-
} else {
476-
log.WithError(err).WithField("taskId", t.ti.GetTaskID()).Warn("cannot query task status for graceful process termination")
481+
} else { // If a true PID was never acquired during the lifetime of this task
482+
log.WithError(err).
483+
WithField("taskId", t.ti.GetTaskID()).
484+
Warn("cannot query task status for graceful process termination")
477485
pid = t.knownPid
478486
if pid == 0 {
479487
// The pid was never known through a successful `GetState` in the lifetime
@@ -486,18 +494,21 @@ func (t *ControllableTask) Kill() error {
486494
// terminate the shell that is wrapping the command, so we avoid using
487495
// negative PID in all other cases in order to allow FairMQ cleanup to
488496
// run.
489-
log.WithError(err).WithField("taskId", t.ti.GetTaskID()).Warn("task PID not known from task, using containing shell PGID")
497+
log.WithError(err).WithField("taskId", t.ti.GetTaskID()).
498+
Warn("task PID not known from task, using containing shell PGID")
490499
}
491500
}
492501

493502
_ = t.rpc.Close()
494503
t.rpc = nil
495504

496505
if reachedState == "DONE" {
497-
log.Debug("task exited correctly")
506+
log.WithField("taskId", t.ti.TaskID.Value).
507+
Debug("task exited correctly")
498508
t.pendingFinalTaskStateCh <- mesos.TASK_FINISHED
499509
} else { // something went wrong
500-
log.Debug("task killed")
510+
log.WithField("taskId", t.ti.TaskID.Value).
511+
Debug("task killed")
501512
t.pendingFinalTaskStateCh <- mesos.TASK_KILLED
502513
}
503514

@@ -507,7 +518,7 @@ func (t *ControllableTask) Kill() error {
507518
if err != nil {
508519
log.WithError(err).
509520
WithField("taskId", t.ti.GetTaskID()).
510-
Warning("could not gracefully kill task")
521+
Warning("task SIGTERM failed")
511522
}
512523
killErrCh <- err
513524
}()
@@ -517,30 +528,30 @@ func (t *ControllableTask) Kill() error {
517528
select {
518529
case killErr := <- killErrCh:
519530
if killErr == nil {
520-
time.Sleep(KILL_TIMEOUT)
531+
time.Sleep(SIGTERM_TIMEOUT) // Waiting for the SIGTERM to kick in
521532
if pidExists(pid) {
522533
// SIGINT for the "Waiting for graceful device shutdown.
523534
// Hit Ctrl-C again to abort immediately" message.
524535
killErr = syscall.Kill(pid, syscall.SIGINT)
525536
if killErr != nil {
526537
log.WithError(killErr).
527538
WithField("taskId", t.ti.GetTaskID()).
528-
Warning("could not gracefully kill task")
539+
Warning("task SIGINT failed")
529540
}
530-
time.Sleep(KILL_TIMEOUT)
541+
time.Sleep(SIGINT_TIMEOUT)
531542
}
532543
if !pidExists(pid) {
533544
return killErr
534545
}
535546
}
536-
case <-time.After(KILL_TRANSITION_TIMEOUT):
547+
case <-time.After(SIGTERM_TIMEOUT + SIGINT_TIMEOUT):
537548
}
538549

539550
killErr := syscall.Kill(pid, syscall.SIGKILL)
540551
if killErr != nil {
541552
log.WithError(killErr).
542553
WithField("taskId", t.ti.GetTaskID()).
543-
Warning("could not kill task")
554+
Warning("task SIGKILL failed")
544555
}
545556

546557
return killErr

0 commit comments

Comments
 (0)