Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM alpine:3.13.5
FROM alpine:3.17.3

RUN apk add --no-cache curl jq
RUN apk add --no-cache curl jq bash

COPY docker-entrypoint /
ENTRYPOINT ["/docker-entrypoint"]
Expand All @@ -9,6 +9,7 @@ ENV AUTOHEAL_CONTAINER_LABEL=autoheal \
AUTOHEAL_START_PERIOD=0 \
AUTOHEAL_INTERVAL=5 \
AUTOHEAL_DEFAULT_STOP_TIMEOUT=10 \
AUTOHEAL_RETRIES=5 \
DOCKER_SOCK=/var/run/docker.sock \
CURL_TIMEOUT=30 \
WEBHOOK_URL=""
Expand Down
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,25 +47,26 @@ The certificates, and keys need these names:

### Change Timezone
If you need the container's timezone to match the local machine, you can mount the host's `/etc/localtime` into the container.
```
```bash
docker run ... -v /etc/localtime:/etc/localtime:ro
```


## ENV Defaults
```
```bash
AUTOHEAL_CONTAINER_LABEL=autoheal
AUTOHEAL_INTERVAL=5 # check every 5 seconds
AUTOHEAL_START_PERIOD=0 # wait 0 seconds before first health check
AUTOHEAL_DEFAULT_STOP_TIMEOUT=10 # Docker waits max 10 seconds (the Docker default) for a container to stop before killing during restarts (container overridable via label, see below)
DOCKER_SOCK=/var/run/docker.sock # Unix socket for curl requests to Docker API
CURL_TIMEOUT=30 # --max-time seconds for curl requests to Docker API
WEBHOOK_URL="" # post message to the webhook if a container was restarted (or restart failed)
AUTOHEAL_INTERVAL=5 # check every 5 seconds
AUTOHEAL_START_PERIOD=0 # wait 0 seconds before first health check
AUTOHEAL_DEFAULT_STOP_TIMEOUT=10 # Docker waits max 10 seconds (the Docker default) for a container to stop before killing during restarts (container overridable via label, see below)
AUTOHEAL_RETRIES=5 # Maximum number of attempts to restart an unhealthy container before giving up
DOCKER_SOCK=/var/run/docker.sock # Unix socket for curl requests to Docker API
CURL_TIMEOUT=30 # --max-time seconds for curl requests to Docker API
WEBHOOK_URL="" # post message to the webhook if a container was restarted (or restart failed)
```

### Optional Container Labels
```
autoheal.stop.timeout=20 # Per containers override for stop timeout seconds during restart
```bash
autoheal.stop.timeout=20 # Per-container override for stop timeout seconds during restart
```

## Testing
Expand Down
59 changes: 46 additions & 13 deletions docker-entrypoint
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env sh
#!/usr/bin/env bash

set -e
# shellcheck disable=2039
Expand All @@ -25,6 +25,7 @@ AUTOHEAL_CONTAINER_LABEL=${AUTOHEAL_CONTAINER_LABEL:-autoheal}
AUTOHEAL_START_PERIOD=${AUTOHEAL_START_PERIOD:-0}
AUTOHEAL_INTERVAL=${AUTOHEAL_INTERVAL:-5}
AUTOHEAL_DEFAULT_STOP_TIMEOUT=${AUTOHEAL_DEFAULT_STOP_TIMEOUT:-10}
AUTOHEAL_RETRIES=${AUTOHEAL_RETRIES:-5}

docker_curl() {
curl --max-time "${CURL_TIMEOUT}" --no-buffer -s \
Expand Down Expand Up @@ -73,7 +74,7 @@ generate_webhook_payload() {
cat <<EOF
{
"text":"$text"
}
}
EOF
}

Expand All @@ -89,14 +90,16 @@ if [ "$1" = "autoheal" ] && [ -e "$DOCKER_SOCK" ];then
# Delayed startup
if [ "$AUTOHEAL_START_PERIOD" -gt 0 ]
then
echo "Monitoring containers for unhealthy status in $AUTOHEAL_START_PERIOD second(s)"
echo "Monitoring containers for unhealthy status in $AUTOHEAL_START_PERIOD second(s)" >&2
sleep "$AUTOHEAL_START_PERIOD"
fi


declare -A SICK_CONTAINERS

while true
do
STOP_TIMEOUT=".Labels[\"autoheal.stop.timeout\"] // $AUTOHEAL_DEFAULT_STOP_TIMEOUT"
get_container_info | \
shopt -s lastpipe; set +m; get_container_info | \
jq -r "foreach .[] as \$CONTAINER([];[]; \$CONTAINER | .Id, .Names[0], .State, ${STOP_TIMEOUT})" | \
while read -r CONTAINER_ID && read -r CONTAINER_NAME && read -r CONTAINER_STATE && read -r TIMEOUT
do
Expand All @@ -109,18 +112,48 @@ if [ "$1" = "autoheal" ] && [ -e "$DOCKER_SOCK" ];then
echo "$DATE Container name of (${CONTAINER_SHORT_ID}) is null, which implies container does not exist - don't restart" >&2
elif [ "$CONTAINER_STATE" = "restarting" ]
then
echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be restarting - don't restart"
echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be restarting - don't restart" >&2
else
echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be unhealthy - Restarting container now with ${TIMEOUT}s timeout"
if ! restart_container "$CONTAINER_ID" "$TIMEOUT"
then
echo "$DATE Restarting container $CONTAINER_SHORT_ID failed" >&2
notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Failed to restart the container!" &
else
notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Successfully restarted the container!" &
if [ ${SICK_CONTAINERS[$CONTAINER_ID]+_} ]; then
echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) already in a queue" >&2
else
echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be unhealthy - adding to queue" >&2
SICK_CONTAINERS[$CONTAINER_ID]="CONTAINER_ID=\"$CONTAINER_ID\" CONTAINER_SHORT_ID=\"$CONTAINER_SHORT_ID\" CONTAINER_NAME=\"$CONTAINER_NAME\" CONTAINER_STATE=\"$CONTAINER_STATE\" TIMEOUT=\"$TIMEOUT\" RETRY=0"
fi
fi
done

declare -A SICK_CONTAINERS_NEW

for SICK_CONTAINER in "${SICK_CONTAINERS[@]}"; do
eval "$SICK_CONTAINER"

if ! restart_container "$CONTAINER_ID" "$TIMEOUT"
then
echo "$DATE Restarting container $CONTAINER_SHORT_ID failed" >&2
notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Failed to restart the container!" &

RETRY_NEW=$((RETRY+1))
if [ $RETRY_NEW -lt $AUTOHEAL_RETRIES ]; then
SICK_CONTAINER_NEW="${SICK_CONTAINER/RETRY=$RETRY/RETRY=$RETRY_NEW}"
SICK_CONTAINERS_NEW[$CONTAINER_ID]="$SICK_CONTAINER_NEW"
else
echo "$DATE All attempts to restart the container $CONTAINER_SHORT_ID have failed" >&2
fi
else
echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) - successfully restarted the container" >&2
notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Successfully restarted the container!" &
fi
done

unset SICK_CONTAINERS
declare -A SICK_CONTAINERS

for idx in "${!SICK_CONTAINERS_NEW[@]}"; do
SICK_CONTAINERS[$idx]="${SICK_CONTAINERS_NEW[$idx]}"
done
unset SICK_CONTAINERS_NEW

sleep "$AUTOHEAL_INTERVAL"
done

Expand Down