Files

206 lines
7.9 KiB
Bash
Executable File

#!/usr/bin/env bash
# openclaw-watchdog — Monitor OpenClaw gateway, auto-recover from crashes
# https://github.com/jlgrimes/openclaw-watchdog
# MIT License — Jared Grimes
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# ─── Configuration (all overridable via env vars) ────────────────────────────
# A non-empty value already present in the environment is kept; otherwise the
# default written after ":=" is assigned.
: "${OPENCLAW_CONFIG_PATH:=$HOME/.openclaw/config.yaml}" # live gateway config file
: "${HEALTH_URL:=http://localhost:3000/health}"          # endpoint probed by check_health
: "${CHECK_INTERVAL:=30}" # seconds between checks
: "${FAIL_THRESHOLD:=3}" # consecutive failures before escalation
: "${WATCHDOG_LOG:=$HOME/.openclaw/watchdog.log}"        # file that log() appends to
: "${GOOD_CONFIG_PATH:=$HOME/.openclaw/config.yaml.good}" # last-known-good snapshot
# Notification settings (Telegram + ntfy)
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT:=1793951355}"
: "${NTFY_URL:=}"
: "${NTFY_TOPIC:=}"
# Legacy Discord settings (optional, for backwards compat)
: "${DISCORD_CHANNEL_ID:=}"
: "${DISCORD_BOT_TOKEN:=}"
# ─── State ────────────────────────────────────────────────────────────────────
# Mutable globals shared by the main loop and the alert helpers.
fail_count=0   # consecutive failed health checks
alerted=false  # true after SOS sent, prevents spam
was_down=false # tracks if we're recovering
# ─── Logging ──────────────────────────────────────────────────────────────────
#######################################
# Print a timestamped message and append the same line to the watchdog log.
# Globals:   WATCHDOG_LOG (read) - path of the log file
# Arguments: $* - message words, joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in $WATCHDOG_LOG
# Returns:   always 0; a logging problem must never take the watchdog down
#######################################
log() {
  local ts line log_dir
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  line="[$ts] $*"
  # On a fresh install the log directory may not exist yet; create it so the
  # append below does not fail on every call.
  log_dir="${WATCHDOG_LOG%/*}"
  if [[ -n "$log_dir" && "$log_dir" != "$WATCHDOG_LOG" && ! -d "$log_dir" ]]; then
    mkdir -p -- "$log_dir" 2>/dev/null || true
  fi
  # Both writes are best-effort: this script runs with errexit + pipefail, so
  # an unguarded failure here would terminate the whole watchdog.
  printf '%s\n' "$line" || true
  printf '%s\n' "$line" >>"$WATCHDOG_LOG" || true
  return 0
}
# ─── Discord messaging ───────────────────────────────────────────────────────
#######################################
# Post a message to the configured Discord channel.
# Globals:   DISCORD_CHANNEL_ID, DISCORD_BOT_TOKEN (read)
# Arguments: $1 - message text
# Returns:   0 without doing anything when either Discord setting is empty;
#            a failed request is reported through log as a warning
#######################################
discord_send() {
  local text="$1"
  # Discord is optional: skip quietly unless both settings are present.
  if [[ -z "$DISCORD_CHANNEL_ID" || -z "$DISCORD_BOT_TOKEN" ]]; then
    return 0
  fi
  # jq -Rs turns the raw message into a single JSON string literal, so quotes
  # and newlines in the text are escaped inside the request body.
  if ! curl --silent --fail --request POST \
    "https://discord.com/api/v10/channels/${DISCORD_CHANNEL_ID}/messages" \
    --header "Authorization: Bot ${DISCORD_BOT_TOKEN}" \
    --header "Content-Type: application/json" \
    --data "{\"content\": $(printf '%s' "$text" | jq -Rs .)}" \
    >/dev/null 2>&1; then
    log "WARN: Failed to send Discord message"
  fi
}
# ─── Telegram messaging ──────────────────────────────────────────────────────
#######################################
# Send a message to the configured Telegram chat via the bot sendMessage call.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT (read)
# Arguments: $1 - message text (sent with parse_mode "Markdown")
# Returns:   0 without doing anything when either Telegram setting is empty;
#            a failed request is reported through log as a warning
#######################################
telegram_send() {
  local text="$1"
  # Telegram is optional: skip quietly unless both settings are present.
  if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT" ]]; then
    return 0
  fi
  # jq -Rs encodes the raw message as one JSON string literal for the body.
  if ! curl --silent --fail --request POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --header "Content-Type: application/json" \
    --data "{\"chat_id\": \"$TELEGRAM_CHAT\", \"text\": $(printf '%s' "$text" | jq -Rs .), \"parse_mode\": \"Markdown\"}" \
    >/dev/null 2>&1; then
    log "WARN: Failed to send Telegram message"
  fi
}
# ─── ntfy messaging ─────────────────────────────────────────────────────────
#######################################
# Publish a notification to the configured ntfy topic.
# Globals:   NTFY_URL, NTFY_TOPIC (read); NTFY_MIN_PRIORITY (read, default 4)
# Arguments: $1 - title
#            $2 - message body
#            $3 - priority, default 4 (1-5 or low|default|high|urgent)
#            $4 - value for the Sound header, default "default"
# Returns:   0 without doing anything when the URL or topic is empty;
#            a failed request is reported through log as a warning
#######################################
ntfy_send() {
  local title="$1" msg="$2"
  local priority="${3:-4}" # 1-5 or low|default|high|urgent
  local sound="${4:-default}"
  # ntfy is optional: skip quietly unless both settings are present.
  if [[ -z "$NTFY_URL" || -z "$NTFY_TOPIC" ]]; then
    return 0
  fi
  # Enforce minimum priority (default 4). Only applied when both values are
  # plain integers; named priorities pass through untouched.
  local floor="${NTFY_MIN_PRIORITY:-4}"
  local digits='^[0-9]+$'
  if [[ "$priority" =~ $digits && "$floor" =~ $digits ]] && (( priority < floor )); then
    priority="$floor"
  fi
  # ${NTFY_URL%/} drops one trailing slash so the topic path has no "//".
  if ! curl --silent --fail --request POST \
    "${NTFY_URL%/}/${NTFY_TOPIC}" \
    --header "Title: $title" \
    --header "Priority: $priority" \
    --header "Sound: $sound" \
    --data "$msg" \
    >/dev/null 2>&1; then
    log "WARN: Failed to send ntfy message"
  fi
}
# ─── Send to all configured channels ─────────────────────────────────────────
#######################################
# Fan a notification out to every channel (Discord, Telegram, ntfy). Each
# sender decides for itself whether it is configured and skips otherwise.
# Arguments: $1 - title (used by ntfy only)
#            $2 - message body
#            $3 - ntfy priority, default 5; numeric values are clamped to 1-5,
#                 non-numeric values (e.g. "high") are passed through as given
# Returns:   status of the final ntfy_send call
#######################################
send_notification() {
  local title="$1"
  local msg="$2"
  local priority="${3:-5}"
  # The caller's priority used to be dropped in favour of a hard-coded "4".
  # Forward it, but keep numeric values inside the 1-5 range ntfy_send
  # documents, since callers in this file pass values as large as 10.
  # 10# forces base 10 so a value such as "08" is not read as octal.
  if [[ "$priority" =~ ^[0-9]+$ ]]; then
    if (( 10#$priority > 5 )); then
      priority=5
    elif (( 10#$priority < 1 )); then
      priority=1
    fi
  fi
  discord_send "$msg"
  telegram_send "$msg"
  ntfy_send "$title" "$msg" "$priority" "default"
}
# ─── Health check ─────────────────────────────────────────────────────────────
#######################################
# Probe the gateway health endpoint once.
# Globals:   HEALTH_URL (read)
# Outputs:   nothing; curl's stdout and stderr are discarded
# Returns:   curl's exit status. --fail makes HTTP error responses count as
#            failure and --max-time caps the whole request at 10 seconds.
#######################################
check_health() {
  local -a probe=(curl --silent --fail --max-time 10 "$HEALTH_URL")
  "${probe[@]}" &>/dev/null
}
# ─── Save last-known-good config ─────────────────────────────────────────────
#######################################
# Snapshot the current config as the last-known-good copy.
# Globals:   OPENCLAW_CONFIG_PATH (read), GOOD_CONFIG_PATH (read)
# Returns:   always 0. A missing config is a no-op; a failed snapshot is
#            logged as a warning instead of aborting the watchdog, which runs
#            this after every healthy check under errexit.
#######################################
save_good_config() {
  [[ -f "$OPENCLAW_CONFIG_PATH" ]] || return 0
  # Copy to a staging file next to the destination and rename it into place,
  # so an interrupted copy cannot leave a truncated snapshot for revert_config
  # to restore later.
  local staging="${GOOD_CONFIG_PATH}.tmp.$$"
  if cp -- "$OPENCLAW_CONFIG_PATH" "$staging" 2>/dev/null &&
    mv -f -- "$staging" "$GOOD_CONFIG_PATH" 2>/dev/null; then
    return 0
  fi
  rm -f -- "$staging" 2>/dev/null || true
  log "WARN: Failed to save last-known-good config snapshot" || true
  return 0
}
# ─── Revert to last-known-good config ────────────────────────────────────────
#######################################
# Restore the live config from the last-known-good snapshot.
# Globals:   GOOD_CONFIG_PATH (read), OPENCLAW_CONFIG_PATH (read)
# Returns:   0 when the snapshot was copied over the live config;
#            1 when there is no snapshot or the copy failed
#######################################
revert_config() {
  if [[ ! -f "$GOOD_CONFIG_PATH" ]]; then
    log "WARN: No good config snapshot available to revert"
    return 1
  fi
  log "Reverting config to last-known-good snapshot"
  # The caller runs this on the left of "&&", where errexit is suppressed, so
  # the copy must be checked explicitly; otherwise a failed cp would still
  # report success and trigger a restart on the unchanged config.
  if ! cp -- "$GOOD_CONFIG_PATH" "$OPENCLAW_CONFIG_PATH"; then
    log "ERROR: Failed to restore config from last-known-good snapshot"
    return 1
  fi
  return 0
}
# ─── Restart gateway ─────────────────────────────────────────────────────────
#######################################
# Ask the OpenClaw CLI to restart the gateway, then pause before returning.
# Outputs:   log lines via log; the CLI's own output is discarded
# Returns:   status of the final sleep. A failing restart command never aborts
#            the watchdog, but its exit status is logged.
#######################################
restart_gateway() {
  local rc=0
  log "Restarting OpenClaw gateway..."
  openclaw gateway restart >/dev/null 2>&1 || rc=$?
  # Previously "|| true" hid every failure, including a missing openclaw
  # binary (status 127), leaving no trace of why recovery never worked.
  if (( rc != 0 )); then
    log "WARN: 'openclaw gateway restart' exited with status ${rc}"
  fi
  # Fixed 5-second pause before handing control back to the main loop.
  sleep 5
}
# ─── SOS alert ────────────────────────────────────────────────────────────────
#######################################
# Send the "gateway down, manual intervention needed" alert on all channels
# and record that it was sent.
# Globals:   FAIL_THRESHOLD (read), USER / LOGNAME (read, optional),
#            alerted (written: set to true)
# Outputs:   a CRITICAL line via log, plus the notification itself
#######################################
send_sos() {
  local hostname login
  hostname="$(hostname 2>/dev/null || echo 'unknown')"
  # USER is often unset under cron or a service manager. With "set -u" a bare
  # ${USER} would abort the script at the exact moment the SOS must go out,
  # so fall back to LOGNAME, then to id, then to a placeholder.
  login="${USER:-${LOGNAME:-}}"
  if [[ -z "$login" ]]; then
    login="$(id -un 2>/dev/null || echo 'unknown')"
  fi
  local title="🚨 OpenClaw Gateway DOWN"
  local msg="🚨 **OpenClaw Gateway DOWN** on \`${hostname}\`
Watchdog tried:
1. ✅ Simple restart
2. ✅ Config rollback + restart
3. ❌ Still unreachable after ${FAIL_THRESHOLD}+ failures
**Manual intervention needed:**
\`\`\`
ssh ${login}@${hostname}
openclaw gateway status
journalctl -u openclaw-gateway --since '10 min ago'
\`\`\`"
  log "CRITICAL: Sending SOS alert"
  # 5 is the top of the 1-5 priority range ntfy_send documents; the previous
  # value of 10 was outside it.
  send_notification "$title" "$msg" 5
  alerted=true
}
# ─── Recovery notification ────────────────────────────────────────────────────
#######################################
# Announce on all channels that the gateway is reachable again.
# Outputs:   a "Gateway recovered" line via log, then the notification
# Returns:   status of send_notification
#######################################
send_recovery() {
  local host
  # Fall back to a placeholder when the hostname command is unavailable.
  host="$(hostname 2>/dev/null || echo 'unknown')"
  log "Gateway recovered"
  send_notification \
    "✅ OpenClaw Gateway Recovered" \
    "✅ **OpenClaw Gateway recovered** on \`${host}\` — back online!" \
    5
}
# ─── Main loop ────────────────────────────────────────────────────────────────
#######################################
# Watchdog loop: probe the gateway forever and escalate on repeated failures.
#   1st consecutive failure  -> restart the gateway
#   2nd consecutive failure  -> revert config, then restart if the revert worked
#   FAIL_THRESHOLD or more   -> one SOS alert (not repeated until recovery)
# A healthy probe after a failure sends a recovery notice and resets state;
# every healthy probe also refreshes the last-known-good config snapshot.
# Globals:   CHECK_INTERVAL, FAIL_THRESHOLD, HEALTH_URL and the notification
#            settings (read); fail_count, was_down, alerted (written)
# Returns:   never returns under normal operation
#######################################
main() {
  # Report a channel as enabled only when every setting its sender requires
  # is non-empty. TELEGRAM_CHAT has a non-empty default, so keying the status
  # off it alone reported Telegram as enabled even without a bot token.
  local telegram_state="disabled" ntfy_state="disabled" discord_state="disabled"
  if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT" ]]; then
    telegram_state="enabled"
  fi
  if [[ -n "$NTFY_URL" && -n "$NTFY_TOPIC" ]]; then
    ntfy_state="enabled"
  fi
  if [[ -n "$DISCORD_CHANNEL_ID" && -n "$DISCORD_BOT_TOKEN" ]]; then
    discord_state="enabled"
  fi
  log "Watchdog started (interval=${CHECK_INTERVAL}s, threshold=${FAIL_THRESHOLD}, health=${HEALTH_URL})"
  log "Notifications: Telegram=${telegram_state}, ntfy=${ntfy_state}, Discord=${discord_state}"
  while true; do
    if check_health; then
      # ── Healthy ──
      if [[ "$was_down" == true ]]; then
        send_recovery
        was_down=false
        alerted=false
      fi
      fail_count=0
      save_good_config
    else
      # ── Unhealthy ──
      fail_count=$((fail_count + 1))
      log "Health check failed (${fail_count}/${FAIL_THRESHOLD})"
      if [[ $fail_count -eq 1 ]]; then
        # Stage 1: simple restart
        was_down=true
        restart_gateway
      elif [[ $fail_count -eq 2 ]]; then
        # Stage 2: config revert + restart (restart skipped if revert fails)
        revert_config && restart_gateway
      elif [[ $fail_count -ge $FAIL_THRESHOLD && "$alerted" == false ]]; then
        # Stage 3: SOS (once)
        send_sos
      fi
    fi
    sleep "$CHECK_INTERVAL"
  done
}
# Entry point: start the watchdog loop, forwarding any command-line arguments.
main "$@"