#!/usr/bin/env bash
# openclaw-watchdog — Monitor OpenClaw gateway, auto-recover from crashes
# https://github.com/jlgrimes/openclaw-watchdog
# MIT License — Jared Grimes

set -euo pipefail

# ─── Configuration (all overridable via env vars) ────────────────────────────
OPENCLAW_CONFIG_PATH="${OPENCLAW_CONFIG_PATH:-$HOME/.openclaw/config.yaml}"
HEALTH_URL="${HEALTH_URL:-http://localhost:3000/health}"
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"  # seconds between checks
FAIL_THRESHOLD="${FAIL_THRESHOLD:-3}"   # consecutive failures before escalation
WATCHDOG_LOG="${WATCHDOG_LOG:-$HOME/.openclaw/watchdog.log}"
GOOD_CONFIG_PATH="${GOOD_CONFIG_PATH:-$HOME/.openclaw/config.yaml.good}"

# FAIL_THRESHOLD is compared with an integer `-ge` test in the main loop, so a
# non-numeric value would break the escalation logic; fall back to the default.
if [[ ! "$FAIL_THRESHOLD" =~ ^[0-9]+$ ]]; then
  printf 'WARN: invalid FAIL_THRESHOLD=%s, using 3\n' "$FAIL_THRESHOLD" >&2
  FAIL_THRESHOLD=3
fi

# Notification settings (Telegram + ntfy)
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
# NOTE(review): this default is a specific chat id baked into the script;
# deployments should set TELEGRAM_CHAT explicitly.
TELEGRAM_CHAT="${TELEGRAM_CHAT:-1793951355}"
NTFY_URL="${NTFY_URL:-}"
NTFY_TOPIC="${NTFY_TOPIC:-}"
NTFY_MIN_PRIORITY="${NTFY_MIN_PRIORITY:-4}"  # floor applied to numeric ntfy priorities

# Legacy Discord settings (optional, for backwards compat)
DISCORD_CHANNEL_ID="${DISCORD_CHANNEL_ID:-}"
DISCORD_BOT_TOKEN="${DISCORD_BOT_TOKEN:-}"

# ─── State ────────────────────────────────────────────────────────────────────
fail_count=0
alerted=false   # true after SOS sent, prevents spam
was_down=false  # tracks if we're recovering

# ─── Logging ──────────────────────────────────────────────────────────────────
# Print a timestamped message to stdout and append the same line to
# $WATCHDOG_LOG.
# Globals:   WATCHDOG_LOG (read)
# Arguments: $* - message text
# Returns:   always 0. Under `set -euo pipefail` a failed write to the log file
#            would otherwise be able to terminate the watchdog itself.
log() {
  local ts line
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  line="[$ts] $*"
  printf '%s\n' "$line"

  # Create the log directory on first use; ignore failure and let the append
  # below decide whether the file is writable.
  mkdir -p -- "$(dirname -- "$WATCHDOG_LOG")" 2>/dev/null || true
  { printf '%s\n' "$line" >>"$WATCHDOG_LOG"; } 2>/dev/null || true
}

# ─── Discord messaging ───────────────────────────────────────────────────────
# Post a message to a Discord channel through the bot API.
# Globals:   DISCORD_CHANNEL_ID, DISCORD_BOT_TOKEN (read)
# Arguments: $1 - message text
# Returns:   0 when Discord is not configured; otherwise failures are logged
#            as warnings rather than propagated.
discord_send() {
  local msg="$1"
  local payload
  [[ -z "$DISCORD_CHANNEL_ID" ]] && return 0
  [[ -z "$DISCORD_BOT_TOKEN" ]] && return 0

  # Let jq build the whole JSON document so the text is escaped correctly and a
  # missing/failing jq is detected instead of sending a malformed body.
  if ! payload="$(jq -n --arg content "$msg" '{content: $content}' 2>/dev/null)"; then
    log "WARN: Failed to build Discord payload (is jq installed?)"
    return 0
  fi

  # --max-time keeps a hung request from stalling the monitoring loop.
  curl -sf --max-time 15 -X POST \
    "https://discord.com/api/v10/channels/${DISCORD_CHANNEL_ID}/messages" \
    -H "Authorization: Bot ${DISCORD_BOT_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Discord message"
}

# ─── Telegram messaging ──────────────────────────────────────────────────────
# Send a message to a Telegram chat via the Bot API sendMessage method, using
# parse_mode "Markdown".
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT (read)
# Arguments: $1 - message text
# Returns:   0 when Telegram is not configured; otherwise failures are logged
#            as warnings rather than propagated.
telegram_send() {
  local msg="$1"
  local payload
  [[ -z "$TELEGRAM_BOT_TOKEN" ]] && return 0
  [[ -z "$TELEGRAM_CHAT" ]] && return 0

  # Build the body with jq so both the chat id and the text are JSON-escaped
  # (the chat id used to be spliced into the JSON unescaped).
  if ! payload="$(jq -n --arg chat_id "$TELEGRAM_CHAT" --arg text "$msg" \
    '{chat_id: $chat_id, text: $text, parse_mode: "Markdown"}' 2>/dev/null)"; then
    log "WARN: Failed to build Telegram payload (is jq installed?)"
    return 0
  fi

  # --max-time keeps a hung request from stalling the monitoring loop.
  curl -sf --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Telegram message"
}

# ─── ntfy messaging ─────────────────────────────────────────────────────────
# Publish a message to an ntfy topic.
# Globals:   NTFY_URL, NTFY_TOPIC, NTFY_MIN_PRIORITY (read)
# Arguments: $1 - title
#            $2 - message body
#            $3 - priority, 1-5 or a name such as low|default|high|urgent
#                 (default 4)
#            $4 - value for the "Sound" header (default "default")
# Returns:   0 when ntfy is not configured; otherwise failures are logged as
#            warnings rather than propagated.
ntfy_send() {
  local title="$1"
  local msg="$2"
  local priority="${3:-4}"
  local sound="${4:-default}"
  local minp="${NTFY_MIN_PRIORITY:-4}"

  [[ -z "$NTFY_URL" ]] && return 0
  [[ -z "$NTFY_TOPIC" ]] && return 0

  # Numeric priorities are raised to the configured floor and then clamped to
  # the 1..5 range ntfy accepts. `10#` forces base 10 so values such as "08"
  # are not rejected as invalid octal. Named priorities pass through untouched.
  if [[ "$priority" =~ ^[0-9]+$ ]]; then
    priority=$((10#$priority))
    if [[ "$minp" =~ ^[0-9]+$ ]] && ((priority < 10#$minp)); then
      priority=$((10#$minp))
    fi
    if ((priority > 5)); then
      priority=5
    elif ((priority < 1)); then
      priority=1
    fi
  fi

  # --max-time keeps a hung request from stalling the monitoring loop.
  curl -sf --max-time 15 -X POST \
    "${NTFY_URL%/}/${NTFY_TOPIC}" \
    -H "Title: $title" \
    -H "Priority: $priority" \
    -H "Sound: $sound" \
    -d "$msg" \
    >/dev/null 2>&1 || log "WARN: Failed to send ntfy message"
}

# ─── Send to all configured channels ─────────────────────────────────────────
# Fan a notification out to Discord, Telegram and ntfy.
# Arguments: $1 - title (used by ntfy only)
#            $2 - message body
#            $3 - ntfy priority (default 5); numeric values are clamped to 1..5
# Returns:   the status of the ntfy sender (the last channel tried).
send_notification() {
  local title="$1"
  local msg="$2"
  local priority="${3:-5}"

  # The priority argument used to be accepted and then ignored in favour of a
  # hard-coded "4". Honour it, but keep numeric values inside ntfy's 1..5 range
  # because callers pass values such as 10.
  if [[ "$priority" =~ ^[0-9]+$ ]]; then
    priority=$((10#$priority))
    if ((priority > 5)); then
      priority=5
    elif ((priority < 1)); then
      priority=1
    fi
  fi

  # A failure on one channel must not stop delivery on the others.
  discord_send "$msg" || true
  telegram_send "$msg" || true
  ntfy_send "$title" "$msg" "$priority" "default"
}

# ─── Health check ─────────────────────────────────────────────────────────────
# Probe the gateway health endpoint.
# Globals:   HEALTH_URL (read)
# Returns:   curl's exit status. With -f, HTTP error responses count as
#            failures; the request is limited to 10 seconds.
check_health() {
  curl -sf --max-time 10 "$HEALTH_URL" >/dev/null 2>&1
}

# ─── Save last-known-good config ─────────────────────────────────────────────
# Snapshot the current config file so it can be restored later.
# Globals:   OPENCLAW_CONFIG_PATH, GOOD_CONFIG_PATH (read)
# Returns:   always 0. A failed snapshot is logged as a warning, since under
#            `set -e` a failing cp here would otherwise stop the watchdog.
save_good_config() {
  [[ -f "$OPENCLAW_CONFIG_PATH" ]] || return 0

  # Nothing to do when the snapshot already matches; avoids rewriting the file
  # on every healthy check.
  if cmp -s -- "$OPENCLAW_CONFIG_PATH" "$GOOD_CONFIG_PATH"; then
    return 0
  fi

  # Copy to a temp name in the same directory and rename into place, so an
  # interrupted copy cannot leave a truncated snapshot behind.
  local tmp="${GOOD_CONFIG_PATH}.tmp.$$"
  if cp -- "$OPENCLAW_CONFIG_PATH" "$tmp" 2>/dev/null \
    && mv -f -- "$tmp" "$GOOD_CONFIG_PATH" 2>/dev/null; then
    return 0
  fi

  rm -f -- "$tmp" 2>/dev/null || true
  log "WARN: Failed to save last-known-good config snapshot"
  return 0
}

# ─── Revert to last-known-good config ────────────────────────────────────────
# Restore the config file from the last-known-good snapshot.
# Globals:   GOOD_CONFIG_PATH, OPENCLAW_CONFIG_PATH (read)
# Returns:   0 if the snapshot was copied over the config,
#            1 if there is no snapshot or the copy failed.
revert_config() {
  if [[ ! -f "$GOOD_CONFIG_PATH" ]]; then
    log "WARN: No good config snapshot available to revert"
    return 1
  fi

  log "Reverting config to last-known-good snapshot"
  # Check cp explicitly: this function is called on the left of `&&`, where
  # `set -e` is suppressed, so a failed copy used to fall through to `return 0`.
  if ! cp -- "$GOOD_CONFIG_PATH" "$OPENCLAW_CONFIG_PATH"; then
    log "WARN: Failed to restore config from snapshot"
    return 1
  fi
  return 0
}

# ─── Restart gateway ─────────────────────────────────────────────────────────
# Run `openclaw gateway restart`, then pause before the caller continues.
# Globals:   RESTART_WAIT (read, optional) - seconds to pause, default 5
# Returns:   the status of the final sleep; a failed restart command is logged
#            as a warning rather than treated as fatal.
restart_gateway() {
  log "Restarting OpenClaw gateway..."
  # Best-effort, but no longer silent: record when the restart command itself
  # fails so the log explains why a later health check still fails.
  if ! openclaw gateway restart >/dev/null 2>&1; then
    log "WARN: 'openclaw gateway restart' failed"
  fi
  sleep "${RESTART_WAIT:-5}"
}

# ─── SOS alert ────────────────────────────────────────────────────────────────
# Send the "gateway down" alert to every configured channel and mark the
# incident as alerted.
# Globals:   FAIL_THRESHOLD, USER (read); alerted (written)
# Returns:   0
send_sos() {
  local hostname user
  hostname="$(hostname 2>/dev/null || echo 'unknown')"
  # USER can be unset (for example when not started from a login shell); with
  # `set -u` a bare ${USER} would abort the script right here.
  user="${USER:-$(id -un 2>/dev/null || echo 'unknown')}"
  local title="🚨 OpenClaw Gateway DOWN"
  local msg="🚨 **OpenClaw Gateway DOWN** on \`${hostname}\`

Watchdog tried:
1. ✅ Simple restart
2. ✅ Config rollback + restart
3. ❌ Still unreachable after ${FAIL_THRESHOLD}+ failures

**Manual intervention needed:**
\`\`\`
ssh ${user}@${hostname}
openclaw gateway status
journalctl -u openclaw-gateway --since '10 min ago'
\`\`\`"

  log "CRITICAL: Sending SOS alert"
  # 5 is the highest priority ntfy accepts (this used to pass 10).
  send_notification "$title" "$msg" 5 || log "WARN: SOS notification failed"
  alerted=true
}

# ─── Recovery notification ────────────────────────────────────────────────────
# Log that the gateway is back and notify every configured channel.
# Returns:   the status of send_notification.
send_recovery() {
  local hostname
  hostname="$(hostname 2>/dev/null || echo 'unknown')"
  local title="✅ OpenClaw Gateway Recovered"
  local msg="✅ **OpenClaw Gateway recovered** on \`${hostname}\` — back online!"
  log "Gateway recovered"
  send_notification "$title" "$msg" 5
}

# ─── Main loop ────────────────────────────────────────────────────────────────
# Poll the health endpoint forever and escalate on consecutive failures:
#   1st failure             → restart the gateway
#   2nd failure             → revert config to the snapshot, then restart
#   ≥ FAIL_THRESHOLD fails  → send one SOS alert (the first two failures are
#                             always handled by the stages above, so SOS is
#                             never sent before the 3rd failure)
# On the first healthy check after an outage a recovery notice is sent and the
# alert state is reset; every healthy check also refreshes the config snapshot.
# Globals:   CHECK_INTERVAL, FAIL_THRESHOLD, HEALTH_URL and the notification
#            settings (read); fail_count, alerted, was_down (written)
# Returns:   never returns under normal operation.
main() {
  # Report a channel as enabled only when the settings its sender checks are
  # all present. The old `${VAR:+enabled}` form always showed Telegram as
  # enabled (TELEGRAM_CHAT has a non-empty default) and printed nothing at all
  # for disabled channels.
  local tg_state="disabled" ntfy_state="disabled" discord_state="disabled"
  if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT" ]]; then
    tg_state="enabled"
  fi
  if [[ -n "$NTFY_URL" && -n "$NTFY_TOPIC" ]]; then
    ntfy_state="enabled"
  fi
  if [[ -n "$DISCORD_CHANNEL_ID" && -n "$DISCORD_BOT_TOKEN" ]]; then
    discord_state="enabled"
  fi

  log "Watchdog started (interval=${CHECK_INTERVAL}s, threshold=${FAIL_THRESHOLD}, health=${HEALTH_URL})"
  log "Notifications: Telegram=${tg_state}, ntfy=${ntfy_state}, Discord=${discord_state}"

  # Helper calls below are guarded with `||` so that, under `set -e`, a failing
  # helper is logged instead of terminating the watchdog.
  while true; do
    if check_health; then
      # ── Healthy ──
      if [[ "$was_down" == true ]]; then
        send_recovery || log "WARN: Recovery notification failed"
        was_down=false
        alerted=false
      fi
      fail_count=0
      save_good_config || log "WARN: Could not snapshot current config"
    else
      # ── Unhealthy ──
      fail_count=$((fail_count + 1))
      log "Health check failed (${fail_count}/${FAIL_THRESHOLD})"

      if [[ $fail_count -eq 1 ]]; then
        # Stage 1: simple restart
        was_down=true
        restart_gateway || log "WARN: Gateway restart step failed"
      elif [[ $fail_count -eq 2 ]]; then
        # Stage 2: config revert + restart (no restart if the revert failed)
        if revert_config; then
          restart_gateway || log "WARN: Gateway restart step failed"
        fi
      elif [[ $fail_count -ge $FAIL_THRESHOLD && "$alerted" == false ]]; then
        # Stage 3: SOS (once)
        send_sos || log "WARN: SOS alert step failed"
      fi
    fi

    sleep "$CHECK_INTERVAL"
  done
}

main "$@"