#!/usr/bin/env bash
# openclaw-watchdog — Monitor OpenClaw gateway, auto-recover from crashes
# https://github.com/jlgrimes/openclaw-watchdog
# MIT License — Jared Grimes

set -euo pipefail

# ─── Configuration (all overridable via env vars) ────────────────────────────
OPENCLAW_CONFIG_PATH="${OPENCLAW_CONFIG_PATH:-$HOME/.openclaw/config.yaml}"
HEALTH_URL="${HEALTH_URL:-http://localhost:3000/health}"
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"   # seconds between checks
FAIL_THRESHOLD="${FAIL_THRESHOLD:-3}"    # consecutive failures before escalation
WATCHDOG_LOG="${WATCHDOG_LOG:-$HOME/.openclaw/watchdog.log}"
GOOD_CONFIG_PATH="${GOOD_CONFIG_PATH:-$HOME/.openclaw/config.yaml.good}"

# Notification settings (Telegram + ntfy)
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT="${TELEGRAM_CHAT:-1793951355}"
NTFY_URL="${NTFY_URL:-}"
NTFY_TOPIC="${NTFY_TOPIC:-}"

# Legacy Discord settings (optional, for backwards compat)
DISCORD_CHANNEL_ID="${DISCORD_CHANNEL_ID:-}"
DISCORD_BOT_TOKEN="${DISCORD_BOT_TOKEN:-}"

# ─── State ───────────────────────────────────────────────────────────────────
fail_count=0
alerted=false    # true after SOS sent, prevents spam
was_down=false   # tracks if we're recovering

# ─── Logging ─────────────────────────────────────────────────────────────────
#######################################
# Print a timestamped line to stdout and append it to the log file.
# Globals:   WATCHDOG_LOG (read)
# Arguments: $* - message text
# Outputs:   the formatted line on stdout
# Returns:   always 0; a log file that cannot be written must never take the
#            watchdog down (under `set -e` a failing logger would be fatal).
#######################################
log() {
  local ts line
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  line="[$ts] $*"
  printf '%s\n' "$line"
  # Create the log directory on first use; ignore failures (best-effort).
  mkdir -p -- "$(dirname -- "$WATCHDOG_LOG")" 2>/dev/null || true
  # stderr is silenced first so a failing open of the log file stays quiet.
  printf '%s\n' "$line" 2>/dev/null >>"$WATCHDOG_LOG" || true
}

# ─── Discord messaging ───────────────────────────────────────────────────────
#######################################
# Post a message to the configured Discord channel (no-op when unconfigured).
# Globals:   DISCORD_CHANNEL_ID, DISCORD_BOT_TOKEN (read)
# Arguments: $1 - message text
# Returns:   0; delivery failures are logged as warnings.
#######################################
discord_send() {
  local msg="$1"
  local payload

  if [[ -z "$DISCORD_CHANNEL_ID" || -z "$DISCORD_BOT_TOKEN" ]]; then
    return 0
  fi

  # Let jq build the JSON so the message is always escaped correctly.
  if ! payload="$(jq -cn --arg content "$msg" '{content: $content}' 2>/dev/null)"; then
    log "WARN: Failed to send Discord message"
    return 0
  fi

  # --max-time keeps a hung endpoint from stalling the monitor loop.
  curl -sf --max-time 10 -X POST \
    "https://discord.com/api/v10/channels/${DISCORD_CHANNEL_ID}/messages" \
    -H "Authorization: Bot ${DISCORD_BOT_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Discord message"
}

# ─── Telegram messaging ──────────────────────────────────────────────────────
#######################################
# Send a Markdown message via the Telegram Bot API (no-op when unconfigured).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT (read)
# Arguments: $1 - message text
# Returns:   0; delivery failures are logged as warnings.
#######################################
telegram_send() {
  local msg="$1"
  local payload

  if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT" ]]; then
    return 0
  fi

  if ! payload="$(jq -cn --arg chat_id "$TELEGRAM_CHAT" --arg text "$msg" \
    '{chat_id: $chat_id, text: $text, parse_mode: "Markdown"}' 2>/dev/null)"; then
    log "WARN: Failed to send Telegram message"
    return 0
  fi

  curl -sf --max-time 10 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Telegram message"
}

# ─── ntfy messaging ──────────────────────────────────────────────────────────
#######################################
# Publish a notification to an ntfy topic (no-op when unconfigured).
# Globals:   NTFY_URL, NTFY_TOPIC, NTFY_MIN_PRIORITY (read)
# Arguments: $1 - title
#            $2 - message body
#            $3 - priority, 1-5 or low|default|high|urgent (default 4)
#            $4 - value for the "Sound" header (default "default")
# Returns:   0; delivery failures are logged as warnings.
#######################################
ntfy_send() {
  local title="$1"
  local msg="$2"
  local priority="${3:-4}"
  local sound="${4:-default}"
  local minp="${NTFY_MIN_PRIORITY:-4}"

  if [[ -z "$NTFY_URL" || -z "$NTFY_TOPIC" ]]; then
    return 0
  fi

  # Numeric priorities are raised to the configured minimum and then clamped
  # to ntfy's valid 1-5 range; named priorities are passed through untouched.
  if [[ "$priority" =~ ^[0-9]+$ ]]; then
    priority=$((10#$priority))   # force base 10 so "08" is not read as octal
    if [[ "$minp" =~ ^[0-9]+$ ]] && (( priority < 10#$minp )); then
      priority=$((10#$minp))
    fi
    if (( priority > 5 )); then
      priority=5
    fi
    if (( priority < 1 )); then
      priority=1
    fi
  fi

  curl -sf --max-time 10 -X POST \
    "${NTFY_URL%/}/${NTFY_TOPIC}" \
    -H "Title: $title" \
    -H "Priority: $priority" \
    -H "Sound: $sound" \
    -d "$msg" \
    >/dev/null 2>&1 || log "WARN: Failed to send ntfy message"
}

# ─── Send to all configured channels ─────────────────────────────────────────
#######################################
# Fan a notification out to Discord, Telegram and ntfy.
# Arguments: $1 - title (used by ntfy only)
#            $2 - message body
#            $3 - ntfy priority (default 5)
#######################################
send_notification() {
  local title="$1"
  local msg="$2"
  local priority="${3:-5}"

  discord_send "$msg"
  telegram_send "$msg"
  # Forward the caller's priority instead of a fixed value.
  ntfy_send "$title" "$msg" "$priority" "default"
}

# ─── Health check ────────────────────────────────────────────────────────────
#######################################
# Probe the gateway health endpoint.
# Globals:   HEALTH_URL (read)
# Returns:   curl's exit status: 0 when the request succeeds within 10 seconds
#            (-f turns HTTP error responses into failures), non-zero otherwise.
#######################################
check_health() {
  curl -sf --max-time 10 "$HEALTH_URL" >/dev/null 2>&1
}

# ─── Save last-known-good config ─────────────────────────────────────────────
#######################################
# Snapshot the current config as the last-known-good copy.
# Globals:   OPENCLAW_CONFIG_PATH, GOOD_CONFIG_PATH (read)
# Returns:   0; a failed copy is logged rather than aborting the watchdog.
#######################################
save_good_config() {
  if [[ ! -f "$OPENCLAW_CONFIG_PATH" ]]; then
    return 0
  fi
  cp -- "$OPENCLAW_CONFIG_PATH" "$GOOD_CONFIG_PATH" \
    || log "WARN: Failed to snapshot config to $GOOD_CONFIG_PATH"
}

# ─── Revert to last-known-good config ────────────────────────────────────────
#######################################
# Restore the config from the last-known-good snapshot.
# Globals:   OPENCLAW_CONFIG_PATH, GOOD_CONFIG_PATH (read)
# Returns:   0 if the snapshot was restored; 1 if there is no snapshot or the
#            copy failed.
#######################################
revert_config() {
  if [[ ! -f "$GOOD_CONFIG_PATH" ]]; then
    log "WARN: No good config snapshot available to revert"
    return 1
  fi

  log "Reverting config to last-known-good snapshot"
  # Checked explicitly: callers invoke this in an && list, where `set -e`
  # would not stop on a failing cp and success would be reported wrongly.
  if ! cp -- "$GOOD_CONFIG_PATH" "$OPENCLAW_CONFIG_PATH"; then
    log "WARN: Failed to restore config from $GOOD_CONFIG_PATH"
    return 1
  fi
  return 0
}
# ─── Restart gateway ─────────────────────────────────────────────────────────
#######################################
# Ask the OpenClaw CLI to restart the gateway, then wait 5 seconds before
# returning.
# Returns:   0; a failed restart is only logged, because the next health
#            check decides whether recovery worked.
#######################################
restart_gateway() {
  log "Restarting OpenClaw gateway..."
  openclaw gateway restart >/dev/null 2>&1 \
    || log "WARN: 'openclaw gateway restart' exited non-zero"
  sleep 5
}

# ─── SOS alert ───────────────────────────────────────────────────────────────
#######################################
# Send the "gateway down" alert to every channel and mark it as sent.
# Globals:   FAIL_THRESHOLD, USER (read); alerted (written)
#######################################
send_sos() {
  local host user
  host="$(hostname 2>/dev/null || echo 'unknown')"
  # USER is frequently unset under systemd/cron; with `set -u` a bare ${USER}
  # would abort the script at the moment the alert is needed.
  user="${USER:-}"
  if [[ -z "$user" ]]; then
    user="$(id -un 2>/dev/null || echo 'unknown')"
  fi

  local title="🚨 OpenClaw Gateway DOWN"
  local msg="🚨 **OpenClaw Gateway DOWN** on \`${host}\`

Watchdog tried:
1. ✅ Simple restart
2. ✅ Config rollback + restart
3. ❌ Still unreachable after ${FAIL_THRESHOLD}+ failures

**Manual intervention needed:**
\`\`\`
ssh ${user}@${host}
openclaw gateway status
journalctl -u openclaw-gateway --since '10 min ago'
\`\`\`"

  log "CRITICAL: Sending SOS alert"
  # 5 is the highest priority ntfy accepts.
  send_notification "$title" "$msg" 5
  alerted=true
}

# ─── Recovery notification ───────────────────────────────────────────────────
#######################################
# Announce that the gateway is reachable again.
#######################################
send_recovery() {
  local host
  host="$(hostname 2>/dev/null || echo 'unknown')"

  local title="✅ OpenClaw Gateway Recovered"
  local msg="✅ **OpenClaw Gateway recovered** on \`${host}\` — back online!"

  log "Gateway recovered"
  send_notification "$title" "$msg" 5
}

# ─── Main loop ───────────────────────────────────────────────────────────────
#######################################
# Poll the health endpoint forever and escalate on consecutive failures:
#   1st failure      -> restart the gateway
#   2nd failure      -> revert config, then restart
#   >= threshold     -> send one SOS alert
# Globals:   CHECK_INTERVAL, FAIL_THRESHOLD, HEALTH_URL and the notification
#            settings (read); fail_count, was_down, alerted (written)
# Returns:   1 if FAIL_THRESHOLD is not a non-negative integer; otherwise
#            never returns.
#######################################
main() {
  if [[ ! "$FAIL_THRESHOLD" =~ ^[0-9]+$ ]]; then
    log "ERROR: FAIL_THRESHOLD must be a non-negative integer (got '${FAIL_THRESHOLD}')"
    return 1
  fi

  # A channel counts as enabled only when every setting it needs is present.
  local telegram_state="disabled"
  local ntfy_state="disabled"
  local discord_state="disabled"
  if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT" ]]; then
    telegram_state="enabled"
  fi
  if [[ -n "$NTFY_URL" && -n "$NTFY_TOPIC" ]]; then
    ntfy_state="enabled"
  fi
  if [[ -n "$DISCORD_CHANNEL_ID" && -n "$DISCORD_BOT_TOKEN" ]]; then
    discord_state="enabled"
  fi

  log "Watchdog started (interval=${CHECK_INTERVAL}s, threshold=${FAIL_THRESHOLD}, health=${HEALTH_URL})"
  log "Notifications: Telegram=${telegram_state}, ntfy=${ntfy_state}, Discord=${discord_state}"

  while true; do
    if check_health; then
      # ── Healthy ──
      if [[ "$was_down" == true ]]; then
        send_recovery
        was_down=false
        alerted=false
      fi
      fail_count=0
      save_good_config
    else
      # ── Unhealthy ──
      fail_count=$((fail_count + 1))
      log "Health check failed (${fail_count}/${FAIL_THRESHOLD})"

      if (( fail_count == 1 )); then
        # Stage 1: simple restart
        was_down=true
        restart_gateway
      elif (( fail_count == 2 )); then
        # Stage 2: config revert + restart (restart only if the revert worked)
        if revert_config; then
          restart_gateway
        fi
      elif (( fail_count >= 10#$FAIL_THRESHOLD )) && [[ "$alerted" == false ]]; then
        # Stage 3: SOS (once)
        send_sos
      fi
    fi

    sleep "$CHECK_INTERVAL"
  done
}

main "$@"