Add openclaw-watchdog fork with Telegram + Gotify support
- Cloned jlgrimes/openclaw-watchdog and modified it
- Added telegram_send() and gotify_send() functions
- Modified send_sos() and send_recovery() for multi-channel delivery
- Updated setup.sh to configure Telegram/Gotify
- All notification channels work simultaneously
- Ready to install and run as a systemd service
This commit is contained in:
192
openclaw-watchdog/watchdog.sh
Executable file
192
openclaw-watchdog/watchdog.sh
Executable file
@@ -0,0 +1,192 @@
|
||||
#!/usr/bin/env bash
|
||||
# openclaw-watchdog — Monitor OpenClaw gateway, auto-recover from crashes
|
||||
# https://github.com/jlgrimes/openclaw-watchdog
|
||||
# MIT License — Jared Grimes
|
||||
|
||||
set -euo pipefail

# ─── Configuration (all overridable via env vars) ────────────────────────────
OPENCLAW_CONFIG_PATH="${OPENCLAW_CONFIG_PATH:-$HOME/.openclaw/config.yaml}"
HEALTH_URL="${HEALTH_URL:-http://localhost:3000/health}"
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"   # seconds between checks
FAIL_THRESHOLD="${FAIL_THRESHOLD:-3}"    # consecutive failures before escalation
WATCHDOG_LOG="${WATCHDOG_LOG:-$HOME/.openclaw/watchdog.log}"
GOOD_CONFIG_PATH="${GOOD_CONFIG_PATH:-$HOME/.openclaw/config.yaml.good}"

# Notification settings (Telegram + Gotify)
# A channel is skipped when its token is empty, so tokens have no default.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT="${TELEGRAM_CHAT:-1793951355}"
GOTIFY_URL="${GOTIFY_URL:-http://runtipi.kangaroo-eel.ts.net:8129}"
# SECURITY: the Gotify application token used to be hard-coded here as the
# default value. Credentials must not live in the script; supply GOTIFY_TOKEN
# through the environment (e.g. the service's environment file). The token
# that was previously committed should be treated as leaked and rotated.
GOTIFY_TOKEN="${GOTIFY_TOKEN:-}"

# Legacy Discord settings (optional, for backwards compat)
DISCORD_CHANNEL_ID="${DISCORD_CHANNEL_ID:-}"
DISCORD_BOT_TOKEN="${DISCORD_BOT_TOKEN:-}"

# ─── State ────────────────────────────────────────────────────────────────────
fail_count=0     # consecutive failed health checks
alerted=false    # true after SOS sent, prevents spam
was_down=false   # tracks if we're recovering
|
||||
|
||||
# ─── Logging ──────────────────────────────────────────────────────────────────
|
||||
# Write a timestamped line to stdout and append it to $WATCHDOG_LOG.
# Arguments: $* - message text
# Returns:   always 0. Logging is best-effort: with `set -e` active a failing
#            log call would otherwise terminate the whole watchdog, so a
#            missing log directory or unwritable file only loses the file copy.
log() {
  local ts line
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  line="[$ts] $*"

  printf '%s\n' "$line"

  # Create the log directory on demand, then append. stderr is silenced for
  # the whole group so a failed redirection does not spam the output.
  {
    mkdir -p -- "$(dirname -- "$WATCHDOG_LOG")" \
      && printf '%s\n' "$line" >>"$WATCHDOG_LOG"
  } 2>/dev/null || true
}
|
||||
|
||||
# ─── Discord messaging ───────────────────────────────────────────────────────
|
||||
# Post a message to the configured Discord channel (legacy, optional).
# Globals:   DISCORD_CHANNEL_ID, DISCORD_BOT_TOKEN (read)
# Arguments: $1 - message text
# Returns:   0; the channel is skipped when either setting is empty, and
#            delivery failures are logged rather than propagated.
discord_send() {
  local msg="$1"
  [[ -z "$DISCORD_CHANNEL_ID" ]] && return 0
  [[ -z "$DISCORD_BOT_TOKEN" ]] && return 0

  # Let jq build the JSON so quotes/newlines in the message are escaped, and
  # detect a jq failure instead of posting a malformed body.
  local payload
  if ! payload="$(jq -n -c --arg content "$msg" '{content: $content}' 2>/dev/null)"; then
    log "WARN: Failed to encode Discord message (is jq installed?)"
    return 0
  fi

  # --max-time keeps a stalled connection from blocking the watchdog loop.
  curl -sf --max-time 10 -X POST \
    "https://discord.com/api/v10/channels/${DISCORD_CHANNEL_ID}/messages" \
    -H "Authorization: Bot ${DISCORD_BOT_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Discord message"
}
|
||||
|
||||
# ─── Telegram messaging ──────────────────────────────────────────────────────
|
||||
# Send a message through the Telegram Bot API.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT (read)
# Arguments: $1 - message text (sent with parse_mode "Markdown" first)
# Returns:   0; the channel is skipped when either setting is empty, and
#            delivery failures are logged rather than propagated.
telegram_send() {
  local msg="$1"
  [[ -z "$TELEGRAM_BOT_TOKEN" ]] && return 0
  [[ -z "$TELEGRAM_CHAT" ]] && return 0

  local url="https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage"
  local filter payload

  # Try the formatted message first. If that request fails (for example the
  # API refuses text whose Markdown entities do not parse), retry once as
  # plain text: an unformatted alert is better than no alert.
  for filter in \
    '{chat_id: $chat_id, text: $text, parse_mode: "Markdown"}' \
    '{chat_id: $chat_id, text: $text}'; do
    # jq escapes chat id and text; skip this attempt if encoding fails.
    payload="$(jq -n -c --arg chat_id "$TELEGRAM_CHAT" --arg text "$msg" \
      "$filter" 2>/dev/null)" || continue

    # --max-time keeps a stalled connection from blocking the watchdog loop.
    if curl -sf --max-time 10 -X POST "$url" \
      -H "Content-Type: application/json" \
      -d "$payload" \
      >/dev/null 2>&1; then
      return 0
    fi
  done

  log "WARN: Failed to send Telegram message"
}
|
||||
|
||||
# ─── Gotify messaging ────────────────────────────────────────────────────────
|
||||
# Push a message to a Gotify server.
# Globals:   GOTIFY_URL, GOTIFY_TOKEN (read)
# Arguments: $1 - title
#            $2 - message body (may contain newlines, quotes, backticks)
#            $3 - priority, non-negative integer (default 5)
# Returns:   0; the channel is skipped when the token is empty, and delivery
#            failures are logged rather than propagated.
gotify_send() {
  local title="$1"
  local msg="$2"
  local priority="${3:-5}"
  [[ -z "$GOTIFY_TOKEN" ]] && return 0

  # priority is emitted as a bare JSON number; fall back to the default if
  # the caller passed something that is not one.
  [[ "$priority" =~ ^[0-9]+$ ]] || priority=5

  # Build the body with jq. Interpolating title/message straight into a JSON
  # template breaks as soon as they contain a newline or a double quote,
  # which the multi-line SOS text always does.
  local payload
  if ! payload="$(jq -n -c \
    --arg title "$title" \
    --arg message "$msg" \
    --argjson priority "$priority" \
    '{title: $title, message: $message, priority: $priority}' 2>/dev/null)"; then
    log "WARN: Failed to encode Gotify message (is jq installed?)"
    return 0
  fi

  # The token goes in the X-Gotify-Key header rather than the query string so
  # it does not end up in URL/access logs. --max-time bounds a stalled
  # connection.
  curl -sf --max-time 10 -X POST \
    "${GOTIFY_URL%/}/message" \
    -H "X-Gotify-Key: ${GOTIFY_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Gotify message"
}
|
||||
|
||||
# ─── Send to all configured channels ─────────────────────────────────────────
|
||||
# Fan a notification out to every configured channel.
# Arguments: $1 - title (used by Gotify only)
#            $2 - message body
#            $3 - Gotify priority (default 5)
# Returns:   0. Each sender is isolated with `|| true`: the script runs under
#            `set -e`, and one channel returning non-zero must not stop the
#            alert from being attempted on the remaining channels.
send_notification() {
  local title="$1"
  local msg="$2"
  local priority="${3:-5}"

  discord_send "$msg" || true
  telegram_send "$msg" || true
  gotify_send "$title" "$msg" "$priority" || true
}
|
||||
|
||||
# ─── Health check ─────────────────────────────────────────────────────────────
|
||||
# Probe $HEALTH_URL once, giving up after 10 seconds.
# Returns: curl's exit status — 0 when the endpoint answered without an HTTP
#          error (--fail), non-zero otherwise. All output is discarded.
check_health() {
  curl --silent --fail --max-time 10 "$HEALTH_URL" &>/dev/null
}
|
||||
|
||||
# ─── Save last-known-good config ─────────────────────────────────────────────
|
||||
save_good_config() {
|
||||
if [[ -f "$OPENCLAW_CONFIG_PATH" ]]; then
|
||||
cp "$OPENCLAW_CONFIG_PATH" "$GOOD_CONFIG_PATH"
|
||||
fi
|
||||
}
|
||||
|
||||
# ─── Revert to last-known-good config ────────────────────────────────────────
|
||||
# Overwrite the live config with the last-known-good snapshot.
# Globals: GOOD_CONFIG_PATH (read), OPENCLAW_CONFIG_PATH (written)
# Returns: 0 when the snapshot was copied into place,
#          1 when no snapshot exists or the copy failed.
revert_config() {
  if [[ ! -f "$GOOD_CONFIG_PATH" ]]; then
    log "WARN: No good config snapshot available to revert"
    return 1
  fi

  log "Reverting config to last-known-good snapshot"

  # Check cp explicitly: this function is invoked on the left of `&&`, where
  # `set -e` is not in effect, so an unchecked failure would fall through and
  # be reported as a successful revert.
  if ! cp -- "$GOOD_CONFIG_PATH" "$OPENCLAW_CONFIG_PATH" 2>/dev/null; then
    log "WARN: Failed to restore config from last-known-good snapshot"
    return 1
  fi
  return 0
}
|
||||
|
||||
# ─── Restart gateway ─────────────────────────────────────────────────────────
|
||||
# Ask OpenClaw to restart its gateway, then pause before the next check.
# Globals: RESTART_WAIT (read, optional) - seconds to wait after the restart
#          command; defaults to 5.
# Returns: 0. A failing restart command is logged with its exit status
#          (127 means the `openclaw` binary was not found) instead of being
#          silently discarded; the next health check decides what happens.
restart_gateway() {
  log "Restarting OpenClaw gateway..."

  local rc=0
  openclaw gateway restart >/dev/null 2>&1 || rc=$?
  if [[ "$rc" -ne 0 ]]; then
    log "WARN: 'openclaw gateway restart' exited with status ${rc}"
  fi

  sleep "${RESTART_WAIT:-5}"
}
|
||||
|
||||
# ─── SOS alert ────────────────────────────────────────────────────────────────
|
||||
# Send the "gateway is down" alert to all channels and mark it as sent.
# Globals: FAIL_THRESHOLD (read), USER (read, optional), alerted (set to true)
# The message text uses Markdown-style markup and lists the commands an
# operator should run on the host.
send_sos() {
  local hostname
  hostname="$(hostname 2>/dev/null || echo 'unknown')"

  # USER is not guaranteed to be set (e.g. when started by a service manager
  # or cron). Under `set -u` a bare ${USER} would abort the script at the very
  # moment the alert has to go out, so resolve it defensively.
  local user
  user="${USER:-$(id -un 2>/dev/null || echo 'unknown')}"

  local title="🚨 OpenClaw Gateway DOWN"
  local msg="🚨 **OpenClaw Gateway DOWN** on \`${hostname}\`

Watchdog tried:
1. ✅ Simple restart
2. ✅ Config rollback + restart
3. ❌ Still unreachable after ${FAIL_THRESHOLD}+ failures

**Manual intervention needed:**
\`\`\`
ssh ${user}@${hostname}
openclaw gateway status
journalctl -u openclaw-gateway --since '10 min ago'
\`\`\`"

  log "CRITICAL: Sending SOS alert"
  send_notification "$title" "$msg" 10
  alerted=true   # main loop uses this to send the SOS only once per outage
}
|
||||
|
||||
# ─── Recovery notification ────────────────────────────────────────────────────
|
||||
# Announce on all channels that the gateway is reachable again.
# Logs the recovery, then sends the notification with priority 5.
send_recovery() {
  local host subject body

  host="$(hostname 2>/dev/null || echo 'unknown')"
  subject="✅ OpenClaw Gateway Recovered"
  # %s keeps the host name out of the format string.
  printf -v body '✅ **OpenClaw Gateway recovered** on `%s` — back online!' "$host"

  log "Gateway recovered"
  send_notification "$subject" "$body" 5
}
|
||||
|
||||
# ─── Main loop ────────────────────────────────────────────────────────────────
|
||||
# Watchdog entry point: poll the health endpoint forever and escalate.
#   1st consecutive failure  -> restart the gateway
#   2nd consecutive failure  -> revert config, then restart if the revert worked
#   >= FAIL_THRESHOLD        -> send one SOS alert (until the next recovery)
#   healthy after a failure  -> send a recovery notice and reset state
# Globals: CHECK_INTERVAL, FAIL_THRESHOLD, HEALTH_URL and the notification
#          settings (read); fail_count, alerted, was_down (read/written).
main() {
  log "Watchdog started (interval=${CHECK_INTERVAL}s, threshold=${FAIL_THRESHOLD}, health=${HEALTH_URL})"

  # Report each channel using the same conditions the senders use to decide
  # whether to skip, so the startup line matches what will actually happen.
  local telegram_status="disabled"
  local gotify_status="disabled"
  local discord_status="disabled"
  if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT" ]]; then
    telegram_status="enabled"
  fi
  if [[ -n "$GOTIFY_TOKEN" ]]; then
    gotify_status="enabled"
  fi
  if [[ -n "$DISCORD_CHANNEL_ID" && -n "$DISCORD_BOT_TOKEN" ]]; then
    discord_status="enabled"
  fi
  log "Notifications: Telegram=${telegram_status}, Gotify=${gotify_status}, Discord=${discord_status}"

  while true; do
    if check_health; then
      # ── Healthy ──
      if [[ "$was_down" == true ]]; then
        send_recovery
        was_down=false
        alerted=false
      fi
      fail_count=0
      # Every healthy check refreshes the last-known-good snapshot.
      save_good_config
    else
      # ── Unhealthy ──
      fail_count=$((fail_count + 1))
      log "Health check failed (${fail_count}/${FAIL_THRESHOLD})"

      if [[ $fail_count -eq 1 ]]; then
        # Stage 1: simple restart
        was_down=true
        restart_gateway
      elif [[ $fail_count -eq 2 ]]; then
        # Stage 2: config revert + restart (no restart if the revert failed)
        revert_config && restart_gateway
      elif [[ $fail_count -ge $FAIL_THRESHOLD && "$alerted" == false ]]; then
        # Stage 3: SOS (once)
        send_sos
      fi
    fi

    sleep "$CHECK_INTERVAL"
  done
}
|
||||
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user