Add openclaw-watchdog fork with Telegram + Gotify support

- Cloned jlgrimes/openclaw-watchdog and modified
- Added telegram_send() and gotify_send() functions
- Modified send_sos() and send_recovery() for multi-channel
- Updated setup.sh to configure Telegram/Gotify
- All notification channels work simultaneously
- Ready to install and run as systemd service
This commit is contained in:
Krilly
2026-02-21 02:06:12 +00:00
parent 180532d1e3
commit d55ca207d2
5 changed files with 505 additions and 0 deletions

192
openclaw-watchdog/watchdog.sh Executable file
View File

@@ -0,0 +1,192 @@
#!/usr/bin/env bash
# openclaw-watchdog — Monitor OpenClaw gateway, auto-recover from crashes
# https://github.com/jlgrimes/openclaw-watchdog
# MIT License — Jared Grimes
# Strict mode: exit on unhandled errors, on use of unset variables, and when
# any stage of a pipeline fails.
set -euo pipefail
# ─── Configuration (all overridable via env vars) ────────────────────────────
OPENCLAW_CONFIG_PATH="${OPENCLAW_CONFIG_PATH:-$HOME/.openclaw/config.yaml}"
HEALTH_URL="${HEALTH_URL:-http://localhost:3000/health}"
CHECK_INTERVAL="${CHECK_INTERVAL:-30}" # seconds between checks
FAIL_THRESHOLD="${FAIL_THRESHOLD:-3}" # consecutive failures before escalation
WATCHDOG_LOG="${WATCHDOG_LOG:-$HOME/.openclaw/watchdog.log}"
GOOD_CONFIG_PATH="${GOOD_CONFIG_PATH:-$HOME/.openclaw/config.yaml.good}"
# Notification settings (Telegram + Gotify)
# Credentials never get a built-in default: a token committed to the script is
# readable by everyone with access to the repository. Supply the tokens through
# the environment (e.g. a systemd EnvironmentFile). A channel whose token is
# empty is skipped by its sender function.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT="${TELEGRAM_CHAT:-1793951355}"
GOTIFY_URL="${GOTIFY_URL:-http://runtipi.kangaroo-eel.ts.net:8129}"
# NOTE(review): a real application token used to be hard-coded here as the
# default; treat it as leaked and rotate it on the Gotify server.
GOTIFY_TOKEN="${GOTIFY_TOKEN:-}"
# Legacy Discord settings (optional, for backwards compat)
DISCORD_CHANNEL_ID="${DISCORD_CHANNEL_ID:-}"
DISCORD_BOT_TOKEN="${DISCORD_BOT_TOKEN:-}"
# ─── State ────────────────────────────────────────────────────────────────────
fail_count=0 # consecutive failed health checks
alerted=false # true after SOS sent, prevents spam
was_down=false # tracks if we're recovering
# ─── Logging ──────────────────────────────────────────────────────────────────
#######################################
# Write a timestamped message to stdout and append it to the watchdog log.
# Globals:   WATCHDOG_LOG (read) - path of the log file
# Arguments: $* - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in WATCHDOG_LOG
# Returns:   always 0; logging is best-effort and must never terminate the
#            caller, which would otherwise happen under `set -e` / `pipefail`
#            whenever the log file cannot be written.
#######################################
log() {
  local ts line log_dir
  ts="$(date '+%Y-%m-%d %H:%M:%S')"
  line="[$ts] $*"

  # Create the log directory on first use so a fresh install does not fail.
  log_dir="$(dirname -- "$WATCHDOG_LOG")" || log_dir="."
  if [[ ! -d "$log_dir" ]]; then
    mkdir -p -- "$log_dir" 2>/dev/null || true
  fi

  # tee still prints to stdout when the file cannot be opened; its non-zero
  # status is deliberately ignored so that a broken log file is not fatal.
  printf '%s\n' "$line" | tee -a -- "$WATCHDOG_LOG" || true
}
# ─── Discord messaging ───────────────────────────────────────────────────────
#######################################
# Post a message to a Discord channel through the bot API (legacy channel).
# Globals:   DISCORD_CHANNEL_ID, DISCORD_BOT_TOKEN (read)
#            NOTIFY_TIMEOUT (read, optional) - curl timeout in seconds,
#            defaults to 10
# Arguments: $1 - message text
# Returns:   0 when the channel is not configured or the message was sent;
#            on failure a warning is logged instead of aborting.
#######################################
discord_send() {
  local msg="$1"
  local payload

  # Channel is optional: silently skip when it is not configured.
  [[ -z "$DISCORD_CHANNEL_ID" ]] && return 0
  [[ -z "$DISCORD_BOT_TOKEN" ]] && return 0

  # Let jq build the whole document so the text is always valid JSON.
  if ! payload="$(jq -cn --arg content "$msg" '{content: $content}' 2>/dev/null)"; then
    log "WARN: Failed to encode Discord message"
    return 0
  fi

  # --max-time keeps a stalled connection from blocking the watchdog loop.
  curl -sf --max-time "${NOTIFY_TIMEOUT:-10}" -X POST \
    "https://discord.com/api/v10/channels/${DISCORD_CHANNEL_ID}/messages" \
    -H "Authorization: Bot ${DISCORD_BOT_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Discord message"
}
# ─── Telegram messaging ──────────────────────────────────────────────────────
#######################################
# Send a message to a Telegram chat through the Bot API sendMessage method.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT (read)
#            NOTIFY_TIMEOUT (read, optional) - curl timeout in seconds,
#            defaults to 10
# Arguments: $1 - message text (sent with parse_mode "Markdown")
# Returns:   0 when the channel is not configured or the message was sent;
#            on failure a warning is logged instead of aborting.
#######################################
telegram_send() {
  local msg="$1"
  local payload

  # Channel is optional: silently skip when it is not configured.
  [[ -z "$TELEGRAM_BOT_TOKEN" ]] && return 0
  [[ -z "$TELEGRAM_CHAT" ]] && return 0

  # Build the whole document with jq so both the chat id and the text are
  # escaped correctly. chat_id stays a JSON string, as before.
  if ! payload="$(jq -cn --arg chat "$TELEGRAM_CHAT" --arg text "$msg" \
    '{chat_id: $chat, text: $text, parse_mode: "Markdown"}' 2>/dev/null)"; then
    log "WARN: Failed to encode Telegram message"
    return 0
  fi

  # --max-time keeps a stalled connection from blocking the watchdog loop.
  curl -sf --max-time "${NOTIFY_TIMEOUT:-10}" -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Telegram message"
}
# ─── Gotify messaging ────────────────────────────────────────────────────────
#######################################
# Push a message to a Gotify server.
# Globals:   GOTIFY_URL, GOTIFY_TOKEN (read)
#            NOTIFY_TIMEOUT (read, optional) - curl timeout in seconds,
#            defaults to 10
# Arguments: $1 - title
#            $2 - message body (may span several lines)
#            $3 - priority, non-negative integer (optional, default 5)
# Returns:   0 when the channel is not configured or the message was sent;
#            on failure a warning is logged instead of aborting.
#######################################
gotify_send() {
  local title="$1"
  local msg="$2"
  local priority="${3:-5}"
  local payload

  # Channel is optional: silently skip when it is not configured.
  [[ -z "$GOTIFY_TOKEN" ]] && return 0
  [[ -z "$GOTIFY_URL" ]] && return 0

  # The priority is emitted as a JSON number; fall back to the default rather
  # than produce an invalid document.
  [[ "$priority" =~ ^[0-9]+$ ]] || priority=5

  # Title and body must be JSON-escaped: alert texts contain newlines, quotes
  # and backticks, which are not valid inside a raw JSON string literal.
  if ! payload="$(jq -cn \
    --arg title "$title" \
    --arg message "$msg" \
    --arg priority "$priority" \
    '{title: $title, message: $message, priority: ($priority | tonumber)}' \
    2>/dev/null)"; then
    log "WARN: Failed to encode Gotify message"
    return 0
  fi

  # --max-time keeps a stalled connection from blocking the watchdog loop.
  curl -sf --max-time "${NOTIFY_TIMEOUT:-10}" -X POST \
    "${GOTIFY_URL}/message?token=${GOTIFY_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    >/dev/null 2>&1 || log "WARN: Failed to send Gotify message"
}
# ─── Send to all configured channels ─────────────────────────────────────────
#######################################
# Fan a notification out to every notification channel.
# Each sender decides for itself whether its channel is configured.
# Arguments: $1 - title (used by Gotify only)
#            $2 - message body (sent to all channels)
#            $3 - Gotify priority (optional, default 5)
# Returns:   always 0
#######################################
send_notification() {
  local title="$1"
  local msg="$2"
  local priority="${3:-5}"

  # Channels are independent: under `set -e` a non-zero return from one sender
  # would otherwise abort the script before the remaining channels are tried.
  discord_send "$msg" || true
  telegram_send "$msg" || true
  gotify_send "$title" "$msg" "$priority" || true
  return 0
}
# ─── Health check ─────────────────────────────────────────────────────────────
#######################################
# Probe the gateway health endpoint once.
# Globals:   HEALTH_URL (read)
# Outputs:   nothing; curl's output is discarded
# Returns:   curl's exit status: 0 when the request succeeded, non-zero on
#            connection errors, HTTP error responses (--fail) or when the
#            request exceeds 10 seconds
#######################################
check_health() {
  curl --silent --fail --max-time 10 "$HEALTH_URL" &>/dev/null
}
# ─── Save last-known-good config ─────────────────────────────────────────────
#######################################
# Snapshot the current config file as the last-known-good copy.
# Globals:   OPENCLAW_CONFIG_PATH (read), GOOD_CONFIG_PATH (read)
# Returns:   0 when there is no config file to snapshot, otherwise the exit
#            status of cp
#######################################
save_good_config() {
  # Nothing to snapshot yet.
  [[ -f "$OPENCLAW_CONFIG_PATH" ]] || return 0

  cp "$OPENCLAW_CONFIG_PATH" "$GOOD_CONFIG_PATH"
}
# ─── Revert to last-known-good config ────────────────────────────────────────
#######################################
# Restore the config file from the last-known-good snapshot.
# Globals:   GOOD_CONFIG_PATH (read), OPENCLAW_CONFIG_PATH (read)
# Returns:   0 when the snapshot was copied over the live config,
#            1 when no snapshot exists or the copy failed
#######################################
revert_config() {
  if [[ ! -f "$GOOD_CONFIG_PATH" ]]; then
    log "WARN: No good config snapshot available to revert"
    return 1
  fi

  log "Reverting config to last-known-good snapshot"
  # Report a failed copy to the caller; claiming success would lead to a
  # restart on the unchanged (still broken) config.
  if ! cp -- "$GOOD_CONFIG_PATH" "$OPENCLAW_CONFIG_PATH"; then
    log "WARN: Failed to restore config from snapshot"
    return 1
  fi
  return 0
}
# ─── Restart gateway ─────────────────────────────────────────────────────────
#######################################
# Ask the OpenClaw CLI to restart the gateway, then pause for 5 seconds
# before returning to the caller.
# Returns:   the exit status of the final sleep; a failing restart command is
#            logged but never aborts the watchdog
#######################################
restart_gateway() {
  local rc=0

  log "Restarting OpenClaw gateway..."
  # Best-effort: record the status instead of discarding it, so a missing or
  # failing `openclaw` binary is visible in the log.
  openclaw gateway restart >/dev/null 2>&1 || rc=$?
  if ((rc != 0)); then
    log "WARN: 'openclaw gateway restart' exited with status ${rc}"
  fi
  sleep 5
}
# ─── SOS alert ────────────────────────────────────────────────────────────────
#######################################
# Send the "gateway down" alert to all channels and mark it as sent.
# Globals:   FAIL_THRESHOLD (read), USER (read, optional), alerted (written)
# Returns:   the status of the final assignment (0)
#######################################
send_sos() {
  local hostname
  hostname="$(hostname 2>/dev/null || echo 'unknown')"

  # USER may be unset (e.g. in a service environment); expanding it directly
  # under `set -u` would abort the script right when the alert is needed.
  local user="${USER:-}"
  if [[ -z "$user" ]]; then
    user="$(id -un 2>/dev/null || echo 'unknown')"
  fi

  local title="🚨 OpenClaw Gateway DOWN"
  local msg="🚨 **OpenClaw Gateway DOWN** on \`${hostname}\`
Watchdog tried:
1. ✅ Simple restart
2. ✅ Config rollback + restart
3. ❌ Still unreachable after ${FAIL_THRESHOLD}+ failures
**Manual intervention needed:**
\`\`\`
ssh ${user}@${hostname}
openclaw gateway status
journalctl -u openclaw-gateway --since '10 min ago'
\`\`\`"
  log "CRITICAL: Sending SOS alert"
  send_notification "$title" "$msg" 10
  # Set the flag so the alert is only sent once.
  alerted=true
}
# ─── Recovery notification ────────────────────────────────────────────────────
#######################################
# Announce on all channels that the gateway is reachable again.
# Outputs:   one log line, plus a priority-5 notification
# Returns:   the status of send_notification
#######################################
send_recovery() {
  local host subject body

  # Fall back to a placeholder when the hostname cannot be determined.
  host="$(hostname 2>/dev/null || echo 'unknown')"
  subject="✅ OpenClaw Gateway Recovered"
  body="✅ **OpenClaw Gateway recovered** on \`${host}\` — back online!"

  log "Gateway recovered"
  send_notification "$subject" "$body" 5
}
# ─── Main loop ────────────────────────────────────────────────────────────────
#######################################
# Watchdog main loop: poll the health endpoint forever and escalate on
# consecutive failures.
#   1st failure               -> plain gateway restart
#   2nd failure               -> config rollback, then restart if it succeeded
#   >= FAIL_THRESHOLD failures -> SOS alert (sent once per outage)
# When a health check succeeds after an outage, a recovery notification is
# sent and the outage state is reset.
# Globals:   CHECK_INTERVAL, FAIL_THRESHOLD, HEALTH_URL and the notification
#            settings (read); fail_count, alerted, was_down (written)
# Returns:   never returns under normal operation
#######################################
main() {
  # Report a channel as enabled only when everything needed to send on it is
  # set; a chat id or channel id alone is not enough.
  local tg_state="disabled"
  local gotify_state="disabled"
  local discord_state="disabled"
  if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT" ]]; then
    tg_state="enabled"
  fi
  if [[ -n "$GOTIFY_TOKEN" && -n "$GOTIFY_URL" ]]; then
    gotify_state="enabled"
  fi
  if [[ -n "$DISCORD_CHANNEL_ID" && -n "$DISCORD_BOT_TOKEN" ]]; then
    discord_state="enabled"
  fi

  log "Watchdog started (interval=${CHECK_INTERVAL}s, threshold=${FAIL_THRESHOLD}, health=${HEALTH_URL})"
  log "Notifications: Telegram=${tg_state}, Gotify=${gotify_state}, Discord=${discord_state}"

  while true; do
    if check_health; then
      # ── Healthy ──
      if [[ "$was_down" == true ]]; then
        send_recovery
        was_down=false
        alerted=false
      fi
      fail_count=0
      # A failed snapshot (e.g. disk full) must not stop the watchdog.
      save_good_config || log "WARN: Failed to snapshot last-known-good config"
    else
      # ── Unhealthy ──
      fail_count=$((fail_count + 1))
      log "Health check failed (${fail_count}/${FAIL_THRESHOLD})"
      if [[ $fail_count -eq 1 ]]; then
        # Stage 1: simple restart
        was_down=true
        restart_gateway
      elif [[ $fail_count -eq 2 ]]; then
        # Stage 2: config revert + restart (restart only if the revert worked)
        revert_config && restart_gateway
      elif [[ $fail_count -ge $FAIL_THRESHOLD && "$alerted" == false ]]; then
        # Stage 3: SOS (once)
        send_sos
      fi
    fi
    sleep "$CHECK_INTERVAL"
  done
}
main "$@"