- FreshRSS Smart Digest: Daily AI-ranked RSS summary at 7 AM
- Birthday Tracker: Smart reminders for family birthdays with gift suggestions
- Home Stack Monitor: Health checks every 15 min with self-healing attempts

All cron jobs configured and ready to run. Telegram bot token saved to .env.
340 lines
9.8 KiB
Bash
Executable File
340 lines
9.8 KiB
Bash
Executable File
#!/bin/bash
# Home Stack Monitor & Self-Healing
# Monitors services, alerts on issues, attempts auto-recovery.
# Runs every 15 minutes.

set -e

# Directory containing this script; the JSON state file lives next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DATA_FILE="$SCRIPT_DIR/monitor-state.json"

# Optional overrides (e.g. TELEGRAM_BOT_TOKEN). Best-effort on purpose:
# a missing or unreadable .env must not abort the monitor.
source "$SCRIPT_DIR/../../.env" 2>/dev/null || true

TELEGRAM_CHAT="${TELEGRAM_CHAT:-1793951355}"
GOTIFY_URL="${GOTIFY_URL:-http://runtipi.kangaroo-eel.ts.net:8129}"
# NOTE(review): this fallback is a credential committed to the script.
# Prefer supplying GOTIFY_TOKEN via .env and rotating this value.
GOTIFY_TOKEN="${GOTIFY_TOKEN:-AGKnHafW3FGzBlt}"

# Services to monitor
# Format: name|url|type|restart_command(optional)
# type: http, ping, port
# NOTE(review): only "http" and "ping" are dispatched by check_services in
# this file; any other type is recorded with status "unknown".
SERVICES=(
  "Gitea|http://gitea.kangaroo-eel.ts.net:3000|http"
  "n8n|http://n8n.kangaroo-eel.ts.net:5678|http"
  "Home Assistant|http://homeassistant.kangaroo-eel.ts.net:8123|http"
  "FreshRSS|http://freshrss.kangaroo-eel.ts.net|http"
  "Tailscale|100.100.100.100|ping"
)

# Thresholds
HTTP_TIMEOUT=10   # seconds, passed to curl --max-time
PING_COUNT=3      # packets per ping probe
DISK_WARNING=80   # Alert at 80% disk usage
DISK_CRITICAL=90  # Critical at 90%
# Print a timestamped log line to stdout.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# Create the JSON state file with an empty skeleton if it is missing or empty.
# An existing zero-byte file is treated like a missing one: jq would otherwise
# keep producing empty output from it and the state would never recover.
# Globals: DATA_FILE (written)
init_state() {
  if [[ ! -s "$DATA_FILE" ]]; then
    printf '%s\n' '{"services": {}, "alerts_sent": {}, "stats": {"checks": 0, "failures": 0, "recoveries": 0}}' > "$DATA_FILE"
  fi
}
# Check an HTTP endpoint.
# Globals:   HTTP_TIMEOUT (read) - curl --max-time in seconds
# Arguments: $1 - URL to probe
# Outputs:   "up" when the response code is 200, 302 or 401,
#            otherwise "down:<code>" ("down:000" when curl itself fails)
check_http() {
  local url="$1"
  local status

  # curl's -w already prints "000" when no response arrives, so the fallback
  # must replace the captured value; echoing a second "000" inside the
  # substitution would be appended and yield "000000".
  status=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$HTTP_TIMEOUT" "$url" 2>/dev/null) || status="000"

  case "$status" in
    200 | 302 | 401)
      echo "up"
      ;;
    *)
      echo "down:$status"
      ;;
  esac
}
# Check reachability with ICMP ping.
# Globals:   PING_COUNT (read) - number of echo requests to send
# Arguments: $1 - host name or IP address
# Outputs:   "up" if ping exits successfully, otherwise "down:timeout"
check_ping() {
  local host="$1"

  if ping -c "$PING_COUNT" -W 2 "$host" > /dev/null 2>&1; then
    echo "up"
  else
    echo "down:timeout"
  fi
}
# Check disk space on Proxmox (if accessible).
# Placeholder: this would need SSH access to the Proxmox host and can be
# extended once SSH keys are set up. Until then it always reports "unknown".
# Outputs: the literal string "unknown"
check_disk() {
  echo "unknown"
}
# Record the latest status of one service in the JSON state file.
# Globals:   DATA_FILE (read, rewritten)
# Arguments: $1 - service name, $2 - status string (e.g. "up", "down:500")
# Returns:   0 on success; non-zero if the timestamp, temp file or jq step
#            fails, in which case the existing state file is left untouched.
update_state() {
  local name="$1"
  local status="$2"
  local timestamp temp_file

  # Declared separately from the assignment so a failure is not masked
  # by the exit status of `local`.
  timestamp=$(date -Iseconds) || return 1
  # Temp file sits beside the state file so the final mv is a same-filesystem
  # rename rather than a cross-device copy.
  temp_file=$(mktemp "${DATA_FILE}.XXXXXX") || return 1

  if jq --arg name "$name" \
    --arg status "$status" \
    --arg time "$timestamp" \
    '.services[$name] = {"status": $status, "last_check": $time}' \
    "$DATA_FILE" > "$temp_file"; then
    mv -- "$temp_file" "$DATA_FILE"
  else
    # Never replace good state with partial or empty jq output.
    rm -f -- "$temp_file"
    return 1
  fi
}
# Print the last recorded status of a service.
# Globals:   DATA_FILE (read)
# Arguments: $1 - service name
# Outputs:   the stored status, or "unknown" when the service has no entry
get_previous_state() {
  local name="$1"

  # The name is passed with --arg rather than spliced into the jq program,
  # so names containing quotes or backslashes cannot break the filter.
  jq -r --arg name "$name" '.services[$name].status // "unknown"' "$DATA_FILE"
}
# Check whether an alert of this type was already sent within the last hour.
# Globals:   DATA_FILE (read)
# Arguments: $1 - service name, $2 - alert type (e.g. "down", "still_down")
# Returns:   0 if the cooldown is active (suppress the alert),
#            1 if no alert was sent in the last hour or none is recorded
alert_cooldown_active() {
  local name="$1"
  local alert_type="$2"
  local cooldown_seconds=3600 # 1 hour
  local last_alert now

  # Key is passed with --arg so it cannot alter the jq program.
  last_alert=$(jq -r --arg key "$name-$alert_type" '.alerts_sent[$key] // 0' "$DATA_FILE") || return 1
  now=$(date +%s)

  # Anything that is not a plain non-negative integer counts as "never alerted".
  [[ "$last_alert" =~ ^[0-9]+$ ]] || return 1

  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  if ((10#$last_alert > 0 && now - 10#$last_alert < cooldown_seconds)); then
    return 0 # Cooldown active
  fi
  return 1 # No cooldown
}
# Record that an alert was just sent, for cooldown tracking.
# Globals:   DATA_FILE (read, rewritten)
# Arguments: $1 - service name, $2 - alert type
# Returns:   0 on success; non-zero if the state could not be rewritten
#            (the existing state file is then left untouched).
log_alert() {
  local name="$1"
  local alert_type="$2"
  local now temp_file

  now=$(date +%s) || return 1
  temp_file=$(mktemp "${DATA_FILE}.XXXXXX") || return 1

  # --argjson stores the epoch as a JSON number. With --arg it would be a
  # string, and jq orders every string above every number, so numeric
  # comparisons against a cutoff would always be true.
  if jq --arg key "$name-$alert_type" \
    --argjson time "$now" \
    '.alerts_sent[$key] = $time' \
    "$DATA_FILE" > "$temp_file"; then
    mv -- "$temp_file" "$DATA_FILE"
  else
    rm -f -- "$temp_file"
    return 1
  fi
}
# Send a Markdown message to the configured Telegram chat (best-effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT, HTTP_TIMEOUT (read)
# Arguments: $1 - message text; a literal two-character "\n" is sent as a
#                 line break, which is how callers in this file separate lines
#            $2 - priority; accepted for compatibility, currently unused
# Returns:   always 0; failures are logged, never fatal
send_telegram() {
  local message="$1"
  local payload

  if [[ -z "${TELEGRAM_BOT_TOKEN:-}" ]]; then
    log "Failed to send Telegram: TELEGRAM_BOT_TOKEN is not set"
    return 0
  fi

  # Callers build messages with literal "\n". The old hand-written JSON
  # relied on JSON decoding to turn those into line breaks; do it explicitly
  # now that jq performs the escaping.
  message="${message//\\n/$'\n'}"

  # jq escapes quotes, backslashes and control characters, so arbitrary
  # service names or status strings cannot produce invalid JSON.
  if ! payload=$(jq -n \
    --arg chat_id "$TELEGRAM_CHAT" \
    --arg text "$message" \
    '{chat_id: $chat_id, text: $text, parse_mode: "Markdown"}'); then
    log "Failed to send Telegram"
    return 0
  fi

  # --fail turns HTTP error responses into a non-zero exit so they are logged;
  # --max-time keeps a stalled API call from hanging the cron run.
  curl -s --fail --max-time "${HTTP_TIMEOUT:-10}" \
    -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -H "Content-Type: application/json" \
    -d "$payload" > /dev/null || log "Failed to send Telegram"
}
# Send a notification to the Gotify server (best-effort).
# Globals:   GOTIFY_URL, GOTIFY_TOKEN, HTTP_TIMEOUT (read)
# Arguments: $1 - title
#            $2 - message; a literal "\n" is sent as a line break
#            $3 - numeric priority (default 5; non-numeric values fall back to 5)
# Returns:   always 0; failures are logged, never fatal
send_gotify() {
  local title="$1"
  local message="$2"
  local priority="${3:-5}"
  local payload

  # A non-numeric priority would make the JSON body invalid.
  [[ "$priority" =~ ^[0-9]+$ ]] || priority=5

  # Keep the literal-"\n"-means-newline behaviour of the previous
  # hand-written JSON body.
  title="${title//\\n/$'\n'}"
  message="${message//\\n/$'\n'}"

  # Let jq do the escaping so quotes or control characters in the text
  # cannot corrupt the request body.
  if ! payload=$(jq -n \
    --arg title "$title" \
    --arg message "$message" \
    --argjson priority "$priority" \
    '{title: $title, message: $message, priority: $priority}'); then
    log "Failed to send Gotify"
    return 0
  fi

  # --fail makes HTTP error responses visible to the || branch.
  curl -s --fail --max-time "${HTTP_TIMEOUT:-10}" \
    -X POST "${GOTIFY_URL}/message?token=${GOTIFY_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" > /dev/null || log "Failed to send Gotify"
}
# Attempt self-healing for a service, then re-probe it.
# The per-service recovery actions are still placeholders that only log.
# Globals:   HEAL_RECHECK_DELAY (read, optional) - seconds to wait before
#            the recheck; defaults to 10
# Arguments: $1 - service name
#            $2 - URL or host to recheck
#            $3 - check type used for the recheck: "http" (default) or "ping"
# Returns:   0 if the recheck reports "up", 1 otherwise
attempt_heal() {
  local name="$1"
  local url="$2"
  local check_type="${3:-http}"
  local recheck

  log "Attempting to heal $name..."

  case "$name" in
    "Home Assistant")
      # Try to restart via SSH or API if configured
      log "Home Assistant heal: Check if SSH available"
      # Placeholder - would need HA SSH config
      ;;
    "Gitea" | "n8n" | "FreshRSS")
      # These are Docker/LXC - could restart container if SSH configured
      log "$name heal: Would attempt container restart if SSH configured"
      ;;
  esac

  # Wait and recheck
  sleep "${HEAL_RECHECK_DELAY:-10}"

  # Recheck with the same probe that detected the outage; an HTTP request
  # against a ping-only target would always report it as still down.
  case "$check_type" in
    ping) recheck=$(check_ping "$url") ;;
    *) recheck=$(check_http "$url") ;;
  esac

  if [[ "$recheck" == "up" ]]; then
    log "✅ $name recovered after heal attempt"
    return 0
  else
    log "❌ $name still down after heal attempt"
    return 1
  fi
}
# Increment one counter under .stats in the state file.
# Arguments: $1 - counter name ("checks", "failures" or "recoveries")
# Returns:   non-zero if the state file could not be rewritten (left untouched)
_bump_stat() {
  local field="$1"
  local temp_file

  temp_file=$(mktemp "${DATA_FILE}.XXXXXX") || return 1
  if jq --arg field "$field" '.stats[$field] += 1' "$DATA_FILE" > "$temp_file"; then
    mv -- "$temp_file" "$DATA_FILE"
  else
    rm -f -- "$temp_file"
    return 1
  fi
}

# Check all services: probe each entry in SERVICES, persist its status,
# update the counters and send alerts on state changes.
# Globals: SERVICES (read), DATA_FILE (rewritten via helpers)
# Counters: "checks" grows once per probe, "failures" once per probe that is
#           not "up" (so the uptime ratio in the daily report is meaningful),
#           "recoveries" once per down->up transition or successful auto-heal.
check_services() {
  log "Checking services..."

  local down_services=()
  local recovered_services=()
  # Declared local so the loop does not leak name/url/check_type as globals.
  local service_def name url check_type _rest
  local current_status previous_status status_code

  for service_def in "${SERVICES[@]}"; do
    # _rest absorbs the optional restart_command field; without it that field
    # would be appended to check_type and the type would never match.
    IFS='|' read -r name url check_type _rest <<< "$service_def"

    log "Checking $name ($url)..."

    case "$check_type" in
      http) current_status=$(check_http "$url") ;;
      ping) current_status=$(check_ping "$url") ;;
      *) current_status="unknown" ;;
    esac

    # Read the old status before overwriting it; an unreadable state file
    # is treated the same as a service never seen before.
    previous_status=$(get_previous_state "$name") || previous_status="unknown"

    # Update state
    update_state "$name" "$current_status"

    # Track stats (best-effort: a counter failure must not stop monitoring)
    _bump_stat checks || log "Failed to update stats: checks"

    if [[ "$current_status" == "up" ]]; then
      if [[ "$previous_status" != "up" && "$previous_status" != "unknown" ]]; then
        # Service recovered
        recovered_services+=("$name")
        send_telegram "✅ *$name* is back online! 🎉"
        log_alert "$name" "recovery"
        _bump_stat recoveries || log "Failed to update stats: recoveries"
      fi
      continue
    fi

    # Service down (or its check type is unsupported)
    status_code="${current_status#down:}"
    _bump_stat failures || log "Failed to update stats: failures"

    if [[ "$previous_status" == "up" ]]; then
      # Just went down
      down_services+=("$name|$status_code")

      # Try to heal, rechecking with the same probe type that failed
      if attempt_heal "$name" "$url" "$check_type"; then
        recovered_services+=("$name (auto-healed)")
        update_state "$name" "up"
        _bump_stat recoveries || log "Failed to update stats: recoveries"
      elif ! alert_cooldown_active "$name" "down"; then
        send_telegram "🚨 *Service Down: $name*\n\nStatus: $status_code\nURL: $url\n\nAuto-heal failed. Manual intervention may be needed."
        send_gotify "Service Down: $name" "$name is down (status: $status_code)" 8
        log_alert "$name" "down"
      fi
    elif ! alert_cooldown_active "$name" "still_down"; then
      # Still down (or down on the very first check)
      send_telegram "⚠️ *Still Down: $name*\n\nHas been down for a while. Might need attention."
      log_alert "$name" "still_down"
    fi
  done

  log "Check complete. ${#down_services[@]} down, ${#recovered_services[@]} recovered"
}
# Generate the daily health report and send it via Telegram.
# Globals: DATA_FILE (read), SERVICES (read)
# The report text uses literal "\n" separators, matching the other messages
# passed to send_telegram in this file.
daily_report() {
  local checks failures
  local uptime_pct=100
  local service_def name _rest status report

  checks=$(jq -r '.stats.checks // 0' "$DATA_FILE") || checks=0
  failures=$(jq -r '.stats.failures // 0' "$DATA_FILE") || failures=0
  # Guard the arithmetic below against missing or non-numeric counters.
  [[ "$checks" =~ ^[0-9]+$ ]] || checks=0
  [[ "$failures" =~ ^[0-9]+$ ]] || failures=0

  if ((checks > 0)); then
    uptime_pct=$((100 - (failures * 100 / checks)))
    # Inconsistent counters must not produce a negative percentage.
    if ((uptime_pct < 0)); then
      uptime_pct=0
    fi
  fi

  report="🏠 *Home Stack Daily Report*\n\n"
  report+="📊 *Uptime: ${uptime_pct}%*\n"
  report+="🔍 Checks: $checks\n"
  report+="❌ Failures: $failures\n\n"
  report+="*Current Status:*\n"

  for service_def in "${SERVICES[@]}"; do
    IFS='|' read -r name _rest <<< "$service_def"
    # --arg keeps service names with quotes from breaking the jq filter.
    status=$(jq -r --arg name "$name" '.services[$name].status // "unknown"' "$DATA_FILE") || status="unknown"

    if [[ "$status" == "up" ]]; then
      report+="✅ $name\n"
    else
      report+="❌ $name ($status)\n"
    fi
  done

  send_telegram "$report"
}
# Cleanup old alerts: drop alert records older than 24 hours.
# Globals: DATA_FILE (read, rewritten)
# Returns: non-zero if the state could not be rewritten (left untouched)
cleanup_alerts() {
  local cutoff temp_file

  cutoff=$(($(date +%s) - 86400))
  temp_file=$(mktemp "${DATA_FILE}.XXXXXX") || return 1

  # Timestamps may be stored as JSON strings. jq orders every string above
  # every number, so comparing the raw value with $cutoff would keep such
  # entries forever; convert first. Values that are not numeric count as 0
  # and are pruned.
  if jq --argjson cutoff "$cutoff" \
    '.alerts_sent |= with_entries(select((.value | tonumber? // 0) > $cutoff))' \
    "$DATA_FILE" > "$temp_file"; then
    mv -- "$temp_file" "$DATA_FILE"
  else
    rm -f -- "$temp_file"
    return 1
  fi
}
# Print the command-line usage text to stdout.
_monitor_usage() {
  echo "Usage: $0 [check|report|status|reset-stats]"
  echo " check - Run health check on all services"
  echo " report - Generate daily status report"
  echo " status - Show full state"
  echo " reset-stats - Reset statistics counters"
}

# Main: dispatch on the first CLI argument (defaults to "check").
# Arguments: $1 - check | report | status | reset-stats | help
# Returns:   0 on success, 2 for an unknown command, non-zero if a
#            sub-command fails
main() {
  init_state

  case "${1:-check}" in
    check)
      check_services
      cleanup_alerts
      ;;
    report)
      daily_report
      ;;
    status)
      jq '.' "$DATA_FILE"
      ;;
    reset-stats)
      local temp_file
      temp_file=$(mktemp "${DATA_FILE}.XXXXXX") || return 1
      # Only replace the state file when jq actually produced output.
      if jq '.stats = {"checks": 0, "failures": 0, "recoveries": 0}' "$DATA_FILE" > "$temp_file"; then
        mv -- "$temp_file" "$DATA_FILE"
        log "Stats reset"
      else
        rm -f -- "$temp_file"
        log "Stats reset failed"
        return 1
      fi
      ;;
    -h | --help | help)
      _monitor_usage
      ;;
    *)
      # Unknown command: usage goes to stderr with a non-zero status so a
      # mistyped cron entry is noticed instead of silently succeeding.
      _monitor_usage >&2
      return 2
      ;;
  esac
}
# Entry point: forward all command-line arguments to main.
main "$@"