#!/bin/bash
# Home Stack Monitor & Self-Healing
# Monitors services, alerts on issues, attempts auto-recovery
# Runs every 15 minutes
#
# Usage: monitor [check|report|status|reset-stats]
# Requires: bash, curl, jq, ping
# Environment (optionally loaded from ../../.env relative to this script):
#   TELEGRAM_BOT_TOKEN  - bot token used for Telegram alerts (alerts skipped if unset)
#   TELEGRAM_CHAT       - chat id that receives Telegram alerts
#   NTFY_URL/NTFY_TOPIC - ntfy endpoint; ntfy alerts are skipped if either is empty
#   NTFY_MIN_PRIORITY   - lowest ntfy priority that will be sent (default 4)
#   HEAL_WAIT_SECONDS   - pause before rechecking after a heal attempt (default 10)

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DATA_FILE="$SCRIPT_DIR/monitor-state.json"

# The .env file is optional; a missing file is intentionally not an error.
# shellcheck disable=SC1091
source "$SCRIPT_DIR/../../.env" 2>/dev/null || true

TELEGRAM_CHAT="${TELEGRAM_CHAT:-1793951355}"
NTFY_URL="${NTFY_URL:-}"
NTFY_TOPIC="${NTFY_TOPIC:-}"
NTFY_MIN_PRIORITY="${NTFY_MIN_PRIORITY:-4}"

# Services to monitor
# Format: name|url|type|restart_command(optional)
# type: http, ping, port
SERVICES=(
  "Gitea|http://gitea.kangaroo-eel.ts.net:3000|http"
  "n8n|http://n8n.kangaroo-eel.ts.net:5678|http"
  "Home Assistant|http://homeassistant.kangaroo-eel.ts.net:8123|http"
  "FreshRSS|http://freshrss.kangaroo-eel.ts.net|http"
  "Tailscale|100.100.100.100|ping"
)

# Thresholds
HTTP_TIMEOUT=10
PING_COUNT=3
DISK_WARNING=80   # Alert at 80% disk usage
DISK_CRITICAL=90  # Critical at 90%

# Print a timestamped message to stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# Create the state file if it is missing or empty.
init_state() {
  if [[ ! -s "$DATA_FILE" ]]; then
    echo '{"services": {}, "alerts_sent": {}, "stats": {"checks": 0, "failures": 0, "recoveries": 0}}' > "$DATA_FILE"
  fi
}

# Apply a jq program to the state file and replace it only if jq succeeds.
# Arguments: jq options and filter (everything jq needs except the input file).
# The temp file is created next to the state file so the final mv stays on
# one filesystem, and it is removed again when jq fails.
write_state() {
  local tmp
  tmp=$(mktemp "${DATA_FILE}.XXXXXX") || return 1
  if jq "$@" "$DATA_FILE" > "$tmp"; then
    mv -- "$tmp" "$DATA_FILE"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

# Check HTTP endpoint
# Arguments: $1 - URL
# Outputs:   "up" for HTTP 200/302/401, otherwise "down:<code>"
check_http() {
  local url="$1"
  local status
  # curl already prints 000 via -w when the request fails, so assign the
  # fallback instead of echoing a second "000" into the captured output.
  status=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$HTTP_TIMEOUT" "$url" 2>/dev/null) \
    || status="000"

  if [[ "$status" == "200" || "$status" == "302" || "$status" == "401" ]]; then
    echo "up"
  else
    echo "down:$status"
  fi
}

# Check ping
# Arguments: $1 - host
# Outputs:   "up" or "down:timeout"
check_ping() {
  local host="$1"
  if ping -c "$PING_COUNT" -W 2 "$host" > /dev/null 2>&1; then
    echo "up"
  else
    echo "down:timeout"
  fi
}

# Run the check that matches a service type.
# Arguments: $1 - check type (http|ping), $2 - URL or host
# Outputs:   result of the check, or "unknown" for an unsupported type
run_check() {
  case "$1" in
    http) check_http "$2" ;;
    ping) check_ping "$2" ;;
    *) echo "unknown" ;;
  esac
}

# Check disk space on Proxmox (if accessible)
check_disk() {
  # This would need SSH access to Proxmox host
  # For now, placeholder - can be extended with SSH key setup
  echo "unknown"
}

# Update service state in JSON
# Arguments: $1 - service name, $2 - status string
update_state() {
  local name="$1"
  local status="$2"
  local timestamp
  timestamp=$(date -Iseconds)

  write_state \
    --arg name "$name" \
    --arg status "$status" \
    --arg time "$timestamp" \
    '.services[$name] = {"status": $status, "last_check": $time}'
}

# Get previous state
# Arguments: $1 - service name
# Outputs:   stored status, or "unknown" when none is recorded
get_previous_state() {
  local name="$1"
  jq -r --arg name "$name" '.services[$name].status // "unknown"' "$DATA_FILE"
}

# Check if alert already sent (cooldown 1 hour)
# Arguments: $1 - service name, $2 - alert type
# Returns:   0 while the cooldown is active, 1 otherwise
alert_cooldown_active() {
  local name="$1"
  local alert_type="$2"
  local cooldown_seconds=3600 # 1 hour
  local last_alert now

  # tonumber? also accepts timestamps that were stored as strings.
  last_alert=$(jq -r --arg key "$name-$alert_type" \
    '(.alerts_sent[$key] // 0) | tonumber? // 0' "$DATA_FILE")
  now=$(date +%s)

  if ((last_alert > 0 && now - last_alert < cooldown_seconds)); then
    return 0 # Cooldown active
  fi
  return 1 # No cooldown
}

# Log alert sent
# Arguments: $1 - service name, $2 - alert type
log_alert() {
  local name="$1"
  local alert_type="$2"
  local now
  now=$(date +%s)

  # Store the timestamp as a JSON number so cleanup_alerts can compare it.
  write_state \
    --arg key "$name-$alert_type" \
    --argjson time "$now" \
    '.alerts_sent[$key] = $time'
}

# Send Telegram alert
# Arguments: $1 - message (literal "\n" sequences become line breaks)
#            $2 - priority (accepted for compatibility; currently unused)
send_telegram() {
  local message="$1"
  local nl=$'\n'
  local payload

  if [[ -z "${TELEGRAM_BOT_TOKEN:-}" ]]; then
    log "Failed to send Telegram"
    return 0
  fi

  # Callers write line breaks as a literal backslash-n; turn them into real
  # newlines and let jq do the JSON escaping so quotes cannot break the payload.
  message="${message//\\n/$nl}"
  if ! payload=$(jq -n \
    --arg chat_id "$TELEGRAM_CHAT" \
    --arg text "$message" \
    '{chat_id: $chat_id, text: $text, parse_mode: "Markdown"}'); then
    log "Failed to send Telegram"
    return 0
  fi

  curl -s -f --max-time "$HTTP_TIMEOUT" \
    -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -H "Content-Type: application/json" \
    -d "$payload" > /dev/null || log "Failed to send Telegram"
}

# Send ntfy alert
# Arguments: $1 - title, $2 - message, $3 - priority (default 4),
#            $4 - sound (default "default")
send_ntfy() {
  local title="$1"
  local message="$2"
  local priority="${3:-4}"
  local sound="${4:-default}"
  local nl=$'\n'

  [[ -z "$NTFY_URL" ]] && return 0
  [[ -z "$NTFY_TOPIC" ]] && return 0

  # Enforce minimum priority (default 4)
  if [[ "$priority" =~ ^[0-9]+$ ]] && [[ "$NTFY_MIN_PRIORITY" =~ ^[0-9]+$ ]]; then
    if ((priority < NTFY_MIN_PRIORITY)); then
      priority="$NTFY_MIN_PRIORITY"
    fi
  fi

  # The body is sent as plain text, so literal "\n" must become a newline.
  message="${message//\\n/$nl}"

  curl -s --max-time "$HTTP_TIMEOUT" -X POST "${NTFY_URL%/}/${NTFY_TOPIC}" \
    -H "Title: $title" \
    -H "Priority: $priority" \
    -H "Sound: $sound" \
    -d "$message" > /dev/null 2>&1 || log "Failed to send ntfy"
}

# Attempt self-healing
# Arguments: $1 - service name, $2 - URL or host,
#            $3 - check type used for the recheck (default "http")
# Returns:   0 if the service is up on recheck, 1 otherwise
attempt_heal() {
  local name="$1"
  local url="$2"
  local check_type="${3:-http}"

  log "Attempting to heal $name..."

  case "$name" in
    "Home Assistant")
      # Try to restart via SSH or API if configured
      log "Home Assistant heal: Check if SSH available"
      # Placeholder - would need HA SSH config
      ;;
    "Gitea" | "n8n" | "FreshRSS")
      # These are Docker/LXC - could restart container if SSH configured
      log "$name heal: Would attempt container restart if SSH configured"
      ;;
  esac

  # Wait and recheck with the same kind of probe that detected the outage
  sleep "${HEAL_WAIT_SECONDS:-10}"
  local recheck
  recheck=$(run_check "$check_type" "$url")

  if [[ "$recheck" == "up" ]]; then
    log "✅ $name recovered after heal attempt"
    return 0
  else
    log "❌ $name still down after heal attempt"
    return 1
  fi
}

# Check all services
# Probes every entry in SERVICES, records the result, and alerts on
# up->down, still-down and down->up transitions.
check_services() {
  log "Checking services..."

  local down_services=()
  local recovered_services=()
  local service_def name url check_type _
  local current_status previous_status status_code

  for service_def in "${SERVICES[@]}"; do
    # The trailing "_" absorbs the optional restart_command field.
    IFS='|' read -r name url check_type _ <<< "$service_def"

    log "Checking $name ($url)..."

    current_status=$(run_check "$check_type" "$url")
    previous_status=$(get_previous_state "$name")

    # Update state
    update_state "$name" "$current_status"

    # Track stats
    write_state '.stats.checks += 1'

    # Analyze state change
    if [[ "$current_status" == "up" ]]; then
      if [[ "$previous_status" != "up" && "$previous_status" != "unknown" ]]; then
        # Service recovered
        recovered_services+=("$name")
        write_state '.stats.recoveries += 1'
        send_telegram "✅ *$name* is back online! 🎉"
        log_alert "$name" "recovery"
      fi
      continue
    fi

    # Service down
    status_code="${current_status#down:}"

    if [[ "$previous_status" == "up" ]]; then
      # Just went down
      down_services+=("$name|$status_code")

      # Try to heal
      if attempt_heal "$name" "$url" "$check_type"; then
        recovered_services+=("$name (auto-healed)")
        update_state "$name" "up"
        write_state '.stats.recoveries += 1'
      else
        # Count the failure even when the alert itself is suppressed.
        write_state '.stats.failures += 1'

        # Send alert
        if ! alert_cooldown_active "$name" "down"; then
          send_telegram "🚨 *Service Down: $name*\n\nStatus: $status_code\nURL: $url\n\nAuto-heal failed. Manual intervention may be needed."
          send_ntfy "🚨 Service Down: $name" "$name is down (status: $status_code)\nURL: $url" 4 default
          log_alert "$name" "down"
        fi
      fi
    else
      # Still down
      if ! alert_cooldown_active "$name" "still_down"; then
        send_telegram "⚠️ *Still Down: $name*\n\nHas been down for a while. Might need attention."
        log_alert "$name" "still_down"
      fi
    fi
  done

  log "Check complete. ${#down_services[@]} down, ${#recovered_services[@]} recovered"
}

# Generate daily health report
# Sends a Telegram summary of the counters and the last known status of
# every configured service.
daily_report() {
  local checks failures
  checks=$(jq -r '.stats.checks // 0' "$DATA_FILE")
  failures=$(jq -r '.stats.failures // 0' "$DATA_FILE")

  local uptime_pct=100
  if ((checks > 0)); then
    uptime_pct=$((100 - (failures * 100 / checks)))
  fi

  local report="🏠 *Home Stack Daily Report*\n\n"
  report+="📊 *Uptime: ${uptime_pct}%*\n"
  report+="🔍 Checks: $checks\n"
  report+="❌ Failures: $failures\n\n"
  report+="*Current Status:*\n"

  local service_def name status _
  for service_def in "${SERVICES[@]}"; do
    IFS='|' read -r name _ <<< "$service_def"
    status=$(get_previous_state "$name")

    if [[ "$status" == "up" ]]; then
      report+="✅ $name\n"
    else
      report+="❌ $name ($status)\n"
    fi
  done

  send_telegram "$report"
}

# Cleanup old alerts (older than 24 hours)
cleanup_alerts() {
  local cutoff
  cutoff=$(($(date +%s) - 86400))

  # tonumber? lets entries that were stored as strings be compared and pruned.
  write_state --argjson cutoff "$cutoff" \
    '.alerts_sent |= ((. // {}) | with_entries(select((.value | tonumber? // 0) > $cutoff)))'
}

# Main
# Arguments: $1 - command: check (default), report, status, reset-stats
main() {
  local tool
  for tool in jq curl; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      echo "Error: required command '$tool' not found" >&2
      exit 1
    fi
  done

  init_state

  case "${1:-check}" in
    check)
      check_services
      cleanup_alerts
      ;;
    report)
      daily_report
      ;;
    status)
      jq '.' "$DATA_FILE"
      ;;
    reset-stats)
      write_state '.stats = {"checks": 0, "failures": 0, "recoveries": 0}'
      log "Stats reset"
      ;;
    *)
      echo "Usage: $0 [check|report|status|reset-stats]"
      echo "  check       - Run health check on all services"
      echo "  report      - Generate daily status report"
      echo "  status      - Show full state"
      echo "  reset-stats - Reset statistics counters"
      ;;
  esac
}

main "$@"