Initial backup: 18 monitoring scripts + timers + docs

- 18 comprehensive monitoring checks
- 5 systemd timers (5min, 15min, hourly, daily, weekly)
- Complete documentation
- NTFY secure notification system
- Fixed debianvm disk space (91% to 57%)
- Fixed CloudReve integration
- Date: 2026-01-07
2026-01-07 16:30:34 +08:00
commit 3a14fd2736
34 changed files with 1067 additions and 0 deletions
--- a/scripts/check-ceph.sh
+++ b/scripts/check-ceph.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Monitor Ceph cluster health
+set -euo pipefail
+
+SEND_NTFY="/usr/local/bin/send-ntfy.sh"
+
+# Check if Ceph is installed
+if ! command -v ceph &>/dev/null; then
+    logger -t ceph-monitor "Ceph not installed, skipping"
+    exit 0
+fi
+
+# Get Ceph status
+CEPH_STATUS=$(timeout 10 ceph -s 2>/dev/null || echo "FAILED")
+
+if [ "$CEPH_STATUS" = "FAILED" ]; then
+    $SEND_NTFY critical "Ceph Check Failed" "🔴 CRITICAL: Unable to get Ceph cluster status!" "skull,error"
+    exit 1
+fi
+
+# Check overall health
+HEALTH=$(echo "$CEPH_STATUS" | grep -oP 'health: \K\w+' || echo "UNKNOWN")
+
+if [ "$HEALTH" = "HEALTH_ERR" ]; then
+    $SEND_NTFY critical "Ceph Health Error" "🔴 CRITICAL: Ceph cluster is in HEALTH_ERR state!\n$(ceph health detail 2>/dev/null | head -3)" "skull,error,cd"
+elif [ "$HEALTH" = "HEALTH_WARN" ]; then
+    $SEND_NTFY warning "Ceph Health Warning" "🟡 WARNING: Ceph cluster is in HEALTH_WARN state\n$(ceph health detail 2>/dev/null | head -3)" "warning,cd"
+fi
+
+# Check for degraded PGs
+DEGRADED=$(echo "$CEPH_STATUS" | grep -i degraded || echo "")
+if [ -n "$DEGRADED" ]; then
+    $SEND_NTFY warning "Ceph PGs Degraded" "🟡 WARNING: Ceph has degraded placement groups\n$DEGRADED" "warning,cd"
+fi
+
+logger -t ceph-monitor "Ceph health: $HEALTH"