Initial backup: 18 monitoring scripts + timers + docs
- 18 comprehensive monitoring checks
- 5 systemd timers (5min, 15min, hourly, daily, weekly)
- Complete documentation
- NTFY secure notification system
- Fixed debianvm disk space (91% to 57%)
- Fixed CloudReve integration
- Date: 2026-01-07
This commit is contained in:
36
scripts/check-ceph.sh
Executable file
36
scripts/check-ceph.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
# Monitor Ceph cluster health.
#
# Runs `ceph -s` and sends a notification through $SEND_NTFY when:
#   * the cluster status cannot be obtained,
#   * the reported health is HEALTH_ERR or HEALTH_WARN,
#   * the status output mentions "degraded" (case-insensitive).
# Finally logs the health value under the "ceph-monitor" tag.
#
# Exit status:
#   0  Ceph is not installed, or every check and notification completed.
#   1  The cluster status could not be read, or a notification failed.
set -euo pipefail

readonly SEND_NTFY="/usr/local/bin/send-ntfy.sh"
readonly LOG_TAG="ceph-monitor"
# Upper bound, in seconds, for every ceph CLI call so the monitor cannot hang.
readonly CEPH_TIMEOUT=10

#######################################
# Write a message to syslog; fall back to stderr when logger is unavailable
# or fails, so a logging problem never aborts the health check.
# Arguments: message words
#######################################
log() {
  logger -t "$LOG_TAG" "$*" 2>/dev/null \
    || printf '%s: %s\n' "$LOG_TAG" "$*" >&2
}

#######################################
# Forward a notification to the helper script named by SEND_NTFY.
# Arguments are passed through unchanged; the call sites in this script pass
# four of them: level, title, message, tags.
# Returns: the helper's exit status.
#######################################
notify() {
  "$SEND_NTFY" "$@"
}

#######################################
# Extract the health value from `ceph -s` output.
# Arguments: $1 - status text
# Outputs:   the word following "health: ", or UNKNOWN if there is none
#######################################
parse_health() {
  local re='health: ([[:alnum:]_]+)'
  if [[ "$1" =~ $re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    printf 'UNKNOWN\n'
  fi
}

#######################################
# Print the first three lines of `ceph health detail`.
# Best effort: errors (including head closing the pipe early) are ignored so
# that a failing detail query never prevents the alert itself.
#######################################
health_detail() {
  timeout "$CEPH_TIMEOUT" ceph health detail 2>/dev/null | head -n 3 || true
}

main() {
  local status health degraded
  local rc=0

  # Check if Ceph is installed
  if ! command -v ceph &>/dev/null; then
    log "Ceph not installed, skipping"
    return 0
  fi

  # Get Ceph status. The exit status is checked directly rather than via an
  # in-band sentinel: partial output followed by a failure/timeout must still
  # count as a failed check.
  if ! status=$(timeout "$CEPH_TIMEOUT" ceph -s 2>/dev/null); then
    notify critical "Ceph Check Failed" "🔴 CRITICAL: Unable to get Ceph cluster status!" "skull,error" || true
    return 1
  fi

  # Check overall health.
  # NOTE(review): "\n" below is passed as a literal backslash-n; presumably
  # the notification helper expands it - verify against send-ntfy.sh.
  health=$(parse_health "$status")
  case "$health" in
    HEALTH_ERR)
      notify critical "Ceph Health Error" "🔴 CRITICAL: Ceph cluster is in HEALTH_ERR state!\n$(health_detail)" "skull,error,cd" || rc=1
      ;;
    HEALTH_WARN)
      notify warning "Ceph Health Warning" "🟡 WARNING: Ceph cluster is in HEALTH_WARN state\n$(health_detail)" "warning,cd" || rc=1
      ;;
  esac

  # Check for degraded PGs. A failed notification above is remembered in rc
  # instead of aborting, so this check and the final log line still run.
  degraded=$(grep -i degraded <<<"$status" || true)
  if [[ -n "$degraded" ]]; then
    notify warning "Ceph PGs Degraded" "🟡 WARNING: Ceph has degraded placement groups\n$degraded" "warning,cd" || rc=1
  fi

  log "Ceph health: $health"
  return "$rc"
}

main "$@"
|
||||
Reference in New Issue
Block a user