220 lines
6.7 KiB
Bash
220 lines
6.7 KiB
Bash
#!/bin/bash
set -o errexit -o nounset -o pipefail

# OpenClaw Gateway Health Check (Level 2 Self-Healing)
# Flow: verify the HTTP response -> restart on failure -> re-verify after the
# escalation wait (5 minutes by default) -> escalate to Level 3 if still failing.

# --------------------------------------------
# Configuration (each value can be overridden via the environment)
# --------------------------------------------

# Probe target and the curl --max-time budget (seconds) for one probe.
GATEWAY_URL="${OPENCLAW_GATEWAY_URL:-http://localhost:18789/}"
HTTP_TIMEOUT="${HEALTH_CHECK_HTTP_TIMEOUT:-10}"

# Recovery pacing: number of restart attempts, seconds slept after a
# successful restart, and seconds slept before the final re-check.
MAX_RETRIES="${HEALTH_CHECK_MAX_RETRIES:-3}"
RETRY_DELAY="${HEALTH_CHECK_RETRY_DELAY:-30}"
ESCALATION_WAIT="${HEALTH_CHECK_ESCALATION_WAIT:-300}"

# Log directory and the per-day log file inside it.
LOG_DIR="${OPENCLAW_MEMORY_DIR:-${HOME}/openclaw/memory}"
LOG_FILE="${LOG_DIR}/healthcheck-$(date '+%Y-%m-%d').log"

# Performance metrics (appended to as JSON Lines)
METRICS_FILE="${LOG_DIR}/.healthcheck-metrics.json"
|
|
|
|
# Prevent overlapping runs with a lock file. A single run can outlast the
# scheduling interval because of the retry delays and the escalation wait.
LOCKFILE=/tmp/openclaw-healthcheck.lock

#######################################
# Try to take the single-instance lock.
# The lock file stores the owner's PID so that a lock left behind by a run
# that died without its EXIT trap (e.g. SIGKILL) can be detected and
# reclaimed instead of disabling every later health check.
# Globals:   LOCKFILE (read)
# Arguments: none
# Returns:   0 if the lock is now held by this process, 1 if a live process
#            already owns it or the lock file could not be created.
#######################################
acquire_lock() {
  local holder_pid

  if [ -e "$LOCKFILE" ]; then
    holder_pid=$(cat "$LOCKFILE" 2>/dev/null || true)
    # kill -0 only probes for existence; it sends no signal.
    if [[ "$holder_pid" =~ ^[0-9]+$ ]] && kill -0 "$holder_pid" 2>/dev/null; then
      return 1
    fi
    # Empty, garbled, or dead-owner lock: treat as stale.
    rm -f "$LOCKFILE"
  fi

  # noclobber makes the redirection fail if the file already exists, so the
  # existence check and the creation are one atomic step. $$ stays the PID of
  # the main script even inside this subshell.
  ( set -o noclobber; printf '%s\n' "$$" > "$LOCKFILE" ) 2>/dev/null
}

if ! acquire_lock; then
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Previous health check still running, skipping..."
  exit 0
fi
# Installed only after the lock is ours, so a skipped run never removes
# another process's lock.
trap 'rm -f "$LOCKFILE"' EXIT
|
|
|
|
# Make sure the log directory exists before anything is written to it.
mkdir -p "$LOG_DIR"

# Source the first .env file that exists. ~/openclaw/.env wins over
# ~/.openclaw/.env, and at most one of them is loaded.
for env_candidate in "$HOME/openclaw/.env" "$HOME/.openclaw/.env"; do
  if [ -f "$env_candidate" ]; then
    # shellcheck source=/dev/null
    source "$env_candidate"
    break
  fi
done
unset env_candidate
|
|
|
|
# Optional Discord webhook, taken from DISCORD_WEBHOOK_URL (empty when unset).
DISCORD_WEBHOOK="${DISCORD_WEBHOOK_URL:-}"

# No validation beyond "is it set": a missing webhook only produces an
# informational line on stdout and in the log file.
[ -n "$DISCORD_WEBHOOK" ] ||
  printf '[%s] INFO: DISCORD_WEBHOOK_URL not set. Notifications disabled.\n' \
    "$(date '+%Y-%m-%d %H:%M:%S')" | tee -a "$LOG_FILE"
|
|
|
|
# ============================================
|
|
# Functions
|
|
# ============================================
|
|
|
|
#######################################
# Write one timestamped line to stdout and append it to the log file.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOG_FILE
#######################################
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}
|
|
|
|
#######################################
# Post a message to the Discord webhook, if one is configured.
# Delivery is best-effort: a failed or timed-out POST is logged and never
# aborts the caller, so an unreachable Discord cannot stop recovery or
# escalation even though the script runs under `set -e`.
# Globals:   DISCORD_WEBHOOK (read), HTTP_TIMEOUT (read, falls back to 10)
# Arguments: $1 - message text. It is interpolated verbatim into the JSON
#                 payload, so callers may embed JSON escapes such as \n but
#                 must not pass unescaped double quotes.
# Outputs:   one log line describing the delivery result
# Returns:   0 when no webhook is configured; otherwise the status of the
#            final log call
#######################################
send_discord_notification() {
  local message="$1"

  if [ -n "$DISCORD_WEBHOOK" ]; then
    local response_code
    # curl exits non-zero on DNS/connect/timeout errors. Without the `||`
    # fallback that status would propagate through the assignment and
    # terminate the whole script under `set -e`.
    response_code=$(curl -s -o /dev/null -w "%{http_code}" \
      --max-time "${HTTP_TIMEOUT:-10}" \
      -X POST "$DISCORD_WEBHOOK" \
      -H "Content-Type: application/json" \
      -d "{\"content\": \"$message\"}" \
      2>/dev/null) || response_code="000"

    if [ "$response_code" = "200" ] || [ "$response_code" = "204" ]; then
      log "✅ Discord notification sent (HTTP $response_code)"
    else
      log "⚠️ Discord notification failed (HTTP $response_code)"
    fi
  fi
}
|
|
|
|
#######################################
# Probe the gateway once over HTTP and record the outcome as a metric.
# Globals:   GATEWAY_URL (read), HTTP_TIMEOUT (read)
# Arguments: none
# Outputs:   one log line with the result and the elapsed whole seconds
# Returns:   0 if the gateway answered HTTP 200, 1 otherwise
#######################################
check_http() {
  local start_time end_time response_time http_code

  start_time=$(date +%s)

  # On transport errors curl's -w already prints "000". Assign the fallback
  # rather than echoing a second "000", which used to yield "000000" in the
  # log and the metrics. Any non-zero curl exit is reported as 000.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time "$HTTP_TIMEOUT" \
    "$GATEWAY_URL" 2>/dev/null) || http_code="000"
  [ -n "$http_code" ] || http_code="000"

  end_time=$(date +%s)
  response_time=$((end_time - start_time))

  # Record metric
  record_metric "http_check" "$http_code" "$response_time"

  if [ "$http_code" = "200" ]; then
    log "HTTP check passed (${response_time}s)"
    return 0
  else
    log "HTTP check failed: HTTP $http_code (${response_time}s)"
    return 1
  fi
}
|
|
|
|
#######################################
# Restart the gateway via the `openclaw` CLI and give it time to come up.
# Globals:   LOG_FILE (CLI output is appended), RETRY_DELAY (read)
# Arguments: none
# Outputs:   log lines and one "gateway_restart" metric
# Returns:   0 if the restart command succeeded (after sleeping RETRY_DELAY),
#            1 if the restart command failed
#######################################
restart_gateway() {
  local began elapsed

  log "Restarting OpenClaw Gateway..."
  began=$(date +%s)

  # Failure path first: no settle delay when the command itself failed.
  if ! openclaw gateway restart >> "$LOG_FILE" 2>&1; then
    log "⚠️ Gateway restart command failed"
    record_metric "gateway_restart" "failed" 0
    return 1
  fi

  elapsed=$(($(date +%s) - began))
  log "Gateway restart completed (${elapsed}s)"
  record_metric "gateway_restart" "success" "$elapsed"

  # Let the gateway settle before the caller probes it again.
  sleep "$RETRY_DELAY"
  return 0
}
|
|
|
|
#######################################
# Delete health-check log files older than 14 days (best-effort).
# A failure inside find (e.g. an unreadable subdirectory) must not abort the
# health check, so its exit status is deliberately ignored; without that,
# `pipefail` + `set -e` would end the script during housekeeping.
# Globals:   LOG_DIR (read)
# Arguments: none
# Outputs:   one log line when at least one file was removed
# Returns:   0
#######################################
rotate_old_logs() {
  local deleted_count

  # -print follows -delete, so only files that were actually removed are
  # counted. Files removed before a find error are still counted.
  deleted_count=$(
    {
      find "$LOG_DIR" -type f -name "healthcheck-*.log" -mtime +14 \
        -delete -print 2>/dev/null || true
    } | wc -l
  )
  # BSD wc pads its output with spaces; keep the number clean for the log.
  deleted_count=${deleted_count//[[:space:]]/}

  if [ "${deleted_count:-0}" -gt 0 ]; then
    log "Rotated $deleted_count old log files"
  fi
  return 0
}
|
|
|
|
#######################################
# Append one metric record to the metrics file (JSON Lines, one object per
# line). Values are inserted as given, without JSON escaping.
# Globals:   METRICS_FILE (appended to)
# Arguments: $1 - metric name (emitted as a JSON string)
#            $2 - result (emitted as a JSON string)
#            $3 - duration (emitted unquoted, so it must be numeric; some
#                 callers pass a retry count here rather than seconds)
#######################################
record_metric() {
  local name="$1"
  local outcome="$2"
  local amount="$3"
  local now
  now=$(date +%s)

  printf '{"timestamp":%s,"metric":"%s","result":"%s","duration":%s}\n' \
    "$now" "$name" "$outcome" "$amount" >> "$METRICS_FILE"
}
|
|
|
|
#######################################
# Level 3 escalation: announce it and run the emergency recovery script.
# Globals:   ESCALATION_WAIT (read), HOME (read)
# Arguments: none
# Outputs:   log lines and Discord notifications
# Returns:   the exit status of the emergency script when it ran (a failure
#            is logged and reported first, then propagated); otherwise the
#            status of the final notification call when the script is missing
#######################################
escalate_to_level3() {
  log "🚨 Still unhealthy after ${ESCALATION_WAIT}s, triggering emergency recovery..."

  # Discord notification (Level 3 starting)
  send_discord_notification "🚨 **Level 3 Emergency Recovery 시작**\n\n${ESCALATION_WAIT}초 대기 후에도 Gateway 복구 안 됨.\nClaude가 자동으로 진단 및 복구를 시도합니다.\n\n예상 소요 시간: 30분\n현재 시각: $(date '+%Y-%m-%d %H:%M:%S')"

  local emergency_script="$HOME/openclaw/scripts/emergency-recovery.sh"

  if [ -f "$emergency_script" ]; then
    # Capture the status instead of letting `set -e` end the run silently,
    # so that a failed recovery leaves a trace in the log and on Discord.
    local rc=0
    bash "$emergency_script" || rc=$?
    if [ "$rc" -ne 0 ]; then
      log "❌ Emergency recovery script failed (exit $rc): $emergency_script"
      send_discord_notification "🚨 **Level 3 실행 실패**\n\nEmergency recovery script exited with status $rc:\n\`$emergency_script\`\n\n수동 개입 필요."
    fi
    return "$rc"
  else
    log "❌ Emergency recovery script not found: $emergency_script"
    send_discord_notification "🚨 **Level 3 실행 실패**\n\nEmergency recovery script not found:\n\`$emergency_script\`\n\n수동 개입 필요."
  fi
}
|
|
|
|
# ============================================
|
|
# Main Logic
|
|
# ============================================
|
|
|
|
#######################################
# One health-check pass: probe the gateway; on failure restart it up to
# MAX_RETRIES times; if that does not help, wait ESCALATION_WAIT seconds,
# probe once more and hand over to Level 3 when it is still down.
# Globals:   MAX_RETRIES (read), ESCALATION_WAIT (read)
# Arguments: none
# Returns:   exits the script with 0 as soon as a restart restores service;
#            otherwise returns after the final log line
#######################################
main() {
  # Keep the loop counter out of the global scope.
  local i

  log "=== Health Check Started (PID: $$) ==="

  # Log rotation (cleanup old logs)
  rotate_old_logs

  # HTTP response check
  if ! check_http; then
    log "⚠️ Gateway unhealthy (HTTP failed)"

    # Restart-and-probe attempts. An arithmetic loop is used instead of
    # `seq 1 N` because BSD/macOS seq counts DOWN when N < 1, which would
    # run restarts even with MAX_RETRIES=0.
    for ((i = 1; i <= MAX_RETRIES; i++)); do
      log "Retry $i/$MAX_RETRIES..."

      if restart_gateway && check_http; then
        log "✅ Recovery successful on retry $i"

        # Discord notification (recovery succeeded)
        send_discord_notification "✅ **Gateway 복구 성공**\n\nLevel 2 Health Check가 Gateway를 재시작하여 복구했습니다.\n- 재시도 횟수: $i/$MAX_RETRIES\n- 현재 시각: $(date '+%Y-%m-%d %H:%M:%S')"

        record_metric "recovery" "success" "$i"
        exit 0
      fi
    done

    log "❌ Recovery failed after $MAX_RETRIES retries"
    log "🚨 Escalating to Level 3 (Claude Emergency Recovery)..."
    record_metric "recovery" "failed" "$MAX_RETRIES"

    # Discord notification (escalating to Level 3)
    send_discord_notification "⚠️ **Level 2 Health Check 실패**\n\nGateway를 ${MAX_RETRIES}회 재시작했으나 복구 실패.\n${ESCALATION_WAIT}초 후 Level 3 (Claude Emergency Recovery)로 escalation합니다.\n\n현재 시각: $(date '+%Y-%m-%d %H:%M:%S')"

    # Wait (5 minutes by default), then do the final verification
    sleep "$ESCALATION_WAIT"

    if ! check_http; then
      escalate_to_level3
    else
      log "✅ Gateway recovered during waiting period"

      # Discord notification (recovered while waiting)
      send_discord_notification "✅ **Gateway 자동 복구됨**\n\n${ESCALATION_WAIT}초 대기 중 Gateway가 스스로 복구되었습니다.\nLevel 3 Emergency Recovery는 실행하지 않습니다."

      record_metric "recovery" "self_healed" 0
    fi
  else
    log "✅ Gateway healthy"
    record_metric "health_check" "healthy" 0
  fi

  log "=== Health Check Completed ==="
}
|
|
|
|
# Run main function
|
|
# Entry point: perform one health-check pass (main is called without arguments).
main
|