AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
@@ -0,0 +1,219 @@
|
||||
#!/bin/bash
set -euo pipefail

# OpenClaw Gateway Health Check (Level 2 Self-Healing)
# Verify the HTTP response -> restart on failure -> re-verify after 5 minutes
# -> escalate to Level 3 if the gateway is still failing.

# ============================================
# Configuration (Override via environment)
# ============================================
# URL probed by the HTTP health check.
GATEWAY_URL="${OPENCLAW_GATEWAY_URL:-http://localhost:18789/}"
# Number of restart attempts before escalating.
MAX_RETRIES="${HEALTH_CHECK_MAX_RETRIES:-3}"
# Seconds to wait after a successful restart command before re-checking.
RETRY_DELAY="${HEALTH_CHECK_RETRY_DELAY:-30}"
# Seconds to wait after all retries failed before the final check.
ESCALATION_WAIT="${HEALTH_CHECK_ESCALATION_WAIT:-300}"
# Directory that receives the daily log file and the metrics file.
LOG_DIR="${OPENCLAW_MEMORY_DIR:-$HOME/openclaw/memory}"
# One log file per calendar day.
LOG_FILE="$LOG_DIR/healthcheck-$(date +%Y-%m-%d).log"
# Per-request timeout (seconds) passed to curl.
HTTP_TIMEOUT="${HEALTH_CHECK_HTTP_TIMEOUT:-10}"

# Performance metrics (JSON Lines, appended by record_metric)
METRICS_FILE="$LOG_DIR/.healthcheck-metrics.json"
|
||||
|
||||
# Prevent overlapping runs with a lock file.
# The lock is created atomically (noclobber makes `>` fail if the file already
# exists) and stores the owner's PID, so a lock left behind by a run that was
# killed before its EXIT trap fired can be detected and reclaimed instead of
# blocking every future run.
LOCKFILE=/tmp/openclaw-healthcheck.lock
if ! (set -o noclobber; echo "$$" > "$LOCKFILE") 2>/dev/null; then
  LOCK_OWNER_PID=$(cat "$LOCKFILE" 2>/dev/null || true)
  if [ -n "$LOCK_OWNER_PID" ] && kill -0 "$LOCK_OWNER_PID" 2>/dev/null; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Previous health check still running, skipping..."
    exit 0
  fi
  # Owner is gone (or the lock carries no PID): treat it as stale and take over.
  echo "$$" > "$LOCKFILE"
fi
# Installed only after the lock is ours, so a skipped run never removes
# another instance's lock.
trap 'rm -f "$LOCKFILE"' EXIT
|
||||
|
||||
# Create log directory if not exists
mkdir -p "$LOG_DIR"

# Load environment variables from the first .env file that exists.
# NOTE: this runs after the configuration values earlier in the file were
# resolved, so environment-style overrides placed in .env (for example
# OPENCLAW_GATEWAY_URL) do not change those already-computed values; only
# variables read from here on, such as DISCORD_WEBHOOK_URL, pick them up.
if [ -f "$HOME/openclaw/.env" ]; then
  # shellcheck source=/dev/null
  source "$HOME/openclaw/.env"
elif [ -f "$HOME/.openclaw/.env" ]; then
  # shellcheck source=/dev/null
  source "$HOME/.openclaw/.env"
fi

# Discord webhook from environment variable (optional)
DISCORD_WEBHOOK="${DISCORD_WEBHOOK_URL:-}"

# Validate webhook URL (optional, warning only).
# Uses echo/tee directly because log() is defined further down the file.
if [ -z "$DISCORD_WEBHOOK" ]; then
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO: DISCORD_WEBHOOK_URL not set. Notifications disabled." | tee -a "$LOG_FILE"
fi
|
||||
|
||||
# ============================================
# Functions
# ============================================
|
||||
|
||||
#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
#######################################
log() {
  # printf emits the message verbatim regardless of echo's option/escape rules.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}
|
||||
|
||||
#######################################
# Post a message to the Discord webhook, if one is configured.
# Notifications are best-effort: a failed or timed-out request is logged and
# never aborts the script, so a Discord or network outage cannot stop the
# recovery/escalation flow under `set -e`.
# Globals:   DISCORD_WEBHOOK (read), HTTP_TIMEOUT (read, defaults to 10)
# Arguments: $1 - message text. JSON escape sequences such as \n written by
#                 the caller are passed through unchanged; raw double quotes,
#                 newlines, carriage returns and tabs are escaped here so the
#                 payload stays valid JSON.
# Outputs:   one log line describing the delivery result
# Returns:   0 when no webhook is configured; otherwise the status of log
#######################################
send_discord_notification() {
  local message="$1"

  # Nothing to do when notifications are disabled.
  [ -n "$DISCORD_WEBHOOK" ] || return 0

  local payload_text="$message"
  # Normalise already-escaped quotes first so they are not escaped twice.
  payload_text=${payload_text//\\\"/\"}
  payload_text=${payload_text//\"/\\\"}
  payload_text=${payload_text//$'\n'/\\n}
  payload_text=${payload_text//$'\r'/\\r}
  payload_text=${payload_text//$'\t'/\\t}

  local response_code
  # `|| response_code=000` keeps a curl failure from tripping errexit.
  response_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time "${HTTP_TIMEOUT:-10}" \
    -X POST "$DISCORD_WEBHOOK" \
    -H "Content-Type: application/json" \
    -d "{\"content\": \"$payload_text\"}" \
    2>&1) || response_code="000"

  if [ "$response_code" = "200" ] || [ "$response_code" = "204" ]; then
    log "✅ Discord notification sent (HTTP $response_code)"
  else
    log "⚠️ Discord notification failed (HTTP $response_code)"
  fi
}
|
||||
|
||||
#######################################
# Probe the gateway once over HTTP and record the outcome as a metric.
# Globals:   GATEWAY_URL (read), HTTP_TIMEOUT (read)
# Arguments: none
# Outputs:   one log line with the result and the elapsed whole seconds
# Returns:   0 when the gateway answered HTTP 200, 1 otherwise
#######################################
check_http() {
  local start_time end_time response_time http_code

  start_time=$(date +%s)

  # curl's -w already prints "000" when no response was received, so a
  # fallback written as `|| echo "000"` inside the substitution would be
  # appended to it and yield "000000". Assign the fallback outside instead.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time "$HTTP_TIMEOUT" \
    "$GATEWAY_URL" 2>/dev/null) || http_code="000"

  end_time=$(date +%s)
  response_time=$((end_time - start_time))

  # Record metric
  record_metric "http_check" "$http_code" "$response_time"

  if [ "$http_code" = "200" ]; then
    log "HTTP check passed (${response_time}s)"
    return 0
  fi

  log "HTTP check failed: HTTP $http_code (${response_time}s)"
  return 1
}
|
||||
|
||||
#######################################
# Restart the gateway via `openclaw gateway restart`.
# The command's output is appended to the log file. After a successful
# restart command the function sleeps RETRY_DELAY seconds before returning.
# Globals:   LOG_FILE (read), RETRY_DELAY (read)
# Arguments: none
# Outputs:   log lines; a "gateway_restart" metric (success with the elapsed
#            seconds, or failed with duration 0)
# Returns:   0 when the restart command succeeded, 1 otherwise
#######################################
restart_gateway() {
  local start_time end_time restart_time

  log "Restarting OpenClaw Gateway..."
  start_time=$(date +%s)

  if ! openclaw gateway restart >> "$LOG_FILE" 2>&1; then
    log "⚠️ Gateway restart command failed"
    record_metric "gateway_restart" "failed" 0
    return 1
  fi

  end_time=$(date +%s)
  restart_time=$((end_time - start_time))

  log "Gateway restart completed (${restart_time}s)"
  record_metric "gateway_restart" "success" "$restart_time"

  sleep "$RETRY_DELAY"
  return 0
}
|
||||
|
||||
#######################################
# Delete health-check log files older than 14 days.
# Rotation is best-effort: with `set -eo pipefail` a failing find would
# otherwise fail the assignment and abort the whole health check.
# Globals:   LOG_DIR (read)
# Arguments: none
# Outputs:   a log line when at least one file was removed
#######################################
rotate_old_logs() {
  local deleted_count

  # -type f restricts deletion to regular files matching the log pattern.
  deleted_count=$(find "$LOG_DIR" -type f -name "healthcheck-*.log" \
    -mtime +14 -delete -print 2>/dev/null | wc -l) || true

  # Some wc implementations pad the count with spaces.
  deleted_count=${deleted_count//[[:space:]]/}

  if [ "${deleted_count:-0}" -gt 0 ]; then
    log "Rotated $deleted_count old log files"
  fi
}
|
||||
|
||||
#######################################
# Append one metric record to the metrics file (JSON Lines format).
# Metrics are best-effort: a write failure prints a warning to stderr and
# does not abort the health check.
# Globals:   METRICS_FILE (read)
# Arguments: $1 - metric name
#            $2 - result label
#            $3 - duration/count as a non-negative integer; an empty or
#                 non-numeric value is recorded as 0 so the line stays
#                 valid JSON
# Returns:   0
#######################################
record_metric() {
  local metric_name="$1"
  local result="$2"
  local duration="${3:-0}"
  local timestamp
  timestamp=$(date +%s)

  case "$duration" in
    '' | *[!0-9]*) duration=0 ;;
    # Force base 10 so leading zeros do not produce an invalid JSON number.
    *) duration=$((10#$duration)) ;;
  esac

  # Append to metrics file (JSON Lines format)
  if ! {
    printf '{"timestamp":%s,"metric":"%s","result":"%s","duration":%s}\n' \
      "$timestamp" "$metric_name" "$result" "$duration" >> "$METRICS_FILE"
  } 2>/dev/null; then
    printf '[%s] WARN: could not write metric to %s\n' \
      "$(date '+%Y-%m-%d %H:%M:%S')" "$METRICS_FILE" >&2
  fi
  return 0
}
|
||||
|
||||
#######################################
# Level 3 escalation: announce it, then run the emergency recovery script.
# Globals:   ESCALATION_WAIT (read), HOME (read)
# Arguments: none
# Outputs:   log lines and Discord notifications
# Returns:   the emergency script's exit status when it ran; when the script
#            is missing, the status of the final notification call
#######################################
escalate_to_level3() {
  log "🚨 Still unhealthy after ${ESCALATION_WAIT}s, triggering emergency recovery..."

  # Discord notification (Level 3 start)
  send_discord_notification "🚨 **Level 3 Emergency Recovery 시작**\n\n${ESCALATION_WAIT}초 대기 후에도 Gateway 복구 안 됨.\nClaude가 자동으로 진단 및 복구를 시도합니다.\n\n예상 소요 시간: 30분\n현재 시각: $(date '+%Y-%m-%d %H:%M:%S')"

  local emergency_script="$HOME/openclaw/scripts/emergency-recovery.sh"

  if [ -f "$emergency_script" ]; then
    # Capture the status explicitly so a failure is logged and reported
    # instead of silently ending the run through errexit.
    local rc=0
    bash "$emergency_script" || rc=$?
    if [ "$rc" -ne 0 ]; then
      log "❌ Emergency recovery script failed (exit code $rc): $emergency_script"
      send_discord_notification "🚨 **Level 3 실행 실패**\n\nEmergency recovery script exited with code $rc:\n\`$emergency_script\`\n\n수동 개입 필요."
    fi
    return "$rc"
  else
    log "❌ Emergency recovery script not found: $emergency_script"
    send_discord_notification "🚨 **Level 3 실행 실패**\n\nEmergency recovery script not found:\n\`$emergency_script\`\n\n수동 개입 필요."
  fi
}
|
||||
|
||||
# ============================================
# Main Logic
# ============================================
|
||||
|
||||
#######################################
# Entry point: check the gateway, restart it up to MAX_RETRIES times when it
# is unhealthy, and escalate to Level 3 if it is still down after waiting
# ESCALATION_WAIT seconds.
# Globals:   MAX_RETRIES (read), ESCALATION_WAIT (read)
# Arguments: none
# Outputs:   log lines, metrics and Discord notifications via the helpers
# Returns:   exits 0 right after a successful restart-based recovery;
#            otherwise returns after logging completion
#######################################
main() {
  # Loop counter kept local so it does not leak into the global scope.
  local attempt

  log "=== Health Check Started (PID: $$) ==="

  # Log rotation (cleanup old logs)
  rotate_old_logs

  # HTTP response check
  if ! check_http; then
    log "⚠️ Gateway unhealthy (HTTP failed)"

    # Retry up to MAX_RETRIES times. A C-style loop is used instead of
    # `seq 1 N`, which counts downwards on BSD seq when N is 0.
    for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
      log "Retry $attempt/$MAX_RETRIES..."

      if restart_gateway && check_http; then
        log "✅ Recovery successful on retry $attempt"

        # Discord notification (recovery succeeded)
        send_discord_notification "✅ **Gateway 복구 성공**\n\nLevel 2 Health Check가 Gateway를 재시작하여 복구했습니다.\n- 재시도 횟수: $attempt/$MAX_RETRIES\n- 현재 시각: $(date '+%Y-%m-%d %H:%M:%S')"

        record_metric "recovery" "success" "$attempt"
        exit 0
      fi
    done

    log "❌ Recovery failed after $MAX_RETRIES retries"
    log "🚨 Escalating to Level 3 (Claude Emergency Recovery)..."
    record_metric "recovery" "failed" "$MAX_RETRIES"

    # Discord notification (escalating to Level 3)
    send_discord_notification "⚠️ **Level 2 Health Check 실패**\n\nGateway를 ${MAX_RETRIES}회 재시작했으나 복구 실패.\n${ESCALATION_WAIT}초 후 Level 3 (Claude Emergency Recovery)로 escalation합니다.\n\n현재 시각: $(date '+%Y-%m-%d %H:%M:%S')"

    # Wait ESCALATION_WAIT seconds (5 minutes by default), then do a final check
    sleep "$ESCALATION_WAIT"

    if ! check_http; then
      escalate_to_level3
    else
      log "✅ Gateway recovered during waiting period"

      # Discord notification (recovered while waiting)
      send_discord_notification "✅ **Gateway 자동 복구됨**\n\n${ESCALATION_WAIT}초 대기 중 Gateway가 스스로 복구되었습니다.\nLevel 3 Emergency Recovery는 실행하지 않습니다."

      record_metric "recovery" "self_healed" 0
    fi
  else
    log "✅ Gateway healthy"
    record_metric "health_check" "healthy" 0
  fi

  log "=== Health Check Completed ==="
}
|
||||
|
||||
# Run main function (forwarding any command-line arguments)
main "$@"
|
||||
Reference in New Issue
Block a user