#!/usr/bin/env node
/**
 * Create weekly audit cron for self-evaluation system
 *
 * Runs every Sunday night at 23:30
 * Analyzes all validation results from the past week
 * Generates comprehensive report with recommendations
 */

const { execSync, execFileSync } = require('child_process');

/**
 * Build the job definition for the weekly self-evaluation audit.
 *
 * The returned object is plain data: a cron schedule (`30 23 * * 0`,
 * timezone `Asia/Seoul`), session/wake settings, and an `agentTurn` payload
 * whose `message` is the full audit prompt. Nothing is registered here;
 * `main()` hands the object to the `openclaw` CLI.
 *
 * NOTE(review): the non-ASCII text in `name` and `message` looks mis-encoded
 * (UTF-8 apparently decoded through a legacy single-byte code page), and the
 * prompt's line breaks appear to have been collapsed into spaces, which would
 * also break the embedded bash snippets. Both are kept verbatim here so this
 * change does not alter the payload; restore them from the canonical copy.
 *
 * @returns {object} The cron job definition.
 */
function createWeeklyAuditCron() {
  const job = {
    name: '๐Ÿ”ฌ Weekly Self-Evaluation Audit',
    enabled: true,
    schedule: {
      kind: 'cron',
      expr: '30 23 * * 0', // Sunday 23:30
      tz: 'Asia/Seoul'
    },
    sessionTarget: 'isolated',
    wakeMode: 'next-heartbeat',
    payload: {
      kind: 'agentTurn',
      model: 'anthropic/claude-opus-4-5',
      thinking: 'high',
      channel: 'discord',
      to: 'channel:1468386844621144065',
      deliver: true,
      message: ` ๐Ÿ”ฌ **์ฃผ๊ฐ„ ์ž๊ธฐํ‰๊ฐ€ ๊ฐ์‚ฌ (Weekly Self-Evaluation Audit)** **Mission:** ์ง€๋‚œ 7์ผ๊ฐ„ ์ž๊ธฐํ‰๊ฐ€ ์‹œ์Šคํ…œ ์ „์ฒด๋ฅผ ์‹ฌ์ธต ๋ถ„์„ํ•˜๊ณ  ๊ฐœ์„  ๋ฐฉํ–ฅ ์ œ์‹œ --- ## Phase 1: Data Collection 1. **Validation ๊ฒฐ๊ณผ ์ˆ˜์ง‘:** \`\`\`bash for i in {0..6}; do date=$(date -v-$i'd' '+%Y-%m-%d') if [ -f ~/openclaw/memory/validation-$date.jsonl ]; then echo "Found: validation-$date.jsonl" cat ~/openclaw/memory/validation-$date.jsonl fi done \`\`\` 2. **Self-Review ๊ธฐ๋ก ์ˆ˜์ง‘ (์žˆ์œผ๋ฉด):** \`\`\`bash for i in {0..6}; do date=$(date -v-$i'd' '+%Y-%m-%d') if [ -f ~/openclaw/memory/self-review-$date.md ]; then echo "Found: self-review-$date.md" cat ~/openclaw/memory/self-review-$date.md fi done \`\`\` 3. **Daily ๋ฉ”๋ชจ๋ฆฌ ์Šค์บ”:** \`\`\`bash for i in {0..6}; do date=$(date -v-$i'd' '+%Y-%m-%d') if [ -f ~/openclaw/memory/$date.md ]; then # Extract sections mentioning "์ž๊ธฐํ‰๊ฐ€", "ํ‰๊ฐ€", "reflection" grep -A 5 -B 5 -i "์ž๊ธฐํ‰๊ฐ€\\|ํ‰๊ฐ€\\|reflection" ~/openclaw/memory/$date.md || true fi done \`\`\` --- ## Phase 2: Statistical Analysis ์ˆ˜์ง‘ํ•œ JSONL ๋ฐ์ดํ„ฐ๋ฅผ ๋ถ„์„ํ•˜์—ฌ ๋‹ค์Œ ํ†ต๊ณ„ ์‚ฐ์ถœ: ### 2.1 ์ „์ฒด ํ†ต๊ณ„ - **์ด ํฌ๋ก  ์‹คํ–‰:** ? ํšŒ - **์ž๊ธฐํ‰๊ฐ€ ์ˆ˜ํ–‰:** ? ํšŒ (์ˆ˜ํ–‰๋ฅ : ?%) - **Validation ํ†ต๊ณผ:** ? ํšŒ (ํ†ต๊ณผ์œจ: ?%) - **Validation ์‹คํŒจ:** ? 
ํšŒ (์‹คํŒจ์œจ: ?%) ### 2.2 Verdict ๋ถ„ํฌ - **PASS:** ? ํšŒ (?%) - **WARN:** ? ํšŒ (?%) - **INFO:** ? ํšŒ (?%) - **FAIL:** ? ํšŒ (?%) ### 2.3 ์‹คํŒจ ์›์ธ ๋ถ„์„ ๊ฐ validationFlags์˜ type๋ณ„ ์ง‘๊ณ„: | Flag Type | Count | % | Severity Distribution | |-----------|-------|---|----------------------| | INACCURATE_SELF_EVALUATION | ? | ?% | HIGH: ?, MEDIUM: ?, LOW: ? | | FORBIDDEN_PHRASE | ? | ?% | HIGH: ?, MEDIUM: ?, LOW: ? | | EMOJI_OVERFLOW | ? | ?% | HIGH: ?, MEDIUM: ?, LOW: ? | | SEPARATOR_OVERFLOW | ? | ?% | HIGH: ?, MEDIUM: ?, LOW: ? | | TONE_MISMATCH | ? | ?% | HIGH: ?, MEDIUM: ?, LOW: ? | | HIGH_ERROR_RATE | ? | ?% | HIGH: ?, MEDIUM: ?, LOW: ? | | PERFORMANCE_DEGRADATION | ? | ?% | HIGH: ?, MEDIUM: ?, LOW: ? | | ... | ... | ... | ... | ### 2.4 ํฌ๋ก ๋ณ„ ์„ฑ์ ํ‘œ ๊ฐ ํฌ๋ก ์˜ ํ†ต๊ณผ์œจ: | Cron Name | Executions | PASS | WARN | INFO | FAIL | Pass Rate | |-----------|-----------|------|------|------|------|-----------| | TQQQ 15๋ถ„ ๋ชจ๋‹ˆํ„ฐ๋ง | ? | ? | ? | ? | ? | ?% | | Daily Stock Briefing | ? | ? | ? | ? | ? | ?% | | Trend Hunter | ? | ? | ? | ? | ? | ?% | | ... | ... | ... | ... | ... | ... | ... | --- ## Phase 3: Pattern Analysis (Deep Thinking) ### 3.1 ๋ฐ˜๋ณต ํŒจํ„ด ๋ฐœ๊ฒฌ **์ž๋ฌธ:** - ํŠน์ • ํฌ๋ก ์ด ๊ณ„์† ๊ฐ™์€ ์‹ค์ˆ˜๋ฅผ ๋ฐ˜๋ณตํ•˜๋Š”๊ฐ€? - ํŠน์ • ์‹œ๊ฐ„๋Œ€์— ํ’ˆ์งˆ์ด ๋–จ์–ด์ง€๋Š”๊ฐ€? (ํ”ผ๋กœ๋„ ์˜ํ–ฅ?) - ํŠน์ • ์œ ํ˜•์˜ ์ž‘์—…์—์„œ ์ž๊ธฐํ‰๊ฐ€๊ฐ€ ๋ถ€์ •ํ™•ํ•œ๊ฐ€? - ๊ฐœ์„  ์ œ์•ˆ์ด ์‹ค์ œ๋กœ ๋ฐ˜์˜๋˜๋Š”๊ฐ€? (Week 1 ์ œ์•ˆ โ†’ Week 2 ๊ฐœ์„  ์—ฌ๋ถ€) **์˜ˆ์‹œ:** - "TQQQ ํฌ๋ก ์€ 7ํšŒ ์ค‘ 5ํšŒ๊ฐ€ TONE_MISMATCH โ†’ ๊ธˆ์ง€ ํ‘œํ˜„์„ ๊ณ„์† ์‚ฌ์šฉ" - "์ €๋… ํฌ๋ก ๋“ค(19:00~23:00)์€ ํ†ต๊ณผ์œจ 62%, ์•„์นจ ํฌ๋ก ๋“ค(06:00~09:00)์€ ํ†ต๊ณผ์œจ 91% โ†’ ํ”ผ๋กœ๋„ ์˜ํ–ฅ?" 
- "Trend Hunter๋Š” INACCURATE_SELF_EVALUATION 3ํšŒ โ†’ ๋ณต์žกํ•œ ์ž‘์—…์—์„œ ์ž๊ธฐํ‰๊ฐ€ ์–ด๋ ค์›€" ### 3.2 ์ž๊ธฐํ‰๊ฐ€ ์‹ ๋ขฐ๋„ ์ธก์ • **Accuracy Score ๊ณ„์‚ฐ:** - Self-reported "OK" but validation found errors: -1 point - Self-reported "Jarvis" but forbidden phrases: -1 point - Self-reported emoji count โ‰  actual: -0.5 point - Accurate self-evaluation: +1 point ๊ฐ ํฌ๋ก ์˜ ํ‰๊ท  Accuracy Score: - Score > 0.8: ์‹ ๋ขฐ๋„ ๋†’์Œ (์ž๊ธฐํ‰๊ฐ€ ์ •ํ™•) - Score 0.5~0.8: ๋ณดํ†ต (๊ฐ€๋” ๋ถ€์ •ํ™•) - Score < 0.5: ์‹ ๋ขฐ๋„ ๋‚ฎ์Œ (์ž๊ธฐํ‰๊ฐ€ ๋ถˆ์‹ ) ### 3.3 ๊ฐœ์„  ํŠธ๋ Œ๋“œ ๋ถ„์„ **Week-over-Week ๋น„๊ต:** - ์ด๋ฒˆ ์ฃผ ํ†ต๊ณผ์œจ vs ์ง€๋‚œ ์ฃผ ํ†ต๊ณผ์œจ - ๊ฐœ์„ ๋œ ํฌ๋ก , ์•…ํ™”๋œ ํฌ๋ก  - ์ƒˆ๋กœ ๋ฐœ์ƒํ•œ ๋ฌธ์ œ, ํ•ด๊ฒฐ๋œ ๋ฌธ์ œ --- ## Phase 4: Root Cause Analysis ์‹คํŒจ ์‚ฌ๋ก€ Deep Dive (์ƒ์œ„ 3๊ฐœ): **์˜ˆ์‹œ ๋ถ„์„:** ### Case 1: TQQQ ํฌ๋ก  - TONE_MISMATCH (5ํšŒ ๋ฐ˜๋ณต) **์ฆ๊ฑฐ:** - 2026-02-01: "๋ณ€๋™ ๊ฐ์ง€ํ–ˆ์Šต๋‹ˆ๋‹ค" (forbidden) - 2026-02-03: "ํ™•์ธํ–ˆ์Šต๋‹ˆ๋‹ค" (forbidden) - 2026-02-04: "์•Œ๊ฒ ์Šต๋‹ˆ๋‹ค" (forbidden) - ... **Self-Evaluation:** - ๋ชจ๋‘ "โœ… ํ†ค: Jarvis"๋ผ๊ณ  ์ž๊ธฐํ‰๊ฐ€ โ†’ ๋ถ€์ •ํ™• **Root Cause:** - Response Guard ๊ทœ์น™์„ ์•Œ๊ณ  ์žˆ์œผ๋‚˜ ์‹ค์ œ ์‘๋‹ต ์ž‘์„ฑ ์‹œ ์Šต๊ด€์ ์œผ๋กœ ChatGPT ํ†ค ์‚ฌ์šฉ - Pre-Flight Checklist๋ฅผ ๊ฑด๋„ˆ๋œ€ (์ฒดํฌ๋ฐ•์Šค๊ฐ€ ์˜๋ฏธ ์—†์Œ) - ์ž๊ธฐํ‰๊ฐ€ ์‹œ์ ์— ์ด๋ฏธ ๊ธˆ์ง€ ํ‘œํ˜„ ์‚ฌ์šฉํ•œ ๊ฒƒ์„ ์žŠ์–ด๋ฒ„๋ฆผ **Why it persists:** - Reflection์—์„œ "๋‹ค์Œ์—” ์ฒดํฌ" ์ œ์•ˆํ–ˆ์œผ๋‚˜ ์‹ค์ œ๋กœ ์•ˆ ํ•จ - Feedback loop ๋ถ€์žฌ (์ด์ „ ์‹คํŒจ๋ฅผ ๋‹ค์Œ ์‹คํ–‰ ์‹œ ์ƒ๊ธฐ์‹œํ‚ค์ง€ ์•Š์Œ) **Recommended Fix:** - ์‘๋‹ต ์ž‘์„ฑ ์ „ Response Guard ์žฌ์ฃผ์ž… (ํ”„๋กฌํ”„ํŠธ์— ๊ธˆ์ง€ ํ‘œํ˜„ ๋ฆฌ์ŠคํŠธ ํฌํ•จ) - ๋˜๋Š” ์‘๋‹ต ์ž‘์„ฑ ํ›„ ์ž๋™ ์Šค์บ” โ†’ ๊ธˆ์ง€ ํ‘œํ˜„ ๋ฐœ๊ฒฌ ์‹œ ์žฌ์ž‘์„ฑ ๊ฐ•์ œ --- ## Phase 5: Recommendations ### 5.1 ์ฆ‰์‹œ ์กฐ์น˜ (Critical) 1. 
**์ž๊ธฐํ‰๊ฐ€ ๊ธฐ์ค€ ๋ช…ํ™•ํ™”** - [ ] "OK"์˜ ์ •์˜ ๋ฌธ์„œํ™”: 0 tool errors, 0 data inaccuracies - [ ] "Jarvis"์˜ ์ •์˜ ๋ฌธ์„œํ™”: 0 forbidden phrases, witty opening - [ ] AGENTS.md์— ์ฒดํฌ๋ฆฌ์ŠคํŠธ ๊ตฌ์ฒดํ™” 2. **Evaluation ์ „ ์ฒดํฌ๋ฆฌ์ŠคํŠธ ๊ฐ•์ œ ์‹คํ–‰** - [ ] Pre-Flight Checklist๋ฅผ ์˜๋ฌดํ™” - [ ] ๊ธˆ์ง€ ํ‘œํ˜„ ์Šค์บ” ์ž๋™ํ™” (์Šคํฌ๋ฆฝํŠธ) - [ ] ํฌ๋งท ์นด์šดํŠธ ์ž๋™ ์ œ๊ณต (์ด๋ชจ์ง€, ๊ตฌ๋ถ„์„ ) 3. **Feedback Loop ๊ตฌํ˜„** - [ ] Validation ์‹คํŒจ ์‹œ ๋‹ค์Œ ํฌ๋ก ์— ๊ฒฝ๊ณ  ์ฃผ์ž… - [ ] "์ง€๋‚œ๋ฒˆ ๋‹น์‹ ์€ X๋ฅผ ๋†“์ณค์Šต๋‹ˆ๋‹ค. ์ด๋ฒˆ์—” ์ฒดํฌํ•˜์„ธ์š”." ### 5.2 ์ค‘๊ธฐ ์กฐ์น˜ (Important) 4. **Reflection ํ’ˆ์งˆ ๊ฐœ์„ ** - [ ] "์™œ ์ด๋ ‡๊ฒŒ ํ‰๊ฐ€ํ–ˆ๋‚˜?" ์งˆ๋ฌธ ์ถ”๊ฐ€ - [ ] ํ‰๊ฐ€ ๊ทผ๊ฑฐ ๋ช…์‹œ (์˜ˆ: "OK - yf ์Šคํฌ๋ฆฝํŠธ exit 0") - [ ] Root cause analysis ์‹ฌํ™” 5. **Baseline ์ •ํ™•๋„ ํ–ฅ์ƒ** - [ ] Baseline ๋ฐ์ดํ„ฐ 30๊ฐœ ์ƒ˜ํ”Œ ์ถ•์  (ํ˜„์žฌ: ?๊ฐœ) - [ ] Metric thresholds ์กฐ์ • (false positive ์ค„์ด๊ธฐ) ### 5.3 ์žฅ๊ธฐ ์กฐ์น˜ (Nice to Have) 6. **LLM-as-Judge (Opus ๋ชจ๋ธ๋กœ ์ผ๋ถ€ ์žฌํ‰๊ฐ€)** - [ ] ์ฃผ๊ฐ„ ๊ฐ์‚ฌ ์‹œ Validation FAIL ๊ฑด ์ค‘ 10๊ฐœ ์ƒ˜ํ”Œ๋ง - [ ] Opus ๋ชจ๋ธ๋กœ ์žฌํ‰๊ฐ€ (Haiku ์ž๊ธฐํ‰๊ฐ€ vs Opus ํ‰๊ฐ€ ๋น„๊ต) - [ ] ์ •ํ™•๋„ ์ธก์ • 7. **Human-in-the-Loop (์›” 1ํšŒ ์ •์šฐ๋‹˜ ๋ฆฌ๋ทฐ)** - [ ] ์›”๋ง์— ํ•œ ๋‹ฌ์น˜ ๊ฐ์‚ฌ ๋ณด๊ณ ์„œ ์ƒ์„ฑ - [ ] ์ •์šฐ๋‹˜๊ป˜ top 5 ๋ฌธ์ œ ์ผ€์ด์Šค ์ œ์ถœ - [ ] ํ”ผ๋“œ๋ฐฑ ๋ฐ˜์˜ --- ## Phase 6: Report Generation ### Executive Summary **์ฃผ๊ฐ„ ์„ฑ์ : [A/B/C/D/F]** - ํ†ต๊ณผ์œจ: ?% (๋ชฉํ‘œ: 90%) - ๊ฐœ์„  ํŠธ๋ Œ๋“œ: [โ†‘ ์ƒ์Šน / โ†’ ์ •์ฒด / โ†“ ํ•˜๋ฝ] - ์ฃผ์š” ๋ฌธ์ œ: [Top 3 flags] **ํ•ต์‹ฌ ๋ฐœ๊ฒฌ:** 1. [๊ฐ€์žฅ ์ค‘์š”ํ•œ ํŒจํ„ด 1๊ฐœ] 2. [๋‘ ๋ฒˆ์งธ ์ค‘์š”ํ•œ ํŒจํ„ด] 3. [์„ธ ๋ฒˆ์งธ ์ค‘์š”ํ•œ ํŒจํ„ด] **๊ถŒ์žฅ ์กฐ์น˜:** 1. [์ฆ‰์‹œ ์กฐ์น˜ 1๊ฐœ] (์˜ˆ์ƒ ํšจ๊ณผ: ?% ๊ฐœ์„ ) 2. 
[์ค‘๊ธฐ ์กฐ์น˜ 1๊ฐœ] (์˜ˆ์ƒ ํšจ๊ณผ: ?% ๊ฐœ์„ ) **๋‹ค์Œ ์ฃผ ๋ชฉํ‘œ:** - ํ†ต๊ณผ์œจ ?% โ†’ ?% ๋‹ฌ์„ฑ - [ํŠน์ • ํฌ๋ก ] ๊ฐœ์„  ์ง‘์ค‘ --- ### ์ƒ์„ธ ๋ฐ์ดํ„ฐ (์œ„์—์„œ ์‚ฐ์ถœํ•œ ํ†ต๊ณ„ ํ‘œ ์ „๋ถ€ ์ฒจ๋ถ€) --- ## Final Output ์œ„ ๋ถ„์„์„ ๋ชจ๋‘ ์ˆ˜ํ–‰ํ•œ ํ›„, ๋‹ค์Œ ํ˜•์‹์œผ๋กœ ๋ณด๊ณ : \`\`\` ๐Ÿ”ฌ **์ฃผ๊ฐ„ ์ž๊ธฐํ‰๊ฐ€ ๊ฐ์‚ฌ ๋ณด๊ณ ์„œ** ๐Ÿ“… 2026-02-01 ~ 2026-02-07 [Executive Summary] [์ƒ์„ธ ํ†ต๊ณ„] [ํŒจํ„ด ๋ถ„์„] [Root Cause ์‚ฌ๋ก€] [๊ถŒ์žฅ ์กฐ์น˜] \`\`\` **์ค‘์š”:** - Opus + Thinking High๋กœ ์‹ฌ์ธต ๋ถ„์„ - ํ‘œ๋ฉด์  ์ˆซ์ž๊ฐ€ ์•„๋‹Œ ๊ทผ๋ณธ ์›์ธ ํŒŒ์•… - ์‹คํ–‰ ๊ฐ€๋Šฅํ•œ ๊ตฌ์ฒด์  ์กฐ์น˜ ์ œ์‹œ - ๋‹ค์Œ ์ฃผ ๊ฐœ์„  ๋ชฉํ‘œ ๋ช…ํ™•ํžˆ ์„ค์ • `.trim()
    }
  };

  return job;
}

/**
 * Register the weekly audit job by running `openclaw cron add <json>`.
 *
 * The job JSON is passed as a single argv entry through `execFileSync`, so no
 * shell is involved. The previous implementation built a single-quoted shell
 * string and replaced `'` with `\'`; POSIX single quotes have no escape
 * character, so the quotes inside the prompt (e.g. `$i'd'`) ended the
 * argument early and exposed `$(...)` / `$date` to shell expansion.
 *
 * If the CLI fails with an error whose message contains "already exists", the
 * job is treated as present and the function returns normally. Any other
 * failure is logged and rethrown.
 *
 * @returns {Promise<void>}
 * @throws {Error} The error raised by `execFileSync` for any failure other
 *   than "already exists".
 */
async function main() {
  console.log('๐Ÿ”ฌ Creating weekly self-evaluation audit cron...\n');

  try {
    const job = createWeeklyAuditCron();
    const jobJson = JSON.stringify(job);

    // Argument array, not a command string: the JSON reaches the CLI
    // byte-for-byte regardless of the quotes, `$` or backticks it contains.
    execFileSync('openclaw', ['cron', 'add', jobJson], { encoding: 'utf8' });

    console.log(`โœ… Created weekly audit cron`);
    console.log(`   Schedule: ${job.schedule.expr} (Every Sunday 23:30 KST)`);
    console.log(`   Model: ${job.payload.model} (Opus + Thinking High)`);
    console.log(`\n๐Ÿ“Š This cron will:`);
    console.log(`   1. Analyze all validation results from past 7 days`);
    console.log(`   2. Identify patterns and root causes`);
    console.log(`   3. Calculate accuracy scores per cron`);
    console.log(`   4. Generate comprehensive report with recommendations`);
    console.log(`   5. Set goals for next week`);
    console.log(`\nโœจ First run: Next Sunday at 23:30`);
  } catch (e) {
    // Presumably the CLI reports a duplicate job with this phrase on stderr,
    // which child_process folds into the error message — verify against the CLI.
    if (e.message.includes('already exists')) {
      console.log(`โญ๏ธ  Skipped: Weekly audit cron already exists`);
    } else {
      console.log(`โŒ Failed to create weekly audit cron:`);
      console.log(`   Error: ${e.message}`);
      throw e;
    }
  }
}

if (require.main === module) {
  main().catch((err) => {
    console.error(err);
    // Report the failure to the caller instead of exiting with status 0.
    process.exitCode = 1;
  });
}

module.exports = { createWeeklyAuditCron };