#!/usr/bin/env node
/**
 * Self-Review Validation Script (V3 - Stage 2)
 *
 * Purpose: External validation of self-evaluation results
 * Runs: 1 minute after each self-evaluating cron
 *
 * Validation Layers:
 * 1. Metric Verification (tool errors, completion time, token usage)
 * 2. Format Verification (emoji count, separator count, forbidden phrases)
 * 3. Consistency Verification (compare with recent evaluations)
 *
 * Output: validation-YYYY-MM-DD.jsonl
 */

const fs = require('fs');
const os = require('os');
const path = require('path');

// ============================================================================
// Configuration
// ============================================================================

/**
 * Static configuration for the validator: forbidden phrases, format limits,
 * metric thresholds and the file-system locations used for history and
 * baselines.
 */
const CONFIG = (() => {
  // HOME can be unset (stripped-down service environments, Windows), and
  // path.join(undefined, ...) throws at module load time. Fall back to the
  // home directory reported by the OS in that case.
  const homeDir = process.env.HOME || os.homedir();
  const memoryDir = path.join(homeDir, 'openclaw', 'memory');

  return {
    // Forbidden phrases (from Response Guard)
    FORBIDDEN_PHRASES: [
      '알겠습니다', // "Understood"
      '완료!', // "Done!"
      '완료했습니다', // "I have completed it"
      '처리했습니다', // "I have handled it"
      '설정했습니다', // "I have configured it"
      '확인했습니다', // "I have checked it"
      '기록했습니다', // "I have recorded it"
    ],

    // Format limits
    MAX_EMOJIS: 3,
    MAX_SEPARATORS: 2,

    // Metric thresholds
    MAX_TOOL_ERRORS: 2,
    COMPLETION_TIME_MULTIPLIER: 1.5, // 150% of baseline
    TOKEN_USAGE_MULTIPLIER: 1.3, // 130% of baseline

    // Paths (validation records and baselines share the memory directory)
    MEMORY_DIR: memoryDir,
    VALIDATION_DIR: memoryDir,
    BASELINE_FILE: path.join(memoryDir, 'cron-baselines.json'),
  };
})();

// ============================================================================
// Utility Functions
// ============================================================================

/**
 * Count emoji code points in text.
 *
 * A code point is counted when it lies in one of the three legacy blocks
 * (U+1F300-U+1F9FF, U+2600-U+26FF, U+2700-U+27BF) or carries the Unicode
 * `Emoji_Presentation` property. The property covers emoji outside those
 * blocks (for example U+2B50) that the fixed ranges alone would miss.
 *
 * Counting is per code point, so a multi-code-point sequence (such as a base
 * emoji followed by a skin-tone modifier) contributes more than one.
 *
 * @param {string} text - Text to scan.
 * @returns {number} Number of matching code points; 0 for non-string or empty input.
 */
function countEmojis(text) {
  if (typeof text !== 'string' || text.length === 0) {
    return 0;
  }

  // The `u` flag is required for \u{...} and \p{...} and makes the regex
  // iterate by code point rather than by UTF-16 unit.
  const emojiRegex = /[\u{1F300}-\u{1F9FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\p{Emoji_Presentation}]/gu;
  const matches = text.match(emojiRegex);
  return matches ? matches.length : 0;
}

/**
 * Count markdown separators.
 *
 * A separator is a line that, after trimming surrounding whitespace, consists
 * solely of three or more hyphens.
 *
 * @param {string} text - Text to scan.
 * @returns {number} Number of separator lines; 0 for non-string input.
 */
function countSeparators(text) {
  if (typeof text !== 'string') {
    return 0;
  }

  // trim() also drops a trailing '\r', so CRLF input is handled by the same test.
  return text.split('\n').filter((line) => /^-{3,}$/.test(line.trim())).length;
}

/**
 * Detect forbidden phrases.
 *
 * Matching is a plain, case-sensitive substring search.
 *
 * @param {string} text - Text to scan.
 * @param {string[]} [phrases=CONFIG.FORBIDDEN_PHRASES] - Phrases to look for.
 * @returns {string[]} The phrases that occur in `text`, in the order they
 *   appear in `phrases`; an empty array for non-string input.
 */
function detectForbiddenPhrases(text, phrases = CONFIG.FORBIDDEN_PHRASES) {
  if (typeof text !== 'string') {
    return [];
  }

  return phrases.filter((phrase) => text.includes(phrase));
}

/**
 * Parse the self-evaluation section from cron output.
 *
 * The section starts at a markdown heading containing "## 자기평가"
 * ("self-evaluation") and runs up to the next line that begins with "##", or
 * to the end of the text. Within it the following labelled fields are
 * recognised, each followed by an ASCII or full-width colon:
 *   - 완성도 (completeness): "<score>/<total>"
 *   - 정확성 (accuracy): OK | WARNING | FAIL (case-insensitive)
 *   - 톤 (tone): "Jarvis", or "ChatGPT" optionally followed by "like"
 *   - 간결성 (conciseness): "<n> emoji(s)" somewhere on the same line
 *   - 개선 (improvement): free text up to the end of the line
 *
 * @param {string} output - Full cron output text.
 * @returns {?{completeness: ?{score: number, total: number}, accuracy: ?string,
 *   tone: ?string, conciseness: ?{emojis: number}, improvement: ?string,
 *   raw: string}} Parsed fields (null where a field is absent), or null when
 *   `output` is not a string or contains no self-evaluation section.
 */
function parseSelfEvaluation(output) {
  if (typeof output !== 'string') {
    return null;
  }

  const evalSection = output.match(/##\s*자기평가[\s\S]*?(?=\n##|$)/i);
  if (!evalSection) {
    return null;
  }

  const text = evalSection[0];
  const result = {
    completeness: null,
    accuracy: null,
    tone: null,
    conciseness: null,
    improvement: null,
    raw: text,
  };

  // Every label accepts ':' or the full-width colon U+FF1A, which is common in
  // Korean text.

  // Parse completeness (e.g., "완성도: 3/3")
  const completenessMatch = text.match(/완성도[:\uFF1A]\s*(\d+)\/(\d+)/i);
  if (completenessMatch) {
    result.completeness = {
      score: Number.parseInt(completenessMatch[1], 10),
      total: Number.parseInt(completenessMatch[2], 10),
    };
  }

  // Parse accuracy (e.g., "정확성: OK" or "WARNING")
  const accuracyMatch = text.match(/정확성[:\uFF1A]\s*(OK|WARNING|FAIL)/i);
  if (accuracyMatch) {
    result.accuracy = accuracyMatch[1].toUpperCase();
  }

  // Parse tone (e.g., "톤: Jarvis" or "ChatGPT-like"). The whole "-like"
  // suffix is optional, so a bare "ChatGPT" is recognised too.
  const toneMatch = text.match(/톤[:\uFF1A]\s*(Jarvis|ChatGPT(?:[-\s]?like)?)/i);
  if (toneMatch) {
    result.tone = toneMatch[1].toLowerCase().includes('jarvis') ? 'Jarvis' : 'ChatGPT-like';
  }

  // Parse conciseness (e.g., "간결성: 2 emojis")
  const concisenessMatch = text.match(/간결성[:\uFF1A].*?(\d+)\s*emojis?/i);
  if (concisenessMatch) {
    result.conciseness = {
      emojis: Number.parseInt(concisenessMatch[1], 10),
    };
  }

  // Parse improvement suggestion
  const improvementMatch = text.match(/개선[:\uFF1A]\s*(.+)/i);
  if (improvementMatch) {
    result.improvement = improvementMatch[1].trim();
  }

  return result;
}

/**
 * Load recent evaluations for the consistency check.
 *
 * Reads `validation-YYYY-MM-DD.jsonl` for today and the preceding `days - 1`
 * days, newest file first. Dates are computed in UTC because the file names
 * are derived from `toISOString()`; stepping back in whole 24-hour UTC
 * intervals avoids skipping or repeating a file across a local DST change.
 *
 * Missing files and lines that are not valid JSON are skipped on purpose
 * (best effort). Lines that parse to something other than a plain object are
 * skipped as well, so callers can safely read properties on every record.
 *
 * @param {number} [days=7] - Number of daily files to look back over.
 * @param {string} [dir=CONFIG.VALIDATION_DIR] - Directory holding the JSONL files.
 * @returns {Object[]} Parsed validation records.
 * @throws {Error} If a file exists but cannot be read.
 */
function loadRecentEvaluations(days = 7, dir = CONFIG.VALIDATION_DIR) {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  const evaluations = [];
  const nowMs = Date.now();

  for (let i = 0; i < days; i++) {
    const dateStr = new Date(nowMs - i * MS_PER_DAY).toISOString().split('T')[0];
    const filePath = path.join(dir, `validation-${dateStr}.jsonl`);

    // Read directly instead of existsSync + read, so a file that disappears
    // between the two calls cannot crash the run.
    let contents;
    try {
      contents = fs.readFileSync(filePath, 'utf8');
    } catch (err) {
      if (err.code === 'ENOENT') {
        continue;
      }
      throw err;
    }

    for (const line of contents.split('\n')) {
      if (!line.trim()) {
        continue;
      }

      let record;
      try {
        record = JSON.parse(line);
      } catch (e) {
        // Skip malformed lines (e.g. a partially written record).
        continue;
      }

      if (record !== null && typeof record === 'object' && !Array.isArray(record)) {
        evaluations.push(record);
      }
    }
  }

  return evaluations;
}

/**
 * Load baseline metrics for a cron.
 *
 * @param {string} cronId - Cron job ID used as the key in the baseline file.
 * @param {string} [baselineFile=CONFIG.BASELINE_FILE] - Path of the JSON baseline file.
 * @returns {?Object} The stored baseline entry, or null when the file or the
 *   entry does not exist.
 * @throws {SyntaxError} If the baseline file exists but is not valid JSON.
 * @throws {Error} If the file exists but cannot be read.
 */
function loadBaseline(cronId, baselineFile = CONFIG.BASELINE_FILE) {
  // Read directly instead of existsSync + read, so a file removed between the
  // two calls is treated as "no baseline" rather than crashing.
  let raw;
  try {
    raw = fs.readFileSync(baselineFile, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      return null;
    }
    throw err;
  }

  const baselines = JSON.parse(raw);
  if (baselines === null || typeof baselines !== 'object') {
    return null;
  }

  // Own-property check: a cronId such as "toString" or "constructor" must not
  // resolve to a member inherited from Object.prototype.
  if (!Object.prototype.hasOwnProperty.call(baselines, cronId)) {
    return null;
  }

  return baselines[cronId] || null;
}

/**
 * Update baseline metrics.
 *
 * Appends the run's metrics as a new sample for `cronId`, keeps only the 30
 * most recent samples, recomputes the per-metric averages and writes the
 * baseline file back.
 *
 * Non-finite metric values (e.g. NaN from an unparsable CLI argument) are
 * stored as null and left out of the averages. Without this, one bad sample
 * would turn the average into NaN, and after a JSON round trip (NaN is
 * serialised as null) it would be silently averaged in as 0.
 *
 * @param {string} cronId - Cron job ID used as the key in the baseline file.
 * @param {Object} metrics - Metrics of the run being recorded.
 * @param {number} metrics.completionTime - Completion time in ms.
 * @param {number} metrics.tokenUsage - Token usage.
 * @param {number} metrics.toolErrors - Number of tool errors.
 * @param {string} [baselineFile=CONFIG.BASELINE_FILE] - Path of the JSON baseline file.
 * @returns {void}
 * @throws {SyntaxError} If the baseline file exists but is not valid JSON (the
 *   file is left untouched rather than overwritten).
 */
function updateBaseline(cronId, metrics, baselineFile = CONFIG.BASELINE_FILE) {
  const MAX_SAMPLES = 30;

  let baselines = {};
  try {
    const parsed = JSON.parse(fs.readFileSync(baselineFile, 'utf8'));
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      baselines = parsed;
    }
  } catch (err) {
    // A missing file just means there is no history yet; anything else
    // (including invalid JSON) is surfaced to the caller.
    if (err.code !== 'ENOENT') {
      throw err;
    }
  }

  // Own-property check so IDs like "constructor" do not pick up inherited members.
  const existing = Object.prototype.hasOwnProperty.call(baselines, cronId) ? baselines[cronId] : null;
  const entry =
    existing !== null && typeof existing === 'object' && Array.isArray(existing.samples)
      ? existing
      : { samples: [], avg: {} };
  baselines[cronId] = entry;

  const finiteOrNull = (value) => (Number.isFinite(value) ? value : null);

  // Add new sample
  entry.samples.push({
    timestamp: Date.now(),
    completionTime: finiteOrNull(metrics.completionTime),
    tokenUsage: finiteOrNull(metrics.tokenUsage),
    toolErrors: finiteOrNull(metrics.toolErrors),
  });

  // Keep only the most recent samples
  if (entry.samples.length > MAX_SAMPLES) {
    entry.samples = entry.samples.slice(-MAX_SAMPLES);
  }

  // Recalculate averages over the samples that carry a usable value.
  const average = (key) => {
    const values = entry.samples
      .map((sample) => (sample ? sample[key] : null))
      .filter((value) => Number.isFinite(value));
    if (values.length === 0) {
      return null;
    }
    return values.reduce((sum, value) => sum + value, 0) / values.length;
  };

  entry.avg = {
    completionTime: average('completionTime'),
    tokenUsage: average('tokenUsage'),
    toolErrors: average('toolErrors'),
  };

  // Ensure the target directory exists so the first run does not fail.
  fs.mkdirSync(path.dirname(baselineFile), { recursive: true });
  fs.writeFileSync(baselineFile, JSON.stringify(baselines, null, 2));
}

// ============================================================================
// Validation Logic
// ============================================================================

/**
 * Validate metrics.
 *
 * Raises:
 *   - HIGH_ERROR_RATE (HIGH) when tool errors exceed `MAX_TOOL_ERRORS`;
 *   - PERFORMANCE_DEGRADATION (MEDIUM) when completion time exceeds the
 *     baseline average times `COMPLETION_TIME_MULTIPLIER`;
 *   - TOKEN_USAGE_HIGH (LOW) when token usage exceeds the baseline average
 *     times `TOKEN_USAGE_MULTIPLIER`.
 * The two baseline checks are skipped when no usable (truthy) average exists.
 *
 * @param {Object} metrics - Metrics of the run (completionTime, tokenUsage,
 *   toolErrors, optional toolErrorDetails).
 * @param {?Object} baseline - Baseline entry with an `avg` object, or null.
 * @param {Object} [config=CONFIG] - Thresholds: MAX_TOOL_ERRORS,
 *   COMPLETION_TIME_MULTIPLIER, TOKEN_USAGE_MULTIPLIER.
 * @returns {Object[]} Validation flags (`type`, `severity`, `detail`, optional `evidence`).
 */
function validateMetrics(metrics, baseline, config = CONFIG) {
  const flags = [];
  const actual = metrics || {};
  // A baseline entry without `avg` (malformed or hand-edited file) is treated
  // like a missing baseline instead of throwing.
  const avg = baseline && baseline.avg ? baseline.avg : null;

  // Tool errors
  if (actual.toolErrors > config.MAX_TOOL_ERRORS) {
    flags.push({
      type: 'HIGH_ERROR_RATE',
      severity: 'HIGH',
      detail: `Tool errors: ${actual.toolErrors} (threshold: ${config.MAX_TOOL_ERRORS})`,
      evidence: actual.toolErrorDetails || [],
    });
  }

  // Completion time (if baseline exists)
  if (avg && avg.completionTime) {
    const threshold = avg.completionTime * config.COMPLETION_TIME_MULTIPLIER;
    if (actual.completionTime > threshold) {
      flags.push({
        type: 'PERFORMANCE_DEGRADATION',
        severity: 'MEDIUM',
        detail: `Completion time: ${actual.completionTime}ms (baseline avg: ${avg.completionTime}ms, threshold: ${threshold}ms)`,
      });
    }
  }

  // Token usage (if baseline exists)
  if (avg && avg.tokenUsage) {
    const threshold = avg.tokenUsage * config.TOKEN_USAGE_MULTIPLIER;
    if (actual.tokenUsage > threshold) {
      flags.push({
        type: 'TOKEN_USAGE_HIGH',
        severity: 'LOW',
        detail: `Token usage: ${actual.tokenUsage} (baseline avg: ${avg.tokenUsage}, threshold: ${threshold})`,
      });
    }
  }

  return flags;
}

/**
 * Validate format.
 *
 * Counts emojis, separators and forbidden phrases in the output and raises:
 *   - EMOJI_OVERFLOW (LOW) when the emoji count exceeds `MAX_EMOJIS`;
 *   - SEPARATOR_OVERFLOW (LOW) when the separator count exceeds `MAX_SEPARATORS`;
 *   - FORBIDDEN_PHRASE (MEDIUM) when any forbidden phrase is present;
 *   - INACCURATE_SELF_EVALUATION (MEDIUM) when the emoji count reported in the
 *     self-evaluation differs from the measured one.
 *
 * @param {string} output - Cron output text; non-string input is treated as empty.
 * @param {?Object} selfEval - Parsed self-evaluation, or null if there was none.
 * @param {Object} [config=CONFIG] - Limits: MAX_EMOJIS, MAX_SEPARATORS.
 * @returns {Object[]} Validation flags (`type`, `severity`, `detail`, optional `evidence`).
 */
function validateFormat(output, selfEval, config = CONFIG) {
  const flags = [];
  // Normalise once so the counters below never receive null/undefined.
  const text = typeof output === 'string' ? output : '';

  const actualEmojis = countEmojis(text);
  const actualSeparators = countSeparators(text);
  const forbiddenFound = detectForbiddenPhrases(text);

  // Check emoji count
  if (actualEmojis > config.MAX_EMOJIS) {
    flags.push({
      type: 'EMOJI_OVERFLOW',
      severity: 'LOW',
      detail: `Actual emojis: ${actualEmojis} (limit: ${config.MAX_EMOJIS})`,
    });
  }

  // Check separator count
  if (actualSeparators > config.MAX_SEPARATORS) {
    flags.push({
      type: 'SEPARATOR_OVERFLOW',
      severity: 'LOW',
      detail: `Actual separators: ${actualSeparators} (limit: ${config.MAX_SEPARATORS})`,
    });
  }

  // Check forbidden phrases
  if (forbiddenFound.length > 0) {
    flags.push({
      type: 'FORBIDDEN_PHRASE',
      severity: 'MEDIUM',
      detail: `Forbidden phrases detected: ${forbiddenFound.join(', ')}`,
    });
  }

  // Check self-eval accuracy: the self-reported emoji count must match reality.
  if (selfEval && selfEval.conciseness && selfEval.conciseness.emojis !== actualEmojis) {
    flags.push({
      type: 'INACCURATE_SELF_EVALUATION',
      severity: 'MEDIUM',
      detail: `Self-reported ${selfEval.conciseness.emojis} emojis, actual: ${actualEmojis}`,
      evidence: {
        selfReported: selfEval.conciseness.emojis,
        actual: actualEmojis,
      },
    });
  }

  return flags;
}

/**
 * Validate consistency.
 *
 * Cross-checks the self-evaluation against independent evidence and raises:
 *   - TONE_MISMATCH (MEDIUM) when the tone is self-reported as 'Jarvis' but
 *     forbidden phrases were detected in the output;
 *   - ACCURACY_OPTIMISM_BIAS (LOW) when accuracy is self-reported as 'OK' and
 *     at least three records in `recentEvals` also claimed 'OK' while carrying
 *     a HIGH_ERROR_RATE flag.
 *
 * History records that are null or lack a `validationFlags` array are ignored
 * rather than causing a crash.
 *
 * @param {?Object} selfEval - Parsed self-evaluation, or null if there was none.
 * @param {Object[]} recentEvals - Earlier validation records to compare against.
 * @param {string[]} forbiddenFound - Forbidden phrases detected in the output.
 * @returns {Object[]} Validation flags (`type`, `severity`, `detail`).
 */
function validateConsistency(selfEval, recentEvals, forbiddenFound) {
  const flags = [];

  if (!selfEval) {
    return flags;
  }

  const phrases = Array.isArray(forbiddenFound) ? forbiddenFound : [];
  const history = Array.isArray(recentEvals) ? recentEvals : [];

  // Check tone consistency
  if (selfEval.tone === 'Jarvis' && phrases.length > 0) {
    flags.push({
      type: 'TONE_MISMATCH',
      severity: 'MEDIUM',
      detail: `Self-reported 'Jarvis' but forbidden phrases detected: ${phrases.join(', ')}`,
    });
  }

  // Check accuracy consistency (if recent evals show pattern)
  const recentAccuracyIssues = history.filter(
    (record) =>
      record &&
      record.selfEvaluation &&
      record.selfEvaluation.accuracy === 'OK' &&
      Array.isArray(record.validationFlags) &&
      record.validationFlags.some((flag) => flag && flag.type === 'HIGH_ERROR_RATE')
  );

  if (recentAccuracyIssues.length >= 3 && selfEval.accuracy === 'OK') {
    flags.push({
      type: 'ACCURACY_OPTIMISM_BIAS',
      severity: 'LOW',
      detail: `Self-reported 'OK' but recent history shows ${recentAccuracyIssues.length} false OKs in past 7 days`,
    });
  }

  return flags;
}

// ============================================================================
// Main Validation Function
// ============================================================================

/**
 * Validate a cron execution.
 *
 * Parses the self-evaluation from the output, runs the metric, format and
 * consistency checks, derives a verdict (PASS with no flags; otherwise FAIL
 * if any flag is HIGH, WARN if any is MEDIUM, else INFO), records the run's
 * metrics in the baseline file and appends the result as one JSON line to
 * `validation-YYYY-MM-DD.jsonl` (UTC date) in `CONFIG.VALIDATION_DIR`.
 *
 * @param {Object} input
 * @param {string} input.cronId - Cron job ID
 * @param {string} input.cronName - Cron job name
 * @param {string} input.output - Cron output text
 * @param {Object} input.metrics - Execution metrics
 * @param {number} input.metrics.completionTime - Completion time in ms
 * @param {number} input.metrics.tokenUsage - Token usage
 * @param {number} input.metrics.toolErrors - Number of tool errors
 * @param {Array} input.metrics.toolErrorDetails - Details of tool errors
 * @returns {Object} The validation record that was written.
 */
function validate(input) {
  const { cronId, cronName, output, metrics } = input;

  // One clock reading for both the record timestamp and the file date, so a
  // run that straddles midnight cannot be written into the "wrong" day's file.
  const now = new Date();
  const timestamp = now.getTime();

  // Parse self-evaluation
  const selfEval = parseSelfEvaluation(output);

  // Load baseline as it was BEFORE this run, so the run is judged against history.
  const baseline = loadBaseline(cronId);

  // Load recent evaluations of the same cron
  const recentSameCron = loadRecentEvaluations(7).filter((record) => record.cronId === cronId);

  // Measure the output once; the values feed both the checks and the record.
  const forbiddenFound = detectForbiddenPhrases(output);
  const actualEmojis = countEmojis(output);
  const actualSeparators = countSeparators(output);

  // Validate
  const metricFlags = validateMetrics(metrics, baseline);
  const formatFlags = validateFormat(output, selfEval);
  const consistencyFlags = validateConsistency(selfEval, recentSameCron, forbiddenFound);

  const allFlags = [...metricFlags, ...formatFlags, ...consistencyFlags];

  // Determine verdict from the most severe flag
  let verdict = 'PASS';
  if (allFlags.length > 0) {
    if (allFlags.some((flag) => flag.severity === 'HIGH')) {
      verdict = 'FAIL';
    } else if (allFlags.some((flag) => flag.severity === 'MEDIUM')) {
      verdict = 'WARN';
    } else {
      verdict = 'INFO';
    }
  }

  // Update baseline with this run's metrics
  updateBaseline(cronId, metrics);

  // Prepare result
  const result = {
    cronId,
    cronName,
    timestamp,
    selfEvaluation: selfEval,
    validationFlags: allFlags,
    verdict,
    metrics: {
      actual: metrics,
      baseline: baseline ? baseline.avg : null,
    },
    formatChecks: {
      emojis: {
        actual: actualEmojis,
        selfReported: selfEval && selfEval.conciseness ? selfEval.conciseness.emojis : null,
        limit: CONFIG.MAX_EMOJIS,
      },
      separators: {
        actual: actualSeparators,
        limit: CONFIG.MAX_SEPARATORS,
      },
      forbiddenPhrases: forbiddenFound,
    },
  };

  // Write to JSONL, creating the directory if it does not exist yet.
  const dateStr = now.toISOString().split('T')[0];
  const outputPath = path.join(CONFIG.VALIDATION_DIR, `validation-${dateStr}.jsonl`);
  fs.mkdirSync(CONFIG.VALIDATION_DIR, { recursive: true });
  fs.appendFileSync(outputPath, JSON.stringify(result) + '\n');

  return result;
}

// ============================================================================
// CLI Interface
// ============================================================================

// Command-line entry point; runs only when the file is executed directly.
//
//   validate-self-review.js <cronId> <cronName> <outputFile> <completionTime> <tokenUsage> <toolErrors>
//   cat output.txt | validate-self-review.js <cronId> <cronName> <completionTime> <tokenUsage> <toolErrors>
//
// Six or more arguments select file mode; exactly five select stdin mode.
// Prints the validation record as JSON and exits with status 1 on a FAIL
// verdict or on a usage error.
if (require.main === module) {
  const args = process.argv.slice(2);

  const exitWithUsage = (message) => {
    if (message) {
      console.error(`Error: ${message}`);
    }
    console.error('Usage: validate-self-review.js <cronId> <cronName> <outputFile> <completionTime> <tokenUsage> <toolErrors>');
    console.error(' or: cat output.txt | validate-self-review.js <cronId> <cronName> <completionTime> <tokenUsage> <toolErrors>');
    process.exit(1);
  };

  // Fewer than five arguments cannot carry all three metrics in either mode.
  if (args.length < 5) {
    exitWithUsage();
  }

  // Check if reading from file or stdin
  const readsFromFile = args.length >= 6;
  const cronId = args[0];
  const cronName = args[1];
  const metricArgs = readsFromFile ? args.slice(3, 6) : args.slice(2, 5);
  const [completionTime, tokenUsage, toolErrors] = metricArgs.map((raw) => Number.parseInt(raw, 10));

  // Reject unparsable metrics up front: NaN would pass every threshold
  // comparison and be recorded in the baseline.
  if ([completionTime, tokenUsage, toolErrors].some((value) => Number.isNaN(value))) {
    exitWithUsage('completionTime, tokenUsage and toolErrors must be integers');
  }

  // File descriptor 0 is stdin.
  const output = fs.readFileSync(readsFromFile ? args[2] : 0, 'utf8');

  const result = validate({
    cronId,
    cronName,
    output,
    metrics: {
      completionTime,
      tokenUsage,
      toolErrors,
      toolErrorDetails: [],
    },
  });

  // Print result
  console.log(JSON.stringify(result, null, 2));

  // Signal FAIL through the exit code. Setting exitCode instead of calling
  // process.exit() lets Node flush stdout first, so the JSON above is not
  // truncated when the output is piped.
  if (result.verdict === 'FAIL') {
    process.exitCode = 1;
  }
}

// ============================================================================
// Exports
// ============================================================================

module.exports = {
|
||
validate,
|
||
countEmojis,
|
||
countSeparators,
|
||
detectForbiddenPhrases,
|
||
parseSelfEvaluation,
|
||
loadRecentEvaluations,
|
||
loadBaseline,
|
||
updateBaseline
|
||
};
|