Files
openclaw-backups/research/DR-0002-glm5-kimi-codex-claude-minimax-coding-comparison/generate_report.py

219 lines
9.8 KiB
Python

#!/usr/bin/env python3
"""
Generate report for GLM-5 vs Kimi K2.5 vs Codex 5.3 vs Claude Opus 4.6 vs Sonnet 4.6 vs MiniMax M2.5 Agentic Coding Comparison
"""
import json
import yaml
import os
from glob import glob
# Canonical category name -> the key spellings under which that category's
# nested dict may appear in a result; passed to get_field_value() as the
# ``category_mapping`` argument.
CATEGORY_MAPPING = {
    "Model_Info": ["Model_Info", "model_info"],
    "Performance_Benchmarks": [
        "Performance_Benchmarks",
        "performance_benchmarks",
        "performance",
    ],
    "Pricing": ["Pricing", "pricing"],
    "Agentic_Capabilities": [
        "Agentic_Capabilities",
        "agentic_capabilities",
        "capabilities",
    ],
    "User_Experiences": [
        "User_Experiences",
        "user_experiences",
        "user_experience",
    ],
    "Best_Use_Cases": ["Best_Use_Cases", "best_use_cases", "use_cases"],
    "Opus_Replacement_Suitability": [
        "Opus_Replacement_Suitability",
        "opus_replacement_suitability",
        "replacement",
    ],
}
def load_json_results(results_dir):
    """Load every ``*.json`` file directly inside *results_dir*.

    Args:
        results_dir: Directory to scan (not searched recursively).

    Returns:
        A dict mapping each file's base name, with the trailing ``.json``
        extension removed, to its parsed JSON content. The dict is empty
        when no file matches.

    Raises:
        OSError: If a matched file cannot be opened or read.
        json.JSONDecodeError: If a matched file is not valid JSON.
    """
    results = {}
    # Sort the paths so the resulting dict order is reproducible instead of
    # depending on the order in which the filesystem lists the directory.
    for json_file in sorted(glob(os.path.join(results_dir, "*.json"))):
        # splitext removes only the final extension; str.replace('.json', '')
        # would also strip a ".json" occurring in the middle of the name.
        name = os.path.splitext(os.path.basename(json_file))[0]
        # Read as UTF-8 explicitly rather than with the locale's default.
        with open(json_file, "r", encoding="utf-8") as f:
            results[name] = json.load(f)
    return results
def load_fields(fields_file):
    """Load field definitions from a YAML file.

    Args:
        fields_file: Path to the YAML file to parse.

    Returns:
        The parsed YAML document. An empty document (which ``safe_load``
        reports as ``None``) is returned as an empty dict so callers can
        use ``.get()`` on the result without a ``None`` check.

    Raises:
        OSError: If the file cannot be opened or read.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Read as UTF-8 explicitly rather than with the locale's default.
    with open(fields_file, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
def get_field_value(data, field_name, category_mapping=None):
    """Look up *field_name* in *data*, checking one level of nesting.

    Lookup order:
      1. ``data[field_name]`` directly;
      2. inside the dict stored under any key listed in *category_mapping*
         (categories and their aliases are tried in mapping order);
      3. inside any dict-valued top-level entry of *data*.

    Only one level of nesting is searched; deeper structures are not
    traversed.

    Args:
        data: Parsed result for one model; expected to be a dict.
        field_name: Key to look for.
        category_mapping: Optional dict mapping a category name to the list
            of keys under which that category may appear in *data*.

    Returns:
        The first value found in the order above, or ``None`` when the
        field is absent or *data* is not a dict. A stored ``None`` is
        therefore indistinguishable from a missing field.
    """
    # A result file whose top level is a list/scalar/null has no fields;
    # report "not found" instead of failing on ``.items()``.
    if not isinstance(data, dict):
        return None

    if field_name in data:
        return data[field_name]

    if category_mapping:
        for aliases in category_mapping.values():
            for alias in aliases:
                section = data.get(alias)
                if isinstance(section, dict) and field_name in section:
                    return section[field_name]

    # Fall back to any nested dict, whatever key it is stored under.
    for section in data.values():
        if isinstance(section, dict) and field_name in section:
            return section[field_name]

    return None
def format_value(value):
    """Format a parsed JSON value as a single line of display text.

    Args:
        value: ``None``, a scalar, a list, or a dict.

    Returns:
        * ``"N/A"`` for ``None``;
        * ``"None"`` for an empty list or an empty dict;
        * list items joined with ``", "``;
        * dict entries rendered as ``key: value`` and joined with ``"; "``;
        * ``str(value)`` for anything else.

        Items nested inside lists and dicts are formatted with these same
        rules, so nested containers do not show up as Python ``repr`` text.
    """
    if value is None:
        return "N/A"
    if isinstance(value, list):
        if not value:
            return "None"
        return ", ".join(format_value(item) for item in value)
    if isinstance(value, dict):
        # Treat an empty dict like an empty list instead of emitting "".
        if not value:
            return "None"
        return "; ".join(
            f"{key}: {format_value(item)}" for key, item in value.items()
        )
    return str(value)
def is_uncertain(data, field_name):
    """Return whether *field_name* is listed under ``data['uncertain']``.

    Args:
        data: Parsed result for one model; expected to be a dict.
        field_name: Field name to look for.

    Returns:
        ``True`` when *field_name* is contained in the ``uncertain`` entry.
        ``False`` when it is not, when the entry is missing or null, or
        when *data* is not a dict.
    """
    if not isinstance(data, dict):
        return False
    # ``"uncertain": null`` in the JSON yields None, which would make the
    # ``in`` test raise; treat it the same as an absent entry.
    uncertain_list = data.get('uncertain') or []
    return field_name in uncertain_list
# Models covered by the report, in display order. Each entry is the base
# name of the model's JSON file in the results directory.
_REPORT_MODELS = (
    'GLM-5',
    'Kimi_K2.5',
    'Codex_5.3',
    'Claude_Opus_4.6',
    'Claude_Sonnet_4.6',
    'MiniMax_M2.5',
)

# (field name, suffix) for the value columns of the executive summary table,
# in column order.
_SUMMARY_COLUMNS = (
    ('swe_bench_verified_score', ''),
    ('input_price_per_1m', ''),
    ('output_price_per_1m', ''),
    ('context_window', ''),
    ('replacement_confidence_score', '/10'),
)

# Fixed closing sections of the report (analysis and recommendations).
_STATIC_ANALYSIS_LINES = (
    "## Comparative Analysis",
    "",
    "### Best Value for Money",
    "",
    "1. **MiniMax M2.5** - 10x cheaper than Opus with decent capabilities for simple tasks",
    "2. **Kimi K2.5** - Best balance of capability and cost with massive context window",
    "3. **Claude Sonnet 4.6** - 90-95% of Opus capability at 60% input cost",
    "",
    "### Best for Complex Coding",
    "",
    "1. **Claude Opus 4.6** - Still the benchmark for complex reasoning and safety-critical code",
    "2. **Codex 5.3** - Purpose-built for coding, excellent for pure software development",
    "3. **Claude Sonnet 4.6** - Nearly matches Opus for most practical coding tasks",
    "",
    "### Best Opus 4.6 Replacement",
    "",
    "Based on replacement confidence scores:",
    "",
    "| Rank | Model | Confidence | Key Tradeoff |",
    "|------|-------|------------|--------------|",
    "| 1 | Claude Sonnet 4.6 | 9/10 | Same output price, 40% cheaper input |",
    "| 2 | Codex 5.3 | 9/10 | Better at pure coding, less versatile |",
    "| 3 | Kimi K2.5 | 8/10 | 2-3x cheaper, larger context |",
    "| 4 | MiniMax M2.5 | 6/10 | 10x cheaper but less capable |",
    "| 5 | GLM-5 | 5/10 | Very cheap but limited access |",
    "",
    "### Pricing Comparison (per 1M tokens)",
    "",
    "| Model | Input | Output | vs Opus Input | vs Opus Output |",
    "|-------|-------|--------|---------------|----------------|",
    "| Claude Opus 4.6 | $5.00 | $15.00 | baseline | baseline |",
    "| Claude Sonnet 4.6 | $3.00 | $15.00 | 40% cheaper | same |",
    "| Codex 5.3 | $3.00 | $12.00 | 40% cheaper | 20% cheaper |",
    "| Kimi K2.5 | $2.00 | $8.00 | 60% cheaper | 47% cheaper |",
    "| GLM-5 | $0.50 | $2.00 | 90% cheaper | 87% cheaper |",
    "| MiniMax M2.5 | $0.50 | $2.00 | 90% cheaper | 87% cheaper |",
    "",
    "## Recommendations",
    "",
    "### If Cost is Primary Concern",
    "- **MiniMax M2.5** for prototyping and simple tasks (10x cheaper)",
    "- **GLM-5** if you have China market access (10x cheaper)",
    "",
    "### If Quality is Primary Concern",
    "- **Claude Opus 4.6** for mission-critical and complex reasoning",
    "- **Codex 5.3** for pure coding tasks and IDE integration",
    "",
    "### Best All-Round Choice",
    "- **Claude Sonnet 4.6** - Recommended first choice before trying Opus",
    "- **Kimi K2.5** - Best non-Anthropic option with excellent value",
    "",
)


def _display_name(name):
    """Turn a result name such as ``Kimi_K2.5`` into ``Kimi K2.5``."""
    return name.replace('_', ' ')


def _heading_anchor(heading):
    """Return the link anchor for *heading* using the GitHub convention.

    The heading is lower-cased, every character other than letters, digits,
    spaces, hyphens and underscores is dropped, and spaces become hyphens
    (``Kimi K2.5`` -> ``kimi-k25``).
    """
    kept = "".join(ch for ch in heading.lower() if ch.isalnum() or ch in " -_")
    return kept.replace(" ", "-")


def _summary_text(value, suffix=""):
    """Format a summary value, using ``N/A`` (without *suffix*) if missing.

    Only ``None`` and empty strings/lists/dicts count as missing, so a
    legitimate ``0`` is shown as ``0`` rather than ``N/A``.
    """
    if value is None or (isinstance(value, (str, list, dict)) and not value):
        return "N/A"
    return f"{format_value(value)}{suffix}"


def _table_cell(text):
    """Make *text* safe inside a Markdown table cell.

    Line breaks are flattened to spaces and ``|`` is escaped so a value
    cannot split or terminate the table row.
    """
    return " ".join(text.splitlines()).replace("|", "\\|")


def _summary_table_lines(models, results):
    """Build the executive summary table, one row per entry in *models*."""
    lines = [
        "| Model | SWE-bench Est. | Input $/1M | Output $/1M | Context | Opus Replacement Score |",
        "|-------|----------------|------------|-------------|---------|------------------------|",
    ]
    for name in models:
        data = results[name]
        cells = [_display_name(name)]
        for field_name, suffix in _SUMMARY_COLUMNS:
            value = get_field_value(data, field_name, CATEGORY_MAPPING)
            cells.append(_table_cell(_summary_text(value, suffix)))
        lines.append("| " + " | ".join(cells) + " |")
    return lines


def _toc_lines(models, results):
    """Build the numbered table of contents linking to each model section."""
    lines = []
    for i, name in enumerate(models, 1):
        data = results[name]
        score = _summary_text(
            get_field_value(data, 'replacement_confidence_score', CATEGORY_MAPPING),
            '/10',
        )
        price = _summary_text(
            get_field_value(data, 'input_price_per_1m', CATEGORY_MAPPING)
        )
        title = _display_name(name)
        lines.append(
            f"{i}. [{title}](#{_heading_anchor(title)}) - Replacement Score: {score} | Input: {price}"
        )
    return lines


def _model_section_lines(name, data, categories):
    """Build the detailed section for one model, grouped by category."""
    lines = [f"## {_display_name(name)}", ""]
    for cat_name, cat_data in categories.items():
        lines.append(f"### {cat_name.replace('_', ' ')}")
        lines.append("")
        # Tolerate a category declared with no body or with ``fields: null``.
        for field in (cat_data or {}).get('fields') or []:
            field_name = field['name']
            value = get_field_value(data, field_name, CATEGORY_MAPPING)
            if value is None or value == "":
                continue
            uncertain_marker = " [uncertain]" if is_uncertain(data, field_name) else ""
            lines.append(
                f"**{field_name.replace('_', ' ').title()}:** {format_value(value)}{uncertain_marker}"
            )
            # Blank line after each field so every field renders as its own
            # paragraph; adjacent lines would be merged into one by Markdown.
            lines.append("")
    lines.append("---")
    lines.append("")
    return lines


def generate_report(session_dir, output_file="report.md"):
    """Generate the final Markdown comparison report.

    Reads the model results from ``<session_dir>/results/*.json`` and the
    field definitions from ``<session_dir>/fields.yaml``, then writes a
    report containing a header, an executive summary table, a table of
    contents, one detailed section per model, and the fixed analysis and
    recommendation sections. Only models named in ``_REPORT_MODELS`` that
    have a result file are included.

    Args:
        session_dir: Directory holding ``results/`` and ``fields.yaml``; the
            report is written into this directory as well.
        output_file: File name of the report, relative to *session_dir*.

    Returns:
        The path of the written report.

    Raises:
        OSError: If an input cannot be read or the report cannot be written.
    """
    results = load_json_results(os.path.join(session_dir, "results"))
    fields = load_fields(os.path.join(session_dir, "fields.yaml")) or {}
    categories = fields.get('categories') or {}

    # Resolve the reported models once; every section below uses this list,
    # so numbering and counts stay consistent when a result file is missing.
    models = [name for name in _REPORT_MODELS if name in results]

    report_lines = [
        "# GLM-5 vs Kimi K2.5 vs Codex 5.3 vs Claude Opus 4.6 vs Sonnet 4.6 vs MiniMax M2.5",
        "",
        "## Agentic Coding Model Comparison Report",
        "",
        # Two trailing spaces form a Markdown hard line break; with only one
        # this line would run into the "Models Compared" line below.
        "**Generated:** 2026-03-01  ",
        f"**Models Compared:** {len(models)} ",
        "",
        "## Executive Summary",
        "",
    ]
    report_lines.extend(_summary_table_lines(models, results))
    report_lines.append("")

    report_lines.append("## Table of Contents")
    report_lines.append("")
    report_lines.extend(_toc_lines(models, results))
    report_lines.append("")

    for name in models:
        report_lines.extend(_model_section_lines(name, results[name], categories))

    report_lines.extend(_STATIC_ANALYSIS_LINES)

    output_path = os.path.join(session_dir, output_file)
    # Write UTF-8 explicitly so non-ASCII text in the results does not depend
    # on the locale's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report_lines))
    print(f"Report generated: {output_path}")
    return output_path
if __name__ == "__main__":
    # Run against the directory that contains this script.
    generate_report(os.path.dirname(os.path.abspath(__file__)))