219 lines
9.8 KiB
Python
219 lines
9.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate the Markdown report for the GLM-5 vs Kimi K2.5 vs Codex 5.3 vs Claude Opus 4.6 vs Sonnet 4.6 vs MiniMax M2.5 agentic coding comparison.
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
import os
|
|
from glob import glob
|
|
|
|
# Canonical category name -> every key spelling under which that category's
# nested dict may appear in a result file.
CATEGORY_MAPPING = {
    "Model_Info": [
        "Model_Info",
        "model_info",
    ],
    "Performance_Benchmarks": [
        "Performance_Benchmarks",
        "performance_benchmarks",
        "performance",
    ],
    "Pricing": [
        "Pricing",
        "pricing",
    ],
    "Agentic_Capabilities": [
        "Agentic_Capabilities",
        "agentic_capabilities",
        "capabilities",
    ],
    "User_Experiences": [
        "User_Experiences",
        "user_experiences",
        "user_experience",
    ],
    "Best_Use_Cases": [
        "Best_Use_Cases",
        "best_use_cases",
        "use_cases",
    ],
    "Opus_Replacement_Suitability": [
        "Opus_Replacement_Suitability",
        "opus_replacement_suitability",
        "replacement",
    ],
}
|
|
|
|
def load_json_results(results_dir):
    """Load every ``*.json`` file directly inside *results_dir*.

    Args:
        results_dir: Directory searched (non-recursively) for ``*.json`` files.

    Returns:
        A dict mapping each file's base name, with only its final extension
        removed, to the object parsed from that file. Entries are inserted in
        sorted path order. The dict is empty when nothing matches.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
        OSError: If a matching file cannot be opened.
    """
    results = {}
    # glob() yields paths in arbitrary order; sort for a reproducible result.
    for json_file in sorted(glob(os.path.join(results_dir, "*.json"))):
        # splitext() strips only the trailing extension. A plain
        # str.replace('.json', '') would also remove ".json" from the middle
        # of a name such as "a.json.v1.json".
        name = os.path.splitext(os.path.basename(json_file))[0]
        # Decode as UTF-8 explicitly instead of relying on the locale default.
        with open(json_file, "r", encoding="utf-8") as f:
            results[name] = json.load(f)
    return results
|
|
|
|
def load_fields(fields_file):
    """Load field definitions from a YAML file such as ``fields.yaml``.

    Args:
        fields_file: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load``. An empty file (which
        ``safe_load`` parses as ``None``) yields an empty dict instead, so
        callers can call ``.get()`` on the result without a ``None`` check.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(fields_file, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)
    # Normalise "no document" to an empty mapping; keep any other value as-is.
    return {} if loaded is None else loaded
|
|
|
|
def get_field_value(data, field_name, category_mapping=None):
    """Look up *field_name* in *data*, descending into nested dicts.

    The lookup tries the following in order and returns the first hit:

    1. ``data[field_name]`` itself.
    2. A dict stored in *data* under any alias listed in *category_mapping*.
    3. Any nested dict, searched breadth-first, so a match one level down
       always wins over a match further down.

    Args:
        data: Dict to search. Any other type yields ``None`` instead of
            raising.
        field_name: Key to look for.
        category_mapping: Optional mapping of category name to the list of
            alias keys under which that category's dict may be stored.

    Returns:
        The stored value (which may itself be ``None``), or ``None`` when the
        key is not found anywhere.
    """
    if not isinstance(data, dict):
        return None

    # Direct match.
    if field_name in data:
        return data[field_name]

    # Known category containers, tried in mapping order.
    if category_mapping:
        for aliases in category_mapping.values():
            for alias in aliases:
                section = data.get(alias)
                if isinstance(section, dict) and field_name in section:
                    return section[field_name]

    # Breadth-first search through nested dicts, one depth level at a time.
    # Only dict values are followed; lists are not descended into.
    # NOTE: assumes the structure is acyclic, as parsed JSON always is.
    level = [value for value in data.values() if isinstance(value, dict)]
    while level:
        for section in level:
            if field_name in section:
                return section[field_name]
        level = [
            child
            for section in level
            for child in section.values()
            if isinstance(child, dict)
        ]

    return None
|
|
|
|
def format_value(value):
    """Render *value* as a single display string.

    Args:
        value: Any value, typically one loaded from JSON.

    Returns:
        * ``"N/A"`` for ``None``.
        * ``"None"`` for an empty list or an empty dict.
        * List items joined with ``", "``.
        * Dict entries rendered as ``"key: value"`` and joined with ``"; "``.
        * ``str(value)`` for everything else.

        Items inside lists and dicts are formatted with the same rules, so
        nested containers are flattened into readable text instead of being
        shown as Python reprs.
    """
    if value is None:
        return "N/A"
    # Empty containers of either kind read the same way in the report.
    if isinstance(value, (list, dict)) and not value:
        return "None"
    if isinstance(value, list):
        return ", ".join(format_value(item) for item in value)
    if isinstance(value, dict):
        return "; ".join(
            f"{key}: {format_value(item)}" for key, item in value.items()
        )
    return str(value)
|
|
|
|
def is_uncertain(data, field_name):
    """Report whether *field_name* is listed under ``data['uncertain']``.

    Args:
        data: Dict that may carry an ``'uncertain'`` entry. Any other type is
            treated as having none.
        field_name: Field name to look for.

    Returns:
        ``True`` when the ``'uncertain'`` entry is a collection containing
        *field_name*, or a string equal to it; ``False`` otherwise, including
        when the entry is missing, null, or some other scalar.
    """
    if not isinstance(data, dict):
        return False

    uncertain = data.get("uncertain")
    # A single name given as a bare string must match exactly; `in` on a
    # string would do substring matching ("score" in "swe_bench_score").
    if isinstance(uncertain, str):
        return field_name == uncertain
    if isinstance(uncertain, (list, tuple, set, frozenset, dict)):
        return field_name in uncertain
    # Missing, null, or a non-collection scalar: nothing is marked uncertain.
    return False
|
|
|
|
# Display order of the models. Each name is also the key looked up in the
# loaded results, i.e. the base name of that model's JSON file.
_MODEL_ORDER = (
    "GLM-5",
    "Kimi_K2.5",
    "Codex_5.3",
    "Claude_Opus_4.6",
    "Claude_Sonnet_4.6",
    "MiniMax_M2.5",
)


def _is_missing(value):
    """Return True for None and empty strings/lists/dicts, but not for 0 or False."""
    return value is None or (isinstance(value, (str, list, dict)) and not value)


def _or_na(value):
    """Return *value* unchanged, or "N/A" when it is missing."""
    return "N/A" if _is_missing(value) else value


def _score_out_of_ten(value):
    """Render a score as "<score>/10", or plain "N/A" when it is missing."""
    return "N/A" if _is_missing(value) else f"{value}/10"


def _heading_anchor(heading):
    """Return a GitHub-style anchor slug for a Markdown heading.

    The heading is lower-cased, characters other than letters, digits,
    spaces, hyphens and underscores are dropped, and spaces become hyphens
    (so "Kimi K2.5" -> "kimi-k25").
    """
    kept = "".join(ch for ch in heading.lower() if ch.isalnum() or ch in " -_")
    return kept.replace(" ", "-")


def _static_analysis_lines():
    """Return the fixed "Comparative Analysis" and "Recommendations" text.

    NOTE: this text is static; none of it is computed from the loaded results.
    """
    return [
        "## Comparative Analysis",
        "",
        "### Best Value for Money",
        "",
        "1. **MiniMax M2.5** - 10x cheaper than Opus with decent capabilities for simple tasks",
        "2. **Kimi K2.5** - Best balance of capability and cost with massive context window",
        "3. **Claude Sonnet 4.6** - 90-95% of Opus capability at 60% input cost",
        "",
        "### Best for Complex Coding",
        "",
        "1. **Claude Opus 4.6** - Still the benchmark for complex reasoning and safety-critical code",
        "2. **Codex 5.3** - Purpose-built for coding, excellent for pure software development",
        "3. **Claude Sonnet 4.6** - Nearly matches Opus for most practical coding tasks",
        "",
        "### Best Opus 4.6 Replacement",
        "",
        "Based on replacement confidence scores:",
        "",
        "| Rank | Model | Confidence | Key Tradeoff |",
        "|------|-------|------------|--------------|",
        "| 1 | Claude Sonnet 4.6 | 9/10 | Same output price, 40% cheaper input |",
        "| 2 | Codex 5.3 | 9/10 | Better at pure coding, less versatile |",
        "| 3 | Kimi K2.5 | 8/10 | 2-3x cheaper, larger context |",
        "| 4 | MiniMax M2.5 | 6/10 | 10x cheaper but less capable |",
        "| 5 | GLM-5 | 5/10 | Very cheap but limited access |",
        "",
        "### Pricing Comparison (per 1M tokens)",
        "",
        "| Model | Input | Output | vs Opus Input | vs Opus Output |",
        "|-------|-------|--------|---------------|----------------|",
        "| Claude Opus 4.6 | $5.00 | $15.00 | baseline | baseline |",
        "| Claude Sonnet 4.6 | $3.00 | $15.00 | 40% cheaper | same |",
        "| Codex 5.3 | $3.00 | $12.00 | 40% cheaper | 20% cheaper |",
        "| Kimi K2.5 | $2.00 | $8.00 | 60% cheaper | 47% cheaper |",
        "| GLM-5 | $0.50 | $2.00 | 90% cheaper | 87% cheaper |",
        "| MiniMax M2.5 | $0.50 | $2.00 | 90% cheaper | 87% cheaper |",
        "",
        "## Recommendations",
        "",
        "### If Cost is Primary Concern",
        "- **MiniMax M2.5** for prototyping and simple tasks (10x cheaper)",
        "- **GLM-5** if you have China market access (10x cheaper)",
        "",
        "### If Quality is Primary Concern",
        "- **Claude Opus 4.6** for mission-critical and complex reasoning",
        "- **Codex 5.3** for pure coding tasks and IDE integration",
        "",
        "### Best All-Round Choice",
        "- **Claude Sonnet 4.6** - Recommended first choice before trying Opus",
        "- **Kimi K2.5** - Best non-Anthropic option with excellent value",
        "",
    ]


def generate_report(session_dir, output_file="report.md", generated_on="2026-03-01"):
    """Generate the final Markdown report and write it to disk.

    Reads ``<session_dir>/results/*.json`` and ``<session_dir>/fields.yaml``,
    then builds, in order: a header, an executive-summary table, a table of
    contents, one detailed section per model, and the static comparative
    analysis / recommendations text.

    Only models named in ``_MODEL_ORDER`` whose result file was loaded appear
    in the per-model parts. The "Models Compared" figure is the number of
    result files loaded.

    Args:
        session_dir: Directory holding ``results/`` and ``fields.yaml``; the
            report is written here as well.
        output_file: File name of the report inside *session_dir*.
        generated_on: Text shown on the "Generated" line. Defaults to the
            date that used to be hard-coded.

    Returns:
        The path of the written report.
    """
    results = load_json_results(os.path.join(session_dir, "results"))
    fields = load_fields(os.path.join(session_dir, "fields.yaml"))
    # Tolerate an empty fields file or a null "categories" entry.
    categories = (fields or {}).get("categories") or {}

    # Models actually available, in display order. Built once so the summary,
    # the table of contents and the detail sections always agree.
    present = [name for name in _MODEL_ORDER if name in results]

    def lookup(data, field_name):
        return get_field_value(data, field_name, CATEGORY_MAPPING)

    # Header. The two trailing spaces are Markdown hard line breaks; a single
    # trailing space would merge both lines into one paragraph.
    report_lines = [
        "# GLM-5 vs Kimi K2.5 vs Codex 5.3 vs Claude Opus 4.6 vs Sonnet 4.6 vs MiniMax M2.5",
        "",
        "## Agentic Coding Model Comparison Report",
        "",
        f"**Generated:** {generated_on}  ",
        f"**Models Compared:** {len(results)}  ",
        "",
    ]

    # Executive summary table.
    report_lines += [
        "## Executive Summary",
        "",
        "| Model | SWE-bench Est. | Input $/1M | Output $/1M | Context | Opus Replacement Score |",
        "|-------|----------------|------------|-------------|---------|------------------------|",
    ]
    for name in present:
        data = results[name]
        # _or_na() keeps legitimate zeros (e.g. a free tier priced at 0),
        # which a plain `or "N/A"` would hide.
        swe = _or_na(lookup(data, "swe_bench_verified_score"))
        inp = _or_na(lookup(data, "input_price_per_1m"))
        out = _or_na(lookup(data, "output_price_per_1m"))
        ctx = _or_na(lookup(data, "context_window"))
        score = _score_out_of_ten(lookup(data, "replacement_confidence_score"))
        report_lines.append(
            f"| {name.replace('_', ' ')} | {swe} | {inp} | {out} | {ctx} | {score} |"
        )
    report_lines.append("")

    # Table of contents, numbered over the models that are present so a
    # missing result file does not leave a gap in the numbering.
    report_lines += ["## Table of Contents", ""]
    for i, name in enumerate(present, 1):
        data = results[name]
        title = name.replace("_", " ")
        score = _score_out_of_ten(lookup(data, "replacement_confidence_score"))
        price = _or_na(lookup(data, "input_price_per_1m"))
        report_lines.append(
            f"{i}. [{title}](#{_heading_anchor(title)}) - Replacement Score: {score} | Input: {price}"
        )
    report_lines.append("")

    # Detailed section per model, one sub-section per configured category.
    for name in present:
        data = results[name]
        report_lines += [f"## {name.replace('_', ' ')}", ""]

        for cat_name, cat_data in categories.items():
            report_lines += [f"### {cat_name.replace('_', ' ')}", ""]

            for field in (cat_data or {}).get("fields") or []:
                field_name = field["name"]
                value = lookup(data, field_name)
                # Skip fields that have no value for this model.
                if value is None or value == "":
                    continue
                uncertain_marker = " [uncertain]" if is_uncertain(data, field_name) else ""
                label = field_name.replace("_", " ").title()
                report_lines += [
                    f"**{label}:** {format_value(value)}{uncertain_marker}",
                    "",
                ]

        report_lines += ["---", ""]

    report_lines += _static_analysis_lines()

    # Write as UTF-8 explicitly: loaded values may contain non-ASCII text
    # that the locale's default encoding cannot represent.
    output_path = os.path.join(session_dir, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(report_lines))

    print(f"Report generated: {output_path}")
    return output_path
|
|
|
|
if __name__ == "__main__":
    # The directory containing this script doubles as the session directory.
    session_dir = os.path.dirname(os.path.abspath(__file__))
    generate_report(session_dir)
|