#!/usr/bin/env python3
"""
Generate report for GLM-5 vs Kimi K2.5 vs Codex 5.3 vs Claude Opus 4.6 vs Sonnet 4.6 vs MiniMax M2.5
Agentic Coding Comparison
"""

import json
import os
from glob import glob

try:
    import yaml
except ImportError:
    # Only load_fields() needs PyYAML; keep the pure helpers importable
    # without it and report the missing dependency where it is used.
    yaml = None

# Category mapping for nested JSON structures
CATEGORY_MAPPING = {
    "Model_Info": ["Model_Info", "model_info"],
    "Performance_Benchmarks": ["Performance_Benchmarks", "performance_benchmarks", "performance"],
    "Pricing": ["Pricing", "pricing"],
    "Agentic_Capabilities": ["Agentic_Capabilities", "agentic_capabilities", "capabilities"],
    "User_Experiences": ["User_Experiences", "user_experiences", "user_experience"],
    "Best_Use_Cases": ["Best_Use_Cases", "best_use_cases", "use_cases"],
    "Opus_Replacement_Suitability": ["Opus_Replacement_Suitability", "opus_replacement_suitability", "replacement"],
}

# Result-file base names, in the order models appear in the report.
MODEL_ORDER = (
    "GLM-5",
    "Kimi_K2.5",
    "Codex_5.3",
    "Claude_Opus_4.6",
    "Claude_Sonnet_4.6",
    "MiniMax_M2.5",
)

# Date printed in the report header. It is a fixed snapshot date, matching
# the hand-written analysis sections below.
REPORT_DATE = "2026-03-01"

# Hand-written analysis appended verbatim after the per-model sections.
_STATIC_ANALYSIS_LINES = (
    "## Comparative Analysis",
    "",
    "### Best Value for Money",
    "",
    "1. **MiniMax M2.5** - 10x cheaper than Opus with decent capabilities for simple tasks",
    "2. **Kimi K2.5** - Best balance of capability and cost with massive context window",
    "3. **Claude Sonnet 4.6** - 90-95% of Opus capability at 60% input cost",
    "",
    "### Best for Complex Coding",
    "",
    "1. **Claude Opus 4.6** - Still the benchmark for complex reasoning and safety-critical code",
    "2. **Codex 5.3** - Purpose-built for coding, excellent for pure software development",
    "3. **Claude Sonnet 4.6** - Nearly matches Opus for most practical coding tasks",
    "",
    "### Best Opus 4.6 Replacement",
    "",
    "Based on replacement confidence scores:",
    "",
    "| Rank | Model | Confidence | Key Tradeoff |",
    "|------|-------|------------|--------------|",
    "| 1 | Claude Sonnet 4.6 | 9/10 | Same output price, 40% cheaper input |",
    "| 2 | Codex 5.3 | 9/10 | Better at pure coding, less versatile |",
    "| 3 | Kimi K2.5 | 8/10 | 2-3x cheaper, larger context |",
    "| 4 | MiniMax M2.5 | 6/10 | 10x cheaper but less capable |",
    "| 5 | GLM-5 | 5/10 | Very cheap but limited access |",
    "",
    "### Pricing Comparison (per 1M tokens)",
    "",
    "| Model | Input | Output | vs Opus Input | vs Opus Output |",
    "|-------|-------|--------|---------------|----------------|",
    "| Claude Opus 4.6 | $5.00 | $15.00 | baseline | baseline |",
    "| Claude Sonnet 4.6 | $3.00 | $15.00 | 40% cheaper | same |",
    "| Codex 5.3 | $3.00 | $12.00 | 40% cheaper | 20% cheaper |",
    "| Kimi K2.5 | $2.00 | $8.00 | 60% cheaper | 47% cheaper |",
    "| GLM-5 | $0.50 | $2.00 | 90% cheaper | 87% cheaper |",
    "| MiniMax M2.5 | $0.50 | $2.00 | 90% cheaper | 87% cheaper |",
    "",
    "## Recommendations",
    "",
    "### If Cost is Primary Concern",
    "- **MiniMax M2.5** for prototyping and simple tasks (10x cheaper)",
    "- **GLM-5** if you have China market access (10x cheaper)",
    "",
    "### If Quality is Primary Concern",
    "- **Claude Opus 4.6** for mission-critical and complex reasoning",
    "- **Codex 5.3** for pure coding tasks and IDE integration",
    "",
    "### Best All-Round Choice",
    "- **Claude Sonnet 4.6** - Recommended first choice before trying Opus",
    "- **Kimi K2.5** - Best non-Anthropic option with excellent value",
    "",
)


def load_json_results(results_dir):
    """Load all JSON result files.

    Args:
        results_dir: Directory scanned (non-recursively) for ``*.json`` files.

    Returns:
        Dict mapping each file's base name (extension stripped) to its parsed
        JSON content. Files are read in sorted path order so the result is
        deterministic.
    """
    results = {}
    for json_file in sorted(glob(os.path.join(results_dir, "*.json"))):
        # splitext strips only the trailing extension; a blanket
        # str.replace would also eat ".json" occurring mid-name.
        name = os.path.splitext(os.path.basename(json_file))[0]
        with open(json_file, "r", encoding="utf-8") as f:
            results[name] = json.load(f)
    return results


def load_fields(fields_file):
    """Load field definitions from fields.yaml.

    Args:
        fields_file: Path to the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` yields for the file (``None`` if the file
        is empty).

    Raises:
        ImportError: If PyYAML is not installed.
    """
    if yaml is None:
        raise ImportError(
            "PyYAML is required to read field definitions (pip install pyyaml)"
        )
    with open(fields_file, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def get_field_value(data, field_name, category_mapping=None):
    """Get field value from data, handling nested structures.

    Lookup order:
      1. ``field_name`` as a top-level key of ``data``;
      2. inside any top-level dict whose key is listed in ``category_mapping``;
      3. inside any other top-level dict value (one level deep only).

    Args:
        data: Parsed result dict for one model.
        field_name: Key to look up.
        category_mapping: Optional dict of category name -> list of key
            spellings under which that category may appear in ``data``.

    Returns:
        The first value found, or ``None`` if the field is absent (or
        ``data`` is not a dict).
    """
    if not isinstance(data, dict):
        return None

    # Direct match
    if field_name in data:
        return data[field_name]

    # Check in category mappings
    if category_mapping:
        for keys in category_mapping.values():
            for key in keys:
                section = data.get(key)
                if isinstance(section, dict) and field_name in section:
                    return section[field_name]

    # Deep search: any nested dict, whatever its key is called
    for value in data.values():
        if isinstance(value, dict) and field_name in value:
            return value[field_name]

    return None


def format_value(value):
    """Format a value for display.

    ``None`` becomes ``"N/A"``, an empty list becomes ``"None"``, lists are
    comma-joined, dicts are rendered as ``"k: v; k: v"``, and anything else
    goes through ``str``.
    """
    if value is None:
        return "N/A"
    if isinstance(value, list):
        if not value:
            return "None"
        return ", ".join(str(v) for v in value)
    if isinstance(value, dict):
        return "; ".join(f"{k}: {v}" for k, v in value.items())
    return str(value)


def is_uncertain(data, field_name):
    """Check if field is marked as uncertain.

    A field is uncertain when its name appears in ``data['uncertain']``.
    A missing or null ``uncertain`` entry means nothing is flagged.
    """
    # `or []` covers an explicit JSON null, which .get()'s default would not.
    uncertain_list = data.get('uncertain') or []
    return field_name in uncertain_list


def _is_missing(value):
    """Return True for None and empty str/list/dict; 0 and False are real values."""
    return value is None or (isinstance(value, (str, list, dict)) and not value)


def _display(value):
    """Render a value for inline text, using "N/A" when it is missing."""
    return "N/A" if _is_missing(value) else format_value(value)


def _table_cell(value):
    """Render a value for a Markdown table cell (pipes escaped, newlines flattened)."""
    return _display(value).replace("|", "\\|").replace("\n", " ")


def _score_text(score, render=_display):
    """Render a replacement score as "<score>/10", or "N/A" when missing."""
    return "N/A" if _is_missing(score) else f"{render(score)}/10"


def _heading_anchor(heading):
    """Return the GitHub-style anchor slug for a Markdown heading.

    Lowercases, drops every character that is not alphanumeric, space,
    hyphen or underscore, then turns spaces into hyphens
    (e.g. "Kimi K2.5" -> "kimi-k25").
    """
    kept = "".join(ch for ch in heading.lower() if ch.isalnum() or ch in " -_")
    return kept.replace(" ", "-")


def build_report_lines(results, fields):
    """Build the report as a list of Markdown lines (no I/O).

    Args:
        results: Dict of model name -> parsed result dict, as returned by
            ``load_json_results``. Only names in ``MODEL_ORDER`` get a row
            and a section; the header count reflects all entries.
        fields: Parsed field definitions; expected to hold a ``categories``
            dict of category name -> ``{"fields": [{"name": ...}, ...]}``.
            ``None`` or missing parts are treated as empty.

    Returns:
        List of lines, to be joined with newlines.
    """
    present = [name for name in MODEL_ORDER if name in results]
    categories = (fields or {}).get('categories') or {}

    def lookup(name, field_name):
        return get_field_value(results[name], field_name, CATEGORY_MAPPING)

    # Header
    lines = [
        "# GLM-5 vs Kimi K2.5 vs Codex 5.3 vs Claude Opus 4.6 vs Sonnet 4.6 vs MiniMax M2.5",
        "",
        "## Agentic Coding Model Comparison Report",
        "",
        f"**Generated:** {REPORT_DATE} ",
        f"**Models Compared:** {len(results)} ",
        "",
    ]

    # Executive Summary Table
    lines += [
        "## Executive Summary",
        "",
        "| Model | SWE-bench Est. | Input $/1M | Output $/1M | Context | Opus Replacement Score |",
        "|-------|----------------|------------|-------------|---------|------------------------|",
    ]
    for name in present:
        swe = _table_cell(lookup(name, 'swe_bench_verified_score'))
        inp = _table_cell(lookup(name, 'input_price_per_1m'))
        out = _table_cell(lookup(name, 'output_price_per_1m'))
        ctx = _table_cell(lookup(name, 'context_window'))
        score = _score_text(lookup(name, 'replacement_confidence_score'), _table_cell)
        lines.append(f"| {name.replace('_', ' ')} | {swe} | {inp} | {out} | {ctx} | {score} |")
    lines.append("")

    # Table of Contents (numbered over the models actually present, so a
    # missing result file leaves no gap in the numbering)
    lines += ["## Table of Contents", ""]
    for i, name in enumerate(present, 1):
        title = name.replace('_', ' ')
        score = _score_text(lookup(name, 'replacement_confidence_score'))
        price = _display(lookup(name, 'input_price_per_1m'))
        lines.append(
            f"{i}. [{title}](#{_heading_anchor(title)}) - Replacement Score: {score} | Input: {price}"
        )
    lines.append("")

    # Detailed sections for each model
    for name in present:
        data = results[name]
        lines += [f"## {name.replace('_', ' ')}", ""]

        # Go through each category
        for cat_name, cat_data in categories.items():
            lines += [f"### {cat_name.replace('_', ' ')}", ""]
            for field in (cat_data or {}).get('fields') or []:
                field_name = field['name']
                value = get_field_value(data, field_name, CATEGORY_MAPPING)
                if value is None or value == "":
                    continue
                uncertain_marker = " [uncertain]" if is_uncertain(data, field_name) else ""
                label = field_name.replace('_', ' ').title()
                lines.append(f"**{label}:** {format_value(value)}{uncertain_marker}")
                # Blank line keeps each field a separate Markdown paragraph.
                lines.append("")

        lines += ["---", ""]

    # Comparison analysis and recommendations (static text)
    lines.extend(_STATIC_ANALYSIS_LINES)
    return lines


def generate_report(session_dir, output_file="report.md"):
    """Generate the final markdown report.

    Reads ``<session_dir>/results/*.json`` and ``<session_dir>/fields.yaml``,
    renders the report and writes it to ``<session_dir>/<output_file>``.

    Args:
        session_dir: Directory holding the inputs and receiving the output.
        output_file: Report file name, relative to ``session_dir``.

    Returns:
        Path of the written report.
    """
    results_dir = os.path.join(session_dir, "results")
    fields_file = os.path.join(session_dir, "fields.yaml")

    results = load_json_results(results_dir)
    fields = load_fields(fields_file)

    report_lines = build_report_lines(results, fields)

    # Write report
    output_path = os.path.join(session_dir, output_file)
    with open(output_path, 'w', encoding="utf-8") as f:
        f.write('\n'.join(report_lines))

    print(f"Report generated: {output_path}")
    return output_path


if __name__ == "__main__":
    session_dir = os.path.dirname(os.path.abspath(__file__))
    generate_report(session_dir)