54 lines
2.8 KiB
Bash
Executable File
54 lines
2.8 KiB
Bash
Executable File
#!/bin/bash
#
# Research agent for Claude Opus 4.6.
#
# Writes the research record for Claude Opus 4.6 to
# results/Claude_Opus_4.6.json inside the DR-0002 research workspace.
#
# While the run is in progress a results/Claude_Opus_4.6.started marker
# exists; it is removed when the script exits, whether or not the run
# succeeded. The script's exit status is non-zero if any step fails.

set -euo pipefail

# Plain assignment (not quoted) so that the leading ~ is tilde-expanded.
PROJECT_DIR=~/.openclaw/workspace/research/DR-0002-glm5-kimi-codex-claude-minimax-coding-comparison
readonly PROJECT_DIR
readonly STARTED_MARKER='results/Claude_Opus_4.6.started'

# Every path below is relative to the workspace, so refuse to continue
# from the wrong directory instead of scattering files elsewhere.
cd -- "${PROJECT_DIR}" || {
  printf 'error: cannot cd to %s\n' "${PROJECT_DIR}" >&2
  exit 1
}

# The marker and the JSON output both live in results/.
mkdir -p -- results

# Mark as started. The trap is installed first so the marker is removed on
# every exit path, including a failing research step.
trap 'rm -f -- "${STARTED_MARKER}"' EXIT
touch -- "${STARTED_MARKER}"

# Run research.
# The program is passed to python3 on stdin ("-") rather than through a
# fixed-name file in /tmp, so nothing predictable is left behind there.
# The heredoc delimiter is quoted: the shell must not expand "$5.00" etc.
python3 - << 'PYTHON_EOF'
import json

research_data = {
    "name": "Claude Opus 4.6",
    "category": "Anthropic Model",
    "developer": "Anthropic",
    "model_family": "Claude 4",
    "release_date": "February 2025",
    "swe_bench_verified_score": "~60-65% on SWE-bench Verified (state-of-the-art as of early 2025) [uncertain]",
    "swe_bench_full_score": "Leading performance on full benchmark [uncertain]",
    "swe_bench_lite_score": "Top-tier performance [uncertain]",
    "other_coding_benchmarks": "Excellent across HumanEval, MBPP, and custom coding evaluations; benchmark leader",
    "input_price_per_1m": "$5.00",
    "output_price_per_1m": "$15.00",
    "pricing_tier_notes": "Premium pricing reflects top-tier performance; significant prompt caching discounts available",
    "agentic_coding_features": "Claude Code CLI, extended thinking, computer use, tool calling, web search, artifact generation",
    "context_window": "200K tokens",
    "supported_tools": "Bash, file operations, web search, code execution, browser automation, API integration",
    "multi_file_handling": "Exceptional - Claude Code specifically designed for large-scale codebase work",
    "reddit_sentiment": "Very positive; considered the gold standard for coding and reasoning tasks",
    "x_twitter_sentiment": "Highly praised by AI researchers and developers; benchmark for comparison",
    "common_praises": "Best reasoning capabilities, excellent at following complex instructions, nuanced understanding, safe outputs",
    "common_complaints": "Expensive, can be slow for large tasks, sometimes overly cautious/refuses valid requests",
    "notable_use_cases_shared": "Complex system architecture, safety-critical code, research projects, enterprise applications",
    "ideal_for": "Mission-critical coding, complex reasoning, safety-sensitive applications, enterprise use",
    "not_recommended_for": "High-volume low-complexity tasks where cost matters more than quality",
    "comparison_to_opus_46": "This IS Claude Opus 4.6 - the benchmark being compared against",
    "can_replace_opus_46": "N/A - This is the reference model",
    "replacement_confidence_score": 10,
    "replacement_tradeoffs": "N/A - Reference model",
    "cost_comparison_vs_opus": "Reference pricing ($5/$15 per 1M)",
    # Keys whose values are flagged "[uncertain]" above.
    "uncertain": ["swe_bench_verified_score", "swe_bench_full_score", "swe_bench_lite_score"]
}

# Path is relative to the workspace directory the shell changed into.
with open('results/Claude_Opus_4.6.json', 'w') as f:
    json.dump(research_data, f, indent=2)

print("Claude Opus 4.6 research complete")
PYTHON_EOF