---
# Field schema, grouped by category.
#
# Layout: categories -> <Category_Name> -> fields -> list of entries.
# Every entry carries exactly three keys:
#   name          identifier of the field
#   description   human-readable explanation of what the field holds
#   detail_level  one of the values used in this file:
#                 "brief", "moderate", "detailed"
# NOTE(review): detail_level presumably controls how much detail is gathered
# for the field -- confirm against the consumer of this file.
categories:
  # Benchmark scores: SWE-bench variants and other coding benchmarks.
  Performance_Benchmarks:
    fields:
      - name: "swe_bench_verified_score"
        description: "SWE-bench Verified score - percentage of software engineering tasks solved"
        detail_level: "detailed"
      - name: "swe_bench_full_score"
        description: "SWE-bench Full score if available (broader benchmark)"
        detail_level: "moderate"
      - name: "swe_bench_lite_score"
        description: "SWE-bench Lite score for quick comparison"
        detail_level: "moderate"
      - name: "other_coding_benchmarks"
        description: "Other relevant coding benchmarks (HumanEval, MBPP, etc.)"
        detail_level: "moderate"

  # Token pricing in USD per 1 million tokens, plus tier/discount notes.
  Pricing:
    fields:
      - name: "input_price_per_1m"
        description: "Price per 1 million input tokens in USD"
        detail_level: "detailed"
      - name: "output_price_per_1m"
        description: "Price per 1 million output tokens in USD"
        detail_level: "detailed"
      - name: "pricing_tier_notes"
        description: "Any tiered pricing, volume discounts, or special notes"
        detail_level: "moderate"

  # Agentic coding features, context window, tooling, multi-file handling.
  Agentic_Capabilities:
    fields:
      - name: "agentic_coding_features"
        description: "Specific features supporting agentic coding (tool use, planning, reflection, etc.)"
        detail_level: "detailed"
      - name: "context_window"
        description: "Maximum context window size for the model"
        detail_level: "detailed"
      - name: "supported_tools"
        description: "Tools and integrations supported (bash, file editing, web search, etc.)"
        detail_level: "detailed"
      - name: "multi_file_handling"
        description: "Capability to handle multi-file codebases and refactoring"
        detail_level: "moderate"

  # Community feedback: Reddit and X/Twitter sentiment, praises, complaints,
  # and use cases shared by users.
  User_Experiences:
    fields:
      - name: "reddit_sentiment"
        description: "Sentiment and key themes from Reddit discussions"
        detail_level: "detailed"
      - name: "x_twitter_sentiment"
        description: "Sentiment and key themes from X/Twitter discussions"
        detail_level: "detailed"
      - name: "common_praises"
        description: "What users commonly praise about this model"
        detail_level: "detailed"
      - name: "common_complaints"
        description: "What users commonly complain about"
        detail_level: "detailed"
      - name: "notable_use_cases_shared"
        description: "Specific use cases shared by real users"
        detail_level: "moderate"

  # Where the model fits, where it does not, and how it compares to
  # Claude Opus 4.6 for coding tasks.
  Best_Use_Cases:
    fields:
      - name: "ideal_for"
        description: "Scenarios where this model excels"
        detail_level: "detailed"
      - name: "not_recommended_for"
        description: "Scenarios where this model struggles or is not cost-effective"
        detail_level: "moderate"
      - name: "comparison_to_opus_46"
        description: "Direct comparison to Claude Opus 4.6 for coding tasks"
        detail_level: "detailed"

  # Assessment of replacing Claude Opus 4.6: verdict, confidence score,
  # tradeoffs, and cost comparison.
  Opus_Replacement_Suitability:
    fields:
      - name: "can_replace_opus_46"
        description: "Whether this model can effectively replace Claude Opus 4.6"
        detail_level: "detailed"
      - name: "replacement_confidence_score"
        description: "Confidence score (1-10) for replacement suitability"
        detail_level: "brief"
      - name: "replacement_tradeoffs"
        description: "Key tradeoffs when replacing Opus 4.6 with this model"
        detail_level: "detailed"
      - name: "cost_comparison_vs_opus"
        description: "Cost comparison specifically versus Claude Opus 4.6"
        detail_level: "moderate"

  # Basic model metadata: release date, developer, model family.
  Model_Info:
    fields:
      - name: "release_date"
        description: "When the model was released"
        detail_level: "brief"
      - name: "developer"
        description: "Company/organization that developed the model"
        detail_level: "brief"
      - name: "model_family"
        description: "Family or series the model belongs to"
        detail_level: "brief"
      # Meta-field: per its description, holds the names of other fields
      # whose values are uncertain rather than a fact about the model.
      - name: "uncertain"
        description: "Array of field names with uncertain values"
        detail_level: "brief"