From 180cf5bc86c33f25d4226cdfc29f39e4c854ac53 Mon Sep 17 00:00:00 2001 From: daniel5u <175603571+daniel5u@users.noreply.github.com> Date: Wed, 25 Mar 2026 01:18:13 +0000 Subject: [PATCH] update data from SWE Benchmark --- src/content/benchmarks/swe_bash_only.json | 138 +++++----------------- 1 file changed, 29 insertions(+), 109 deletions(-) diff --git a/src/content/benchmarks/swe_bash_only.json b/src/content/benchmarks/swe_bash_only.json index fd6a0f8..039c50c 100644 --- a/src/content/benchmarks/swe_bash_only.json +++ b/src/content/benchmarks/swe_bash_only.json @@ -8,7 +8,7 @@ "Coding", "Agent" ], - "lastUpdated": "2026-01-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "% Resolved", "isBetterHigher": true @@ -19,140 +19,60 @@ }, "snapshot": [ { - "modelRef": "anthropic/claude-opus-4-5-thinking", - "score": 74.4 + "modelRef": "anthropic/claude-4-5-opus-high-reasoning", + "score": 76.8 }, { - "modelRef": "google/gemini-3-pro", - "score": 74.2 - }, - { - "modelRef": "openai/gpt-5-2", - "score": 71.8 - }, - { - "modelRef": "anthropic/claude-4-5-sonnet", - "score": 70.6 - }, - { - "modelRef": "openai/gpt-5-2", - "score": 69.0 - }, - { - "modelRef": "anthropic/claude-4-opus", - "score": 67.6 - }, - { - "modelRef": "openai/gpt-5-1-codex", - "score": 66.0 - }, - { - "modelRef": "openai/gpt-5-1", - "score": 66.0 - }, - { - "modelRef": "openai/gpt-5-medium", - "score": 65.0 - }, - { - "modelRef": "anthropic/claude-4-sonnet", - "score": 64.93 - }, - { - "modelRef": "kimi/kimi-k2-thinking", - "score": 63.4 - }, - { - "modelRef": "minimax/minimax-m2", - "score": 61.0 + "modelRef": "google/gemini-3-flash-high-reasoning", + "score": 75.8 }, { - "modelRef": "deepseek/deepseek-v3-2-reasoning", - "score": 60.0 + "modelRef": "minimax/minimax-m2-5-high-reasoning", + "score": 75.8 }, { - "modelRef": "openai/gpt-5-mini-medium", - "score": 59.8 + "modelRef": "anthropic/claude-opus-4-6", + "score": 75.6 }, { - "modelRef": "openai/o3", - "score": 58.4 + "modelRef": "openai/gpt-5-2-codex", + "score": 72.8 }, { - "modelRef": "mistral/devstral-small", - "score": 56.4 + "modelRef": "z-ai/glm-5-high-reasoning", + "score": 72.8 }, { - "modelRef": "alibaba/qwen3-coder-480b-a35b-instruct", - "score": 55.4 - }, - { - "modelRef": "zai/glm-4-6", - "score": 55.4 - }, - { - "modelRef": "zai/glm-4-5", - "score": 54.2 - }, - { - "modelRef": "mistral/devstral-2", - "score": 53.8 - }, - { - "modelRef": "google/gemini-2-5-pro", - "score": 53.6 - }, - { - "modelRef": "anthropic/claude-3-7-sonnet", - "score": 52.8 - }, - { - "modelRef": "openai/o4-mini", - "score": 45.0 - }, - { - "modelRef": "kimi/kimi-k2-0905", - "score": 43.8 - }, - { - "modelRef": "openai/gpt-4-1", - "score": 39.58 - }, - { - "modelRef": "openai/gpt-5-nano-medium", - "score": 34.8 - }, - { - "modelRef": "google/gemini-2-5-flash", - "score": 28.73 + "modelRef": "openai/gpt-5-2", + "score": 72.8 }, { - "modelRef": "openai/gpt-oss-120b", - "score": 26.0 + "modelRef": "openai/gpt-5-2-codex", + "score": 72.8 }, { - "modelRef": "openai/gpt-4-1-mini", - "score": 23.94 + "modelRef": "anthropic/claude-4-5-sonnet-high-reasoning", + "score": 71.4 }, { - "modelRef": "openai/gpt-4o", - "score": 21.62 + "modelRef": "kimi/kimi-k2-5-high-reasoning", + "score": 70.8 }, { - "modelRef": "meta/llama-4-maverick", - "score": 21.04 + "modelRef": "deepseek/deepseek-v3-2-high-reasoning", + "score": 70.0 }, { - "modelRef": "google/gemini-2-0-flash", - "score": 13.52 + "modelRef": "google/gemini-3-pro", + "score": 69.6 }, { - "modelRef": "meta/llama-4-scout", - "score": 9.06 + "modelRef": "anthropic/claude-4-5-haiku-high-reasoning", + "score": 66.6 }, { - "modelRef": "alibaba/qwen2-5-coder-32b-instruct", - "score": 9.0 + "modelRef": "openai/gpt-5-mini", + "score": 56.2 } ] } \ No newline at end of file