Skip to content
This repository was archived by the owner on Apr 9, 2026. It is now read-only.

Commit 27992d8

Browse files
daniel5ugithub-actions[bot]
authored andcommitted
update data from SWE Benchmark
1 parent 3e19b80 commit 27992d8

1 file changed

Lines changed: 29 additions & 109 deletions

File tree

src/content/benchmarks/swe_bash_only.json

Lines changed: 29 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"Coding",
99
"Agent"
1010
],
11-
"lastUpdated": "2026-01-12",
11+
"lastUpdated": "2026-03-17",
1212
"metrics": {
1313
"unit": "% Resolved",
1414
"isBetterHigher": true
@@ -19,140 +19,60 @@
1919
},
2020
"snapshot": [
2121
{
22-
"modelRef": "anthropic/claude-opus-4-5-thinking",
23-
"score": 74.4
22+
"modelRef": "anthropic/claude-4-5-opus-high-reasoning",
23+
"score": 76.8
2424
},
2525
{
26-
"modelRef": "google/gemini-3-pro",
27-
"score": 74.2
28-
},
29-
{
30-
"modelRef": "openai/gpt-5-2",
31-
"score": 71.8
32-
},
33-
{
34-
"modelRef": "anthropic/claude-4-5-sonnet",
35-
"score": 70.6
36-
},
37-
{
38-
"modelRef": "openai/gpt-5-2",
39-
"score": 69.0
40-
},
41-
{
42-
"modelRef": "anthropic/claude-4-opus",
43-
"score": 67.6
44-
},
45-
{
46-
"modelRef": "openai/gpt-5-1-codex",
47-
"score": 66.0
48-
},
49-
{
50-
"modelRef": "openai/gpt-5-1",
51-
"score": 66.0
52-
},
53-
{
54-
"modelRef": "openai/gpt-5-medium",
55-
"score": 65.0
56-
},
57-
{
58-
"modelRef": "anthropic/claude-4-sonnet",
59-
"score": 64.93
60-
},
61-
{
62-
"modelRef": "kimi/kimi-k2-thinking",
63-
"score": 63.4
64-
},
65-
{
66-
"modelRef": "minimax/minimax-m2",
67-
"score": 61.0
26+
"modelRef": "google/gemini-3-flash-high-reasoning",
27+
"score": 75.8
6828
},
6929
{
70-
"modelRef": "deepseek/deepseek-v3-2-reasoning",
71-
"score": 60.0
30+
"modelRef": "minimax/minimax-m2-5-high-reasoning",
31+
"score": 75.8
7232
},
7333
{
74-
"modelRef": "openai/gpt-5-mini-medium",
75-
"score": 59.8
34+
"modelRef": "anthropic/claude-opus-4-6",
35+
"score": 75.6
7636
},
7737
{
78-
"modelRef": "openai/o3",
79-
"score": 58.4
38+
"modelRef": "openai/gpt-5-2-codex",
39+
"score": 72.8
8040
},
8141
{
82-
"modelRef": "mistral/devstral-small",
83-
"score": 56.4
42+
"modelRef": "z-ai/glm-5-high-reasoning",
43+
"score": 72.8
8444
},
8545
{
86-
"modelRef": "alibaba/qwen3-coder-480b-a35b-instruct",
87-
"score": 55.4
88-
},
89-
{
90-
"modelRef": "zai/glm-4-6",
91-
"score": 55.4
92-
},
93-
{
94-
"modelRef": "zai/glm-4-5",
95-
"score": 54.2
96-
},
97-
{
98-
"modelRef": "mistral/devstral-2",
99-
"score": 53.8
100-
},
101-
{
102-
"modelRef": "google/gemini-2-5-pro",
103-
"score": 53.6
104-
},
105-
{
106-
"modelRef": "anthropic/claude-3-7-sonnet",
107-
"score": 52.8
108-
},
109-
{
110-
"modelRef": "openai/o4-mini",
111-
"score": 45.0
112-
},
113-
{
114-
"modelRef": "kimi/kimi-k2-0905",
115-
"score": 43.8
116-
},
117-
{
118-
"modelRef": "openai/gpt-4-1",
119-
"score": 39.58
120-
},
121-
{
122-
"modelRef": "openai/gpt-5-nano-medium",
123-
"score": 34.8
124-
},
125-
{
126-
"modelRef": "google/gemini-2-5-flash",
127-
"score": 28.73
46+
"modelRef": "openai/gpt-5-2",
47+
"score": 72.8
12848
},
12949
{
130-
"modelRef": "openai/gpt-oss-120b",
131-
"score": 26.0
50+
"modelRef": "openai/gpt-5-2-codex",
51+
"score": 72.8
13252
},
13353
{
134-
"modelRef": "openai/gpt-4-1-mini",
135-
"score": 23.94
54+
"modelRef": "anthropic/claude-4-5-sonnet-high-reasoning",
55+
"score": 71.4
13656
},
13757
{
138-
"modelRef": "openai/gpt-4o",
139-
"score": 21.62
58+
"modelRef": "kimi/kimi-k2-5-high-reasoning",
59+
"score": 70.8
14060
},
14161
{
142-
"modelRef": "meta/llama-4-maverick",
143-
"score": 21.04
62+
"modelRef": "deepseek/deepseek-v3-2-high-reasoning",
63+
"score": 70.0
14464
},
14565
{
146-
"modelRef": "google/gemini-2-0-flash",
147-
"score": 13.52
66+
"modelRef": "google/gemini-3-pro",
67+
"score": 69.6
14868
},
14969
{
150-
"modelRef": "meta/llama-4-scout",
151-
"score": 9.06
70+
"modelRef": "anthropic/claude-4-5-haiku-high-reasoning",
71+
"score": 66.6
15272
},
15373
{
154-
"modelRef": "alibaba/qwen2-5-coder-32b-instruct",
155-
"score": 9.0
74+
"modelRef": "openai/gpt-5-mini",
75+
"score": 56.2
15676
}
15777
]
15878
}

0 commit comments

Comments
 (0)