From f8db8515cd808a4b10313d2663af56785f2b8570 Mon Sep 17 00:00:00 2001 From: daniel5u <175603571+daniel5u@users.noreply.github.com> Date: Wed, 25 Mar 2026 02:11:19 +0000 Subject: [PATCH] update data from artificial analysis --- .../artificial_analysis_coding_index.json | 230 ++++- ...rtificial_analysis_intelligence_index.json | 870 +++++++++++------- src/content/benchmarks/gpqa.json | 228 ++++- src/content/benchmarks/hle.json | 228 ++++- src/content/benchmarks/ifbench.json | 228 ++++- src/content/benchmarks/lcr.json | 228 ++++- src/content/benchmarks/livecodebench.json | 6 +- src/content/benchmarks/math_500.json | 6 +- src/content/benchmarks/mmlu_pro.json | 6 +- src/content/benchmarks/scicode.json | 224 ++++- src/content/benchmarks/tau2.json | 256 +++++- .../benchmarks/terminalbench_hard.json | 226 ++++- .../alibaba/qwen3-5-0-8b-non-reasoning.json | 9 + src/content/models/alibaba/qwen3-5-0-8b.json | 9 + .../qwen3-5-122b-a10b-non-reasoning.json | 9 + .../models/alibaba/qwen3-5-122b-a10b.json | 9 + .../alibaba/qwen3-5-27b-non-reasoning.json | 9 + src/content/models/alibaba/qwen3-5-27b.json | 9 + .../alibaba/qwen3-5-2b-non-reasoning.json | 9 + src/content/models/alibaba/qwen3-5-2b.json | 9 + .../qwen3-5-35b-a3b-non-reasoning.json | 9 + .../models/alibaba/qwen3-5-35b-a3b.json | 9 + .../qwen3-5-397b-a17b-non-reasoning.json | 9 + .../models/alibaba/qwen3-5-397b-a17b.json | 9 + .../alibaba/qwen3-5-4b-non-reasoning.json | 9 + src/content/models/alibaba/qwen3-5-4b.json | 9 + .../alibaba/qwen3-5-9b-non-reasoning.json | 9 + src/content/models/alibaba/qwen3-5-9b.json | 9 + .../models/alibaba/qwen3-coder-next.json | 2 +- .../anthropic/claude-opus-4-6-adaptive.json | 2 +- .../models/anthropic/claude-opus-4-6.json | 2 +- .../anthropic/claude-sonnet-4-6-adaptive.json | 9 + ...e-sonnet-4-6-non-reasoning-low-effort.json | 9 + .../models/anthropic/claude-sonnet-4-6.json | 9 + .../models/cohere/tiny-aya-global.json | 9 + .../google/gemini-3-1-flash-lite-preview.json | 9 + .../models/google/gemini-3-1-pro-preview.json | 9 + src/content/models/inception/mercury-2.json | 9 + src/content/models/liquidai/lfm2-24b-a2b.json | 9 + .../models/longcat/longcat-flash-lite.json | 9 + src/content/models/minimax/minimax-m2-5.json | 9 + src/content/models/minimax/minimax-m2-7.json | 9 + .../mistral-small-4-non-reasoning.json | 9 + .../models/mistral/mistral-small-4.json | 9 + .../models/nanbeige/nanbeige4-1-3b.json | 9 + .../nvidia/nvidia-nemotron-3-nano-4b.json | 9 + .../nvidia-nemotron-3-super-120b-a12b.json | 9 + src/content/models/openai/gpt-5-3-codex.json | 9 + .../models/openai/gpt-5-4-mini-medium.json | 9 + .../openai/gpt-5-4-mini-non-reasoning.json | 9 + src/content/models/openai/gpt-5-4-mini.json | 9 + .../models/openai/gpt-5-4-nano-medium.json | 9 + .../openai/gpt-5-4-nano-non-reasoning.json | 9 + src/content/models/openai/gpt-5-4-nano.json | 9 + .../models/openai/gpt-5-4-non-reasoning.json | 9 + src/content/models/openai/gpt-5-4-pro.json | 9 + src/content/models/openai/gpt-5-4.json | 9 + src/content/models/sarvam/sarvam-105b.json | 9 + src/content/models/sarvam/sarvam-30b.json | 9 + .../models/sarvam/sarvam-m-reasoning.json | 9 + .../models/stepfun/step-3-5-flash.json | 9 + src/content/models/stepfun/step-3-vl-10b.json | 9 + .../apertus-70b-instruct.json | 9 + .../apertus-8b-instruct.json | 9 + .../trillionlabs/tri-21b-think-preview.json | 9 + .../trillionlabs/tri-21b-think-v0-5.json | 9 + .../models/xai/grok-4-20-non-reasoning.json | 9 + src/content/models/xai/grok-4-20.json | 9 + src/content/models/xiaomi/mimo-v2-omni.json | 9 + src/content/models/xiaomi/mimo-v2-pro.json | 9 + .../models/zai/glm-5-non-reasoning.json | 9 + src/content/models/zai/glm-5-turbo.json | 9 + src/content/publishers/inception.json | 6 + src/content/publishers/longcat.json | 6 + src/content/publishers/nanbeige.json | 6 + src/content/publishers/sarvam.json | 6 + .../publishers/swiss-ai-initiative.json | 6 + src/content/publishers/trillionlabs.json | 6 + 78 files changed, 2934 insertions(+), 357 deletions(-) create mode 100644 src/content/models/alibaba/qwen3-5-0-8b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-0-8b.json create mode 100644 src/content/models/alibaba/qwen3-5-122b-a10b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-122b-a10b.json create mode 100644 src/content/models/alibaba/qwen3-5-27b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-27b.json create mode 100644 src/content/models/alibaba/qwen3-5-2b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-2b.json create mode 100644 src/content/models/alibaba/qwen3-5-35b-a3b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-35b-a3b.json create mode 100644 src/content/models/alibaba/qwen3-5-397b-a17b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-397b-a17b.json create mode 100644 src/content/models/alibaba/qwen3-5-4b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-4b.json create mode 100644 src/content/models/alibaba/qwen3-5-9b-non-reasoning.json create mode 100644 src/content/models/alibaba/qwen3-5-9b.json create mode 100644 src/content/models/anthropic/claude-sonnet-4-6-adaptive.json create mode 100644 src/content/models/anthropic/claude-sonnet-4-6-non-reasoning-low-effort.json create mode 100644 src/content/models/anthropic/claude-sonnet-4-6.json create mode 100644 src/content/models/cohere/tiny-aya-global.json create mode 100644 src/content/models/google/gemini-3-1-flash-lite-preview.json create mode 100644 src/content/models/google/gemini-3-1-pro-preview.json create mode 100644 src/content/models/inception/mercury-2.json create mode 100644 src/content/models/liquidai/lfm2-24b-a2b.json create mode 100644 src/content/models/longcat/longcat-flash-lite.json create mode 100644 src/content/models/minimax/minimax-m2-5.json create mode 100644 src/content/models/minimax/minimax-m2-7.json create mode 100644 src/content/models/mistral/mistral-small-4-non-reasoning.json create mode 100644 src/content/models/mistral/mistral-small-4.json create mode 100644 src/content/models/nanbeige/nanbeige4-1-3b.json create mode 100644 src/content/models/nvidia/nvidia-nemotron-3-nano-4b.json create mode 100644 src/content/models/nvidia/nvidia-nemotron-3-super-120b-a12b.json create mode 100644 src/content/models/openai/gpt-5-3-codex.json create mode 100644 src/content/models/openai/gpt-5-4-mini-medium.json create mode 100644 src/content/models/openai/gpt-5-4-mini-non-reasoning.json create mode 100644 src/content/models/openai/gpt-5-4-mini.json create mode 100644 src/content/models/openai/gpt-5-4-nano-medium.json create mode 100644 src/content/models/openai/gpt-5-4-nano-non-reasoning.json create mode 100644 src/content/models/openai/gpt-5-4-nano.json create mode 100644 src/content/models/openai/gpt-5-4-non-reasoning.json create mode 100644 src/content/models/openai/gpt-5-4-pro.json create mode 100644 src/content/models/openai/gpt-5-4.json create mode 100644 src/content/models/sarvam/sarvam-105b.json create mode 100644 src/content/models/sarvam/sarvam-30b.json create mode 100644 src/content/models/sarvam/sarvam-m-reasoning.json create mode 100644 src/content/models/stepfun/step-3-5-flash.json create mode 100644 src/content/models/stepfun/step-3-vl-10b.json create mode 100644 src/content/models/swiss-ai-initiative/apertus-70b-instruct.json create mode 100644 src/content/models/swiss-ai-initiative/apertus-8b-instruct.json create mode 100644 src/content/models/trillionlabs/tri-21b-think-preview.json create mode 100644 src/content/models/trillionlabs/tri-21b-think-v0-5.json create mode 100644 src/content/models/xai/grok-4-20-non-reasoning.json create mode 100644 src/content/models/xai/grok-4-20.json create mode 100644 src/content/models/xiaomi/mimo-v2-omni.json create mode 100644 src/content/models/xiaomi/mimo-v2-pro.json create mode 100644 src/content/models/zai/glm-5-non-reasoning.json create mode 100644 src/content/models/zai/glm-5-turbo.json create mode 100644 src/content/publishers/inception.json create mode 100644 src/content/publishers/longcat.json create mode 100644 src/content/publishers/nanbeige.json create mode 100644 src/content/publishers/sarvam.json create mode 100644 src/content/publishers/swiss-ai-initiative.json create mode 100644 src/content/publishers/trillionlabs.json diff --git a/src/content/benchmarks/artificial_analysis_coding_index.json b/src/content/benchmarks/artificial_analysis_coding_index.json index 330b664..f294ba6 100644 --- a/src/content/benchmarks/artificial_analysis_coding_index.json +++ b/src/content/benchmarks/artificial_analysis_coding_index.json @@ -7,7 +7,7 @@ "tags": [ "Coding" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "Index", "isBetterHigher": true @@ -19,6 +19,26 @@ "initialWeight": 1000 }, "snapshot": [ + { + "modelRef": "openai/gpt-5-4", + "score": 57.3 + }, + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 55.5 + }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 53.1 + }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 51.5 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 50.9 + }, { "modelRef": "openai/gpt-5-2", "score": 48.7 @@ -43,6 +63,10 @@ "modelRef": "google/gemini-3-pro", "score": 46.5 }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 46.4 + }, { "modelRef": "openai/gpt-5-1", "score": 44.7 @@ -55,10 +79,18 @@ "modelRef": "openai/gpt-5-2-medium", "score": 44.2 }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 43.9 + }, { "modelRef": "openai/gpt-5-2-codex", "score": 43.0 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 43.0 + }, { "modelRef": "anthropic/claude-opus-4-5", "score": 42.9 @@ -67,6 +99,26 @@ "modelRef": "google/gemini-3-flash-reasoning", "score": 42.6 }, + { + "modelRef": "xai/grok-4-20", + "score": 42.2 + }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 41.9 + }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 41.4 + }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 41.3 + }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 41.0 + }, { "modelRef": "xai/grok-4", "score": 40.5 @@ -79,6 +131,10 @@ "modelRef": "google/gemini-3-pro-low", "score": 39.4 }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 39.0 + }, { "modelRef": "openai/gpt-5-medium", "score": 39.0 @@ -103,6 +159,22 @@ "modelRef": "google/gemini-3-flash", "score": 37.8 }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 37.5 + }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 37.4 + }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 37.4 + }, + { + "modelRef": "zai/glm-5-turbo", + "score": 36.8 + }, { "modelRef": "deepseek/deepseek-v3-2-reasoning", "score": 36.7 @@ -127,10 +199,22 @@ "modelRef": "openai/gpt-5", "score": 36.0 }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 35.5 + }, { "modelRef": "openai/gpt-5-mini", "score": 35.3 }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 35.0 + }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 34.9 + }, { "modelRef": "kimi/kimi-k2-thinking", "score": 34.8 @@ -139,6 +223,10 @@ "modelRef": "openai/gpt-5-2-non-reasoning", "score": 34.7 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 34.7 + }, { "modelRef": "deepseek/deepseek-v3-2", "score": 34.6 @@ -167,6 +255,10 @@ "modelRef": "anthropic/claude-4-5-sonnet", "score": 33.5 }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 33.4 + }, { "modelRef": "deepseek/deepseek-v3-2-reasoning-0925", "score": 33.3 @@ -199,10 +291,22 @@ "modelRef": "xiaomi/mimo-v2-flash-reasoning", "score": 31.8 }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 31.6 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 31.6 + }, { "modelRef": "bytedance-seed/doubao-seed-code", "score": 31.3 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 31.2 + }, { "modelRef": "xai/grok-4-1-fast-reasoning", "score": 30.9 @@ -211,6 +315,10 @@ "modelRef": "openai/gpt-5-low", "score": 30.7 }, + { + "modelRef": "inception/mercury-2", + "score": 30.6 + }, { "modelRef": "anthropic/claude-4-sonnet", "score": 30.6 @@ -223,6 +331,10 @@ "modelRef": "aws/nova-2-0-pro-reasoning-medium", "score": 30.4 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 30.3 + }, { "modelRef": "zai/glm-4-6", "score": 30.2 @@ -231,6 +343,10 @@ "modelRef": "anthropic/claude-35-sonnet", "score": 30.2 }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 30.1 + }, { "modelRef": "deepseek/deepseek-v3-2-0925", "score": 30.0 @@ -263,6 +379,10 @@ "modelRef": "deepseek/deepseek-v3-1", "score": 28.4 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 27.9 + }, { "modelRef": "anthropic/claude-3-7-sonnet-thinking", "score": 27.6 @@ -323,6 +443,18 @@ "modelRef": "alibaba/qwen3-max-preview", "score": 25.5 }, + { + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 25.4 + }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 25.3 + }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 25.3 + }, { "modelRef": "xai/grok-3-mini-reasoning", "score": 25.2 @@ -351,6 +483,10 @@ "modelRef": "alibaba/qwen3-max-thinking-preview", "score": 24.5 }, + { + "modelRef": "mistral/mistral-small-4", + "score": 24.3 + }, { "modelRef": "openai/gpt-4o-2024-05-13", "score": 24.2 @@ -439,6 +575,10 @@ "modelRef": "openai/gpt-4-turbo", "score": 21.5 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 21.4 + }, { "modelRef": "openai/gpt-5-chatgpt", "score": 21.2 @@ -551,6 +691,10 @@ "modelRef": "naver/hyperclova-x-seed-think-32b", "score": 17.5 }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 17.5 + }, { "modelRef": "alibaba/qwen3-235b-a22b-instruct-reasoning", "score": 17.4 @@ -563,6 +707,10 @@ "modelRef": "inclusionai/ring-1t", "score": 16.8 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 16.8 + }, { "modelRef": "openai/gpt-4o", "score": 16.7 @@ -579,10 +727,18 @@ "modelRef": "openai/gpt-4o-2024-08-06", "score": 16.6 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 16.5 + }, { "modelRef": "alibaba/qwen3-vl-235b-a22b-instruct", "score": 16.5 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 16.4 + }, { "modelRef": "deepseek/deepseek-v3", "score": 16.4 @@ -712,7 +868,7 @@ "score": 14.0 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 13.9 }, { @@ -739,6 +895,10 @@ "modelRef": "alibaba/qwen3-32b-instruct-reasoning", "score": 13.8 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 13.7 + }, { "modelRef": "mistral/mistral-medium-3", "score": 13.6 @@ -915,6 +1075,10 @@ "modelRef": "ai2/olmo-3-32b-think", "score": 10.5 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 10.0 + }, { "modelRef": "mistral/ministral-3-8b", "score": 10.0 @@ -927,6 +1091,10 @@ "modelRef": "tii-uae/falcon-h1r-7b", "score": 9.8 }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 9.8 + }, { "modelRef": "alibaba/qwen3-vl-8b-reasoning", "score": 9.8 @@ -971,6 +1139,10 @@ "modelRef": "reka-ai/reka-flash-3", "score": 8.9 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 8.9 + }, { "modelRef": "ibm/granite-4-0-h-small", "score": 8.5 @@ -979,6 +1151,10 @@ "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2-reasoning", "score": 8.3 }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 7.9 + }, { "modelRef": "deepseek/deepseek-r1-qwen3-8b", "score": 7.8 @@ -999,10 +1175,18 @@ "modelRef": "ai2/olmo-3-7b-think", "score": 7.6 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 7.5 + }, { "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2", "score": 7.5 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 7.4 + }, { "modelRef": "google/gemini-2-5-flash-lite", "score": 7.4 @@ -1027,10 +1211,18 @@ "modelRef": "meta/llama-4-scout", "score": 6.7 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 6.7 + }, { "modelRef": "alibaba/qwen3-vl-4b-reasoning", "score": 6.7 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 6.3 + }, { "modelRef": "google/gemma-3-12b", "score": 6.3 @@ -1059,6 +1251,10 @@ "modelRef": "meta/llama-3-1-instruct-8b", "score": 4.9 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 4.9 + }, { "modelRef": "mistral/ministral-3-3b", "score": 4.8 @@ -1091,10 +1287,18 @@ "modelRef": "meta/llama-3-instruct-8b", "score": 4.0 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 3.6 + }, { "modelRef": "azure/phi-4-mini", "score": 3.6 }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 3.5 + }, { "modelRef": "ibm/granite-3-3-8b-instruct", "score": 3.4 @@ -1151,6 +1355,14 @@ "modelRef": "google/gemma-3n-e2b", "score": 2.2 }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 1.9 + }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 1.4 + }, { "modelRef": "liquidai/lfm2-5-1-2b-thinking", "score": 1.4 @@ -1167,6 +1379,10 @@ "modelRef": "alibaba/qwen3-0-6b-instruct", "score": 1.4 }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 1.2 + }, { "modelRef": "ai2/olmo-2-7b", "score": 1.2 @@ -1179,6 +1395,10 @@ "modelRef": "liquidai/lfm2-5-vl-1-6b", "score": 1.0 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 1.0 + }, { "modelRef": "alibaba/qwen3-0-6b-instruct-reasoning", "score": 0.9 @@ -1209,7 +1429,11 @@ }, { "modelRef": "google/gemma-3-270m", - "score": 0.1 + "score": 0.0 + }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 0.0 } ] } \ No newline at end of file diff --git a/src/content/benchmarks/artificial_analysis_intelligence_index.json b/src/content/benchmarks/artificial_analysis_intelligence_index.json index b82b467..3790485 100644 --- a/src/content/benchmarks/artificial_analysis_intelligence_index.json +++ b/src/content/benchmarks/artificial_analysis_intelligence_index.json @@ -7,7 +7,7 @@ "tags": [ "Knowledge" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "Index", "isBetterHigher": true @@ -19,113 +19,193 @@ "initialWeight": 1000 }, "snapshot": [ + { + "modelRef": "openai/gpt-5-4", + "score": 57.2 + }, + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 57.2 + }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 54.0 + }, { "modelRef": "anthropic/claude-opus-4-6-adaptive", "score": 53.0 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 51.7 + }, { "modelRef": "openai/gpt-5-2", - "score": 51.2 + "score": 51.3 + }, + { + "modelRef": "zai/glm-5", + "score": 49.8 }, { "modelRef": "anthropic/claude-opus-4-5-thinking", "score": 49.7 }, { - "modelRef": "zai/glm-5", + "modelRef": "minimax/minimax-m2-7", "score": 49.6 }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 49.2 + }, { "modelRef": "openai/gpt-5-2-codex", "score": 49.0 }, + { + "modelRef": "xai/grok-4-20", + "score": 48.5 + }, { "modelRef": "google/gemini-3-pro", "score": 48.4 }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 48.1 + }, { "modelRef": "openai/gpt-5-1", - "score": 47.6 + "score": 47.7 + }, + { + "modelRef": "zai/glm-5-turbo", + "score": 46.8 }, { "modelRef": "kimi/kimi-k2-5", - "score": 46.7 + "score": 46.8 }, { "modelRef": "openai/gpt-5-2-medium", "score": 46.6 }, + { + "modelRef": "anthropic/claude-opus-4-6", + "score": 46.5 + }, { "modelRef": "google/gemini-3-flash-reasoning", "score": 46.4 }, { - "modelRef": "anthropic/claude-opus-4-6", - "score": 46.4 + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 45.0 + }, + { + "modelRef": "openai/gpt-5-codex", + "score": 44.6 }, { "modelRef": "openai/gpt-5", "score": 44.6 }, { - "modelRef": "openai/gpt-5-codex", - "score": 44.5 + "modelRef": "openai/gpt-5-4-nano", + "score": 44.4 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 44.4 + }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 43.4 + }, + { + "modelRef": "openai/gpt-5-1-codex", + "score": 43.1 }, { "modelRef": "anthropic/claude-opus-4-5", - "score": 43.0 + "score": 43.1 }, { "modelRef": "anthropic/claude-4-5-sonnet-thinking", - "score": 42.9 + "score": 43.0 }, { - "modelRef": "openai/gpt-5-1-codex", - "score": 42.2 + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 42.6 }, { "modelRef": "zai/glm-4-7", - "score": 42.0 + "score": 42.1 + }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 42.1 }, { "modelRef": "openai/gpt-5-medium", - "score": 41.8 + "score": 42.0 + }, + { + "modelRef": "anthropic/claude-4-1-opus-thinking", + "score": 42.0 + }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 41.9 }, { "modelRef": "deepseek/deepseek-v3-2-reasoning", + "score": 41.7 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", "score": 41.6 }, { "modelRef": "xiaomi/mimo-v2-0206", - "score": 41.4 + "score": 41.5 }, { "modelRef": "xai/grok-4", - "score": 41.4 + "score": 41.5 }, { "modelRef": "google/gemini-3-pro-low", - "score": 41.1 + "score": 41.3 }, { "modelRef": "openai/gpt-5-mini", - "score": 41.0 + "score": 41.2 + }, + { + "modelRef": "kimi/kimi-k2-thinking", + "score": 40.9 }, { "modelRef": "openai/o3-pro", "score": 40.7 }, { - "modelRef": "kimi/kimi-k2-thinking", - "score": 40.7 + "modelRef": "zai/glm-5-non-reasoning", + "score": 40.6 + }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 40.1 }, { "modelRef": "alibaba/qwen3-max-thinking", - "score": 39.7 + "score": 39.9 }, { "modelRef": "minimax/minimax-m2-1", - "score": 39.5 + "score": 39.4 }, { "modelRef": "xiaomi/mimo-v2-flash-reasoning", @@ -133,30 +213,50 @@ }, { "modelRef": "openai/gpt-5-low", + "score": 39.2 + }, + { + "modelRef": "anthropic/claude-4-opus-thinking", "score": 39.0 }, { "modelRef": "openai/gpt-5-mini-medium", - "score": 38.8 + "score": 38.9 }, { "modelRef": "anthropic/claude-4-sonnet-thinking", - "score": 38.6 + "score": 38.7 }, { "modelRef": "xai/grok-4-1-fast-reasoning", - "score": 38.5 + "score": 38.6 }, { "modelRef": "openai/gpt-5-1-codex-mini", - "score": 38.5 + "score": 38.6 }, { "modelRef": "openai/o3", - "score": 38.3 + "score": 38.4 + }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 38.1 + }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 37.8 + }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 37.7 }, { "modelRef": "kimi/kimi-k2-5-non-reasoning", + "score": 37.3 + }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", "score": 37.2 }, { @@ -165,50 +265,70 @@ }, { "modelRef": "anthropic/claude-4-5-haiku-reasoning", - "score": 37.0 + "score": 37.1 }, { - "modelRef": "kwaikat/kat-coder-pro-v1", - "score": 36.1 + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 37.1 }, { "modelRef": "minimax/minimax-m2", + "score": 36.1 + }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", "score": 36.0 }, + { + "modelRef": "kwaikat/kat-coder-pro-v1", + "score": 36.0 + }, + { + "modelRef": "anthropic/claude-4-1-opus", + "score": 36.0 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 35.9 + }, { "modelRef": "aws/nova-2-0-pro-reasoning-medium", - "score": 35.6 + "score": 35.7 }, { - "modelRef": "google/gemini-3-flash", - "score": 35.1 + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 35.4 }, { "modelRef": "xai/grok-4-fast-reasoning", - "score": 34.9 + "score": 35.1 + }, + { + "modelRef": "google/gemini-3-flash", + "score": 35.0 }, { "modelRef": "anthropic/claude-3-7-sonnet-thinking", - "score": 34.6 + "score": 34.7 }, { "modelRef": "google/gemini-2-5-pro", - "score": 34.5 + "score": 34.6 }, { "modelRef": "zai/glm-4-7-non-reasoning", - "score": 34.1 - }, - { - "modelRef": "deepseek/deepseek-v3-2-speciale", - "score": 34.1 + "score": 34.2 }, { "modelRef": "deepseek/deepseek-v3-1-terminus-reasoning", - "score": 33.8 + "score": 33.9 }, { "modelRef": "openai/gpt-5-2-non-reasoning", + "score": 33.6 + }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", "score": 33.5 }, { @@ -221,16 +341,24 @@ }, { "modelRef": "openai/o4-mini", - "score": 33.0 + "score": 33.1 }, { "modelRef": "anthropic/claude-4-sonnet", "score": 33.0 }, + { + "modelRef": "anthropic/claude-4-opus", + "score": 33.0 + }, { "modelRef": "deepseek/deepseek-v3-2-reasoning-0925", "score": 32.9 }, + { + "modelRef": "inception/mercury-2", + "score": 32.8 + }, { "modelRef": "zai/glm-4-6-reasoning", "score": 32.5 @@ -240,28 +368,28 @@ "score": 32.5 }, { - "modelRef": "lg/k-exaone", - "score": 32.1 + "modelRef": "alibaba/qwen3-5-9b", + "score": 32.4 }, { - "modelRef": "deepseek/deepseek-v3-2", + "modelRef": "xai/grok-3-mini-reasoning", "score": 32.1 }, { - "modelRef": "xai/grok-3-mini-reasoning", - "score": 32.0 + "modelRef": "lg/k-exaone", + "score": 32.1 }, { - "modelRef": "aws/nova-2-0-pro-reasoning-low", - "score": 31.9 + "modelRef": "deepseek/deepseek-v3-2", + "score": 32.1 }, { - "modelRef": "anthropic/claude-4-1-opus-thinking", + "modelRef": "aws/nova-2-0-pro-reasoning-low", "score": 31.9 }, { "modelRef": "alibaba/qwen3-max", - "score": 31.3 + "score": 31.4 }, { "modelRef": "google/gemini-2-5-flash-preview-09-2025-reasoning", @@ -269,10 +397,14 @@ }, { "modelRef": "anthropic/claude-4-5-haiku", - "score": 31.0 + "score": 31.1 }, { "modelRef": "kimi/kimi-k2-0905", + "score": 30.9 + }, + { + "modelRef": "openai/o1", "score": 30.8 }, { @@ -280,28 +412,32 @@ "score": 30.8 }, { - "modelRef": "openai/o1", + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", "score": 30.7 }, { "modelRef": "xiaomi/mimo-v2-flash", - "score": 30.6 + "score": 30.4 }, { "modelRef": "google/gemini-2-5-pro-03-25", "score": 30.3 }, + { + "modelRef": "zai/glm-4-6", + "score": 30.2 + }, { "modelRef": "zai/glm-4-7-flash", "score": 30.1 }, { - "modelRef": "zai/glm-4-6", - "score": 30.1 + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 29.7 }, { "modelRef": "aws/nova-2-0-lite-reasoning-medium", - "score": 29.6 + "score": 29.7 }, { "modelRef": "google/gemini-2-5-pro-05-06", @@ -311,6 +447,10 @@ "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507-reasoning", "score": 29.5 }, + { + "modelRef": "deepseek/deepseek-v3-2-speciale", + "score": 29.4 + }, { "modelRef": "baidu/ernie-5-0-thinking-preview", "score": 29.1 @@ -321,82 +461,98 @@ }, { "modelRef": "deepseek/deepseek-v3-1-terminus", - "score": 28.4 + "score": 28.5 }, { - "modelRef": "servicenow/apriel-v1-5-15b-thinker", - "score": 28.3 + "modelRef": "deepseek/deepseek-v3-2-0925", + "score": 28.4 }, { - "modelRef": "deepseek/deepseek-v3-2-0925", + "modelRef": "servicenow/apriel-v1-5-15b-thinker", "score": 28.3 }, { "modelRef": "alibaba/qwen3-coder-next", - "score": 28.1 + "score": 28.3 }, { "modelRef": "deepseek/deepseek-v3-1", - "score": 28.0 + "score": 28.1 }, { "modelRef": "aws/nova-2-0-omni-reasoning-medium", - "score": 27.9 + "score": 28.0 }, { "modelRef": "deepseek/deepseek-v3-1-reasoning", - "score": 27.6 + "score": 27.7 }, { "modelRef": "servicenow/apriel-v1-6-15b-thinker", - "score": 27.5 + "score": 27.6 }, { "modelRef": "alibaba/qwen3-vl-235b-a22b-reasoning", - "score": 27.5 + "score": 27.6 }, { "modelRef": "openai/gpt-5-1-non-reasoning", "score": 27.4 }, { - "modelRef": "anthropic/claude-4-opus-thinking", - "score": 27.4 + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 27.3 }, { "modelRef": "mistral/magistral-medium-2509", - "score": 27.0 + "score": 27.1 }, { "modelRef": "deepseek/deepseek-r1", - "score": 27.0 + "score": 27.1 + }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 27.1 }, { "modelRef": "google/gemini-2-5-flash-reasoning", - "score": 26.8 + "score": 27.0 + }, + { + "modelRef": "mistral/mistral-small-4", + "score": 26.9 }, { "modelRef": "openai/gpt-5-nano", - "score": 26.7 + "score": 26.8 }, { "modelRef": "alibaba/qwen3-next-80b-a3b-reasoning", - "score": 26.5 + "score": 26.7 }, { "modelRef": "zai/glm-4-5", - "score": 26.2 + "score": 26.4 + }, + { + "modelRef": "openai/gpt-4-1", + "score": 26.3 }, { "modelRef": "kimi/kimi-k2", - "score": 26.2 + "score": 26.3 + }, + { + "modelRef": "alibaba/qwen3-max-preview", + "score": 26.1 }, { "modelRef": "openai/o3-mini", "score": 25.9 }, { - "modelRef": "alibaba/qwen3-max-preview", + "modelRef": "openai/gpt-5-nano-medium", "score": 25.9 }, { @@ -404,31 +560,31 @@ "score": 25.8 }, { - "modelRef": "openai/gpt-5-nano-medium", + "modelRef": "google/gemini-2-5-flash-preview-09-2025", "score": 25.7 }, { - "modelRef": "openai/gpt-4-1", - "score": 25.6 + "modelRef": "xai/grok-3", + "score": 25.2 }, { - "modelRef": "google/gemini-2-5-flash-preview-09-2025", - "score": 25.5 + "modelRef": "openai/o3-mini-high", + "score": 25.2 }, { - "modelRef": "openai/o3-mini-high", - "score": 25.1 + "modelRef": "bytedance-seed/seed-oss-36b-instruct", + "score": 25.2 }, { - "modelRef": "xai/grok-3", + "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507", "score": 25.0 }, { - "modelRef": "bytedance-seed/seed-oss-36b-instruct", - "score": 25.0 + "modelRef": "alibaba/qwen3-coder-480b-a35b-instruct", + "score": 24.8 }, { - "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507", + "modelRef": "alibaba/qwen3-vl-32b-reasoning", "score": 24.7 }, { @@ -436,7 +592,7 @@ "score": 24.6 }, { - "modelRef": "alibaba/qwen3-coder-480b-a35b-instruct", + "modelRef": "aws/nova-2-0-lite-reasoning-low", "score": 24.6 }, { @@ -444,19 +600,19 @@ "score": 24.5 }, { - "modelRef": "mbzuai/k2-think-v2", + "modelRef": "openai/gpt-oss-120b-low", "score": 24.5 }, { - "modelRef": "alibaba/qwen3-vl-32b-reasoning", - "score": 24.5 + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 24.4 }, { - "modelRef": "nvidia/nvidia-nemotron-3-nano-30b-a3b-reasoning", - "score": 24.3 + "modelRef": "minimax/minimax-m1-80k", + "score": 24.4 }, { - "modelRef": "minimax/minimax-m1-80k", + "modelRef": "nvidia/nvidia-nemotron-3-nano-30b-a3b-reasoning", "score": 24.3 }, { @@ -464,19 +620,19 @@ "score": 24.3 }, { - "modelRef": "aws/nova-2-0-lite-reasoning-low", - "score": 24.2 + "modelRef": "mbzuai/k2-think-v2", + "score": 24.1 }, { - "modelRef": "openai/gpt-oss-120b-low", + "modelRef": "openai/gpt-5-minimal", "score": 23.9 }, { - "modelRef": "openai/o1-preview", - "score": 23.7 + "modelRef": "longcat/longcat-flash-lite", + "score": 23.9 }, { - "modelRef": "openai/gpt-5-minimal", + "modelRef": "openai/o1-preview", "score": 23.7 }, { @@ -484,16 +640,20 @@ "score": 23.7 }, { - "modelRef": "anthropic/claude-4-1-opus", + "modelRef": "xai/grok-4-1-fast", "score": 23.6 }, { "modelRef": "zai/glm-4-6v-reasoning", - "score": 23.5 + "score": 23.4 }, { - "modelRef": "xai/grok-4-1-fast", - "score": 23.5 + "modelRef": "lg/k-exaone-non-reasoning", + "score": 23.4 + }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 23.3 }, { "modelRef": "zai/glm-4-5-air", @@ -504,47 +664,47 @@ "score": 23.2 }, { - "modelRef": "lg/k-exaone-non-reasoning", - "score": 23.0 + "modelRef": "xai/grok-4-fast", + "score": 23.1 }, { "modelRef": "korea-telecom/mi-dm-k-2-5-pro-dec28", - "score": 23.0 + "score": 23.1 }, { "modelRef": "aws/nova-2-0-pro", - "score": 22.9 - }, - { - "modelRef": "mistral/mistral-large-3", - "score": 22.7 + "score": 23.1 }, { - "modelRef": "xai/grok-4-fast", - "score": 22.6 + "modelRef": "openai/gpt-4-1-mini", + "score": 22.9 }, { - "modelRef": "mistral/magistral-small-2509", - "score": 22.5 + "modelRef": "mistral/mistral-large-3", + "score": 22.8 }, { "modelRef": "inclusionai/ring-1t", - "score": 22.5 + "score": 22.8 }, { - "modelRef": "openai/gpt-4-1-mini", - "score": 22.4 + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 22.6 }, { "modelRef": "alibaba/qwen3-30b-a3b-2507-reasoning", "score": 22.4 }, { - "modelRef": "anthropic/claude-4-opus", - "score": 22.2 + "modelRef": "deepseek/deepseek-v3-0324", + "score": 22.3 }, { "modelRef": "prime-intellect/intellect-3", + "score": 22.2 + }, + { + "modelRef": "zai/glm-4-7-flash-non-reasoning", "score": 22.1 }, { @@ -556,28 +716,20 @@ "score": 21.8 }, { - "modelRef": "deepseek/deepseek-v3-0324", - "score": 21.8 + "modelRef": "upstage/solar-open-100b-reasoning", + "score": 21.7 }, { "modelRef": "xai/grok-3-reasoning", "score": 21.6 }, - { - "modelRef": "upstage/solar-open-100b-reasoning", - "score": 21.6 - }, { "modelRef": "google/gemini-2-5-flash-lite-preview-09-2025-reasoning", "score": 21.6 }, - { - "modelRef": "zai/glm-4-7-flash-non-reasoning", - "score": 21.5 - }, { "modelRef": "mistral/mistral-medium-3-1", - "score": 21.1 + "score": 21.3 }, { "modelRef": "minimax/minimax-m1-40k", @@ -588,20 +740,20 @@ "score": 20.8 }, { - "modelRef": "openai/gpt-5-mini-minimal", - "score": 20.7 + "modelRef": "alibaba/qwen3-vl-235b-a22b-instruct", + "score": 20.8 }, { - "modelRef": "mbzuai/k2-v2", + "modelRef": "openai/gpt-5-mini-minimal", "score": 20.7 }, { - "modelRef": "alibaba/qwen3-vl-235b-a22b-instruct", + "modelRef": "mbzuai/k2-v2", "score": 20.6 }, { "modelRef": "google/gemini-2-5-flash", - "score": 20.5 + "score": 20.6 }, { "modelRef": "openai/o1-mini", @@ -611,6 +763,10 @@ "modelRef": "alibaba/qwen3-next-80b-a3b-instruct", "score": 20.1 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 20.0 + }, { "modelRef": "openai/gpt-4-5", "score": 20.0 @@ -627,22 +783,22 @@ "modelRef": "alibaba/qwq-32b", "score": 19.7 }, + { + "modelRef": "alibaba/qwen3-vl-30b-a3b-reasoning", + "score": 19.7 + }, { "modelRef": "google/gemini-2-0-flash-thinking-exp-0121", "score": 19.6 }, { - "modelRef": "alibaba/qwen3-vl-30b-a3b-reasoning", - "score": 19.6 + "modelRef": "mistral/devstral-small-2", + "score": 19.5 }, { "modelRef": "google/gemini-2-5-flash-lite-preview-09-2025", "score": 19.4 }, - { - "modelRef": "mistral/devstral-small-2", - "score": 19.3 - }, { "modelRef": "motif-technologies/motif-2-12-7b", "score": 19.1 @@ -653,14 +809,18 @@ }, { "modelRef": "aws/nova-premier", - "score": 18.9 + "score": 19.0 }, { "modelRef": "upstage/solar-pro-2-preview-reasoning", "score": 18.8 }, { - "modelRef": "openai/gpt-4o-2024-08-06", + "modelRef": "mistral/mistral-medium-3", + "score": 18.8 + }, + { + "modelRef": "mistral/magistral-medium", "score": 18.8 }, { @@ -668,7 +828,11 @@ "score": 18.8 }, { - "modelRef": "mistral/magistral-medium", + "modelRef": "nvidia/llama-nemotron-super-49b-v1-5-reasoning", + "score": 18.7 + }, + { + "modelRef": "mistral/devstral-medium", "score": 18.7 }, { @@ -680,23 +844,23 @@ "score": 18.7 }, { - "modelRef": "openai/gpt-4o-chatgpt-03-25", + "modelRef": "trillionlabs/tri-21b-think-v0-5", "score": 18.6 }, { - "modelRef": "nvidia/llama-nemotron-super-49b-v1-5-reasoning", + "modelRef": "openai/gpt-4o-chatgpt-03-25", "score": 18.6 }, { - "modelRef": "nous-research/hermes-4-llama-3-1-405b-reasoning", + "modelRef": "openai/gpt-4o-2024-08-06", "score": 18.6 }, { - "modelRef": "mistral/devstral-medium", + "modelRef": "nous-research/hermes-4-llama-3-1-405b-reasoning", "score": 18.6 }, { - "modelRef": "alibaba/qwen3-4b-2507-instruct-reasoning", + "modelRef": "mistral/mistral-small-4-non-reasoning", "score": 18.6 }, { @@ -709,7 +873,19 @@ }, { "modelRef": "meta/llama-4-maverick", - "score": 18.3 + "score": 18.4 + }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 18.2 + }, + { + "modelRef": "mistral/magistral-small-2509", + "score": 18.2 + }, + { + "modelRef": "alibaba/qwen3-4b-2507-instruct-reasoning", + "score": 18.2 }, { "modelRef": "google/gemini-2-0-pro-experimental-02-05", @@ -720,11 +896,15 @@ "score": 18.0 }, { - "modelRef": "perplexity/sonar-reasoning", - "score": 17.9 + "modelRef": "aws/nova-2-0-lite", + "score": 18.0 }, { - "modelRef": "aws/nova-2-0-lite", + "modelRef": "anthropic/claude-3-opus", + "score": 18.0 + }, + { + "modelRef": "perplexity/sonar-reasoning", "score": 17.9 }, { @@ -732,13 +912,21 @@ "score": 17.8 }, { - "modelRef": "mistral/mistral-medium-3", + "modelRef": "nous-research/hermes-4-llama-3-1-405b", "score": 17.6 }, { "modelRef": "google/gemini-2-5-flash-lite-reasoning", + "score": 17.6 + }, + { + "modelRef": "meta/llama-3-1-instruct-405b", "score": 17.4 }, + { + "modelRef": "openai/gpt-4o", + "score": 17.3 + }, { "modelRef": "deepseek/deepseek-r1-distill-qwen-32b", "score": 17.2 @@ -751,13 +939,9 @@ "modelRef": "zai/glm-4-6v", "score": 17.1 }, - { - "modelRef": "nous-research/hermes-4-llama-3-1-405b", - "score": 17.1 - }, { "modelRef": "alibaba/qwen3-235b-a22b-instruct", - "score": 16.9 + "score": 17.0 }, { "modelRef": "mistral/magistral-small", @@ -769,27 +953,31 @@ }, { "modelRef": "lg/exaone-4-0-32b-reasoning", - "score": 16.6 + "score": 16.7 + }, + { + "modelRef": "alibaba/qwen3-vl-8b-reasoning", + "score": 16.7 }, { "modelRef": "aws/nova-2-0-omni", "score": 16.6 }, { - "modelRef": "alibaba/qwen3-vl-8b-reasoning", - "score": 16.6 + "modelRef": "deepseek/deepseek-v3", + "score": 16.5 }, { "modelRef": "alibaba/qwen3-32b-instruct-reasoning", "score": 16.5 }, { - "modelRef": "deepseek/deepseek-v3", + "modelRef": "deepseek/deepseek-r1-qwen3-8b", "score": 16.4 }, { - "modelRef": "deepseek/deepseek-r1-qwen3-8b", - "score": 16.4 + "modelRef": "alibaba/qwen3-5-2b", + "score": 16.3 }, { "modelRef": "alibaba/qwen-2-5-max", @@ -799,6 +987,14 @@ "modelRef": "alibaba/qwen3-14b-instruct-reasoning", "score": 16.2 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 16.1 + }, + { + "modelRef": "alibaba/qwen3-vl-30b-a3b-instruct", + "score": 16.1 + }, { "modelRef": "upstage/solar-pro-2-preview", "score": 16.0 @@ -819,10 +1015,6 @@ "modelRef": "deepseek/deepseek-r1-distill-llama-70b", "score": 16.0 }, - { - "modelRef": "alibaba/qwen3-vl-30b-a3b-instruct", - "score": 16.0 - }, { "modelRef": "anthropic/claude-35-sonnet", "score": 15.9 @@ -835,6 +1027,10 @@ "modelRef": "deepseek/deepseek-r1-distill-qwen-14b", "score": 15.8 }, + { + "modelRef": "inclusionai/ling-flash-2-0", + "score": 15.7 + }, { "modelRef": "alibaba/qwen3-omni-30b-a3b-reasoning", "score": 15.6 @@ -848,11 +1044,7 @@ "score": 15.5 }, { - "modelRef": "inclusionai/ling-flash-2-0", - "score": 15.5 - }, - { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 15.4 }, { @@ -872,11 +1064,15 @@ "score": 15.2 }, { - "modelRef": "mistral/mistral-large-2", + "modelRef": "zai/glm-4-5v-reasoning", "score": 15.1 }, { - "modelRef": "inclusionai/ling-mini-2-0", + "modelRef": "mistral/mistral-small-3-2", + "score": 15.1 + }, + { + "modelRef": "mistral/mistral-large-2", "score": 15.1 }, { @@ -884,47 +1080,43 @@ "score": 15.0 }, { - "modelRef": "mistral/mistral-small-3-2", + "modelRef": "baidu/ernie-4-5-300b-a47b", "score": 15.0 }, { "modelRef": "alibaba/qwen3-30b-a3b-2507", "score": 15.0 }, - { - "modelRef": "zai/glm-4-5v-reasoning", - "score": 14.9 - }, { "modelRef": "upstage/solar-pro-2-reasoning", "score": 14.9 }, { - "modelRef": "baidu/ernie-4-5-300b-a47b", - "score": 14.9 - }, - { - "modelRef": "alibaba/qwen3-vl-4b-reasoning", + "modelRef": "nvidia/nvidia-nemotron-nano-12b-v2-vl-reasoning", "score": 14.9 }, { - "modelRef": "openai/gpt-4o", + "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2-reasoning", "score": 14.8 }, { - "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2-reasoning", + "modelRef": "mistral/ministral-3-8b", "score": 14.8 }, { - "modelRef": "nvidia/nvidia-nemotron-nano-12b-v2-vl-reasoning", - "score": 14.8 + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 14.7 }, { "modelRef": "google/gemini-2-0-flash-lite-001", "score": 14.7 }, { - "modelRef": "mistral/ministral-3-8b", + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 14.7 + }, + { + "modelRef": "nvidia/llama-nemotron-super-49b-v1-5", "score": 14.6 }, { @@ -932,7 +1124,11 @@ "score": 14.5 }, { - "modelRef": "nvidia/llama-nemotron-super-49b-v1-5", + "modelRef": "mistral/mistral-small-3-1", + "score": 14.5 + }, + { + "modelRef": "meta/llama-3-3-instruct-70b", "score": 14.5 }, { @@ -955,10 +1151,6 @@ "modelRef": "kimi/kimi-linear-48b-a3b-instruct", "score": 14.4 }, - { - "modelRef": "reka-ai/reka-flash-3", - "score": 14.3 - }, { "modelRef": "nvidia/llama-3-3-nemotron-super-49b", "score": 14.3 @@ -967,14 +1159,6 @@ "modelRef": "alibaba/qwen3-vl-8b-instruct", "score": 14.3 }, - { - "modelRef": "meta/llama-3-3-instruct-70b", - "score": 14.2 - }, - { - "modelRef": "meta/llama-3-1-instruct-405b", - "score": 14.2 - }, { "modelRef": "anthropic/claude-35-sonnet-june-24", "score": 14.2 @@ -983,10 +1167,6 @@ "modelRef": "alibaba/qwen3-4b-instruct-reasoning", "score": 14.2 }, - { - "modelRef": "ai2/olmo-3-1-32b-think", - "score": 14.2 - }, { "modelRef": "openai/gpt-4o-chatgpt", "score": 14.1 @@ -999,10 +1179,6 @@ "modelRef": "mistral/pixtral-large-2411", "score": 14.0 }, - { - "modelRef": "mistral/mistral-small-3-1", - "score": 14.0 - }, { "modelRef": "inclusionai/ring-flash-2-0", "score": 14.0 @@ -1011,28 +1187,36 @@ "modelRef": "xai/grok-2-1212", "score": 13.9 }, + { + "modelRef": "ai2/olmo-3-1-32b-think", + "score": 13.9 + }, + { + "modelRef": "openai/gpt-5-nano-minimal", + "score": 13.8 + }, { "modelRef": "google/gemini-1-5-flash", "score": 13.8 }, { - "modelRef": "openai/gpt-5-nano-minimal", + "modelRef": "openai/gpt-4-turbo", "score": 13.7 }, { - "modelRef": "openai/gpt-4-turbo", + "modelRef": "alibaba/qwen3-vl-4b-reasoning", "score": 13.7 }, { - "modelRef": "nous-research/hermes-4-llama-3-1-70b", + "modelRef": "upstage/solar-pro-2", "score": 13.6 }, { - "modelRef": "upstage/solar-pro-2", + "modelRef": "meta/llama-4-scout", "score": 13.5 }, { - "modelRef": "meta/llama-4-scout", + "modelRef": "cohere/command-a", "score": 13.5 }, { @@ -1043,24 +1227,20 @@ "modelRef": "nvidia/llama-3-1-nemotron-instruct-70b", "score": 13.4 }, - { - "modelRef": "cohere/command-a", - "score": 13.4 - }, { "modelRef": "xai/grok-beta", "score": 13.3 }, { - "modelRef": "nvidia/nvidia-nemotron-3-nano-30b-a3b", - "score": 13.3 + "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2", + "score": 13.2 }, { - "modelRef": "azure/phi-4", + "modelRef": "nvidia/nvidia-nemotron-3-nano-30b-a3b", "score": 13.2 }, { - "modelRef": "alibaba/qwen3-4b-2507-instruct", + "modelRef": "alibaba/qwen3-8b-instruct-reasoning", "score": 13.2 }, { @@ -1068,23 +1248,15 @@ "score": 13.2 }, { - "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2", - "score": 13.1 - }, - { - "modelRef": "meta/llama-3-1-instruct-70b", - "score": 13.1 - }, - { - "modelRef": "alibaba/qwen3-8b-instruct-reasoning", - "score": 13.1 + "modelRef": "openai/gpt-4-1-nano", + "score": 13.0 }, { "modelRef": "mistral/mistral-large-2407", "score": 13.0 }, { - "modelRef": "openai/gpt-4-1-nano", + "modelRef": "alibaba/qwen3-4b-2507-instruct", "score": 12.9 }, { @@ -1095,12 +1267,24 @@ "modelRef": "openai/gpt-4", "score": 12.8 }, + { + "modelRef": "alibaba/qwen3-14b-instruct", + "score": 12.8 + }, + { + "modelRef": "zai/glm-4-5v", + "score": 12.7 + }, { "modelRef": "mistral/mistral-small-3", "score": 12.7 }, { - "modelRef": "alibaba/qwen3-14b-instruct", + "modelRef": "google/gemini-2-5-flash-lite", + "score": 12.7 + }, + { + "modelRef": "aws/nova-lite", "score": 12.7 }, { @@ -1108,32 +1292,28 @@ "score": 12.6 }, { - "modelRef": "zai/glm-4-5v", - "score": 12.5 + "modelRef": "nous-research/hermes-4-llama-3-1-70b", + "score": 12.6 }, { - "modelRef": "google/gemini-2-5-flash-lite", + "modelRef": "meta/llama-3-1-instruct-70b", "score": 12.5 }, { "modelRef": "deepseek/deepseek-v2-5", "score": 12.5 }, - { - "modelRef": "anthropic/claude-3-opus", - "score": 12.5 - }, { "modelRef": "alibaba/qwen3-4b-instruct", "score": 12.5 }, { - "modelRef": "aws/nova-lite", - "score": 12.4 + "modelRef": "alibaba/qwen3-30b-a3b-instruct", + "score": 12.5 }, { - "modelRef": "alibaba/qwen3-30b-a3b-instruct", - "score": 12.4 + "modelRef": "sarvam/sarvam-30b", + "score": 12.3 }, { "modelRef": "google/gemini-2-0-flash-thinking-exp-1219", @@ -1143,6 +1323,14 @@ "modelRef": "deepseek/deepseek-v2-5-sep-2024", "score": 12.3 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 12.3 + }, + { + "modelRef": "ai2/olmo-3-1-32b-instruct", + "score": 12.2 + }, { "modelRef": "mistral/mistral-saba", "score": 12.1 @@ -1151,6 +1339,10 @@ "modelRef": "deepseek/deepseek-r1-distill-llama-8b", "score": 12.1 }, + { + "modelRef": "ai2/olmo-3-32b-think", + "score": 12.1 + }, { "modelRef": "reka-ai/reka-flash", "score": 12.0 @@ -1167,14 +1359,6 @@ "modelRef": "alibaba/qwen-turbo", "score": 12.0 }, - { - "modelRef": "ai2/olmo-3-32b-think", - "score": 12.0 - }, - { - "modelRef": "ai2/olmo-3-1-32b-instruct", - "score": 12.0 - }, { "modelRef": "upstage/solar-mini", "score": 11.9 @@ -1183,22 +1367,22 @@ "modelRef": "meta/llama-3-2-instruct-90b-vision", "score": 11.9 }, + { + "modelRef": "meta/llama-3-1-instruct-8b", + "score": 11.8 + }, { "modelRef": "xai/grok-1", "score": 11.7 }, { - "modelRef": "meta/llama-3-1-instruct-8b", + "modelRef": "lg/exaone-4-0-32b", "score": 11.7 }, { "modelRef": "alibaba/qwen2-72b-instruct", "score": 11.7 }, - { - "modelRef": "lg/exaone-4-0-32b", - "score": 11.5 - }, { "modelRef": "mistral/ministral-3-3b", "score": 11.2 @@ -1212,21 +1396,13 @@ "score": 10.9 }, { - "modelRef": "meta/llama-3-2-instruct-11b-vision", - "score": 10.9 - }, - { - "modelRef": "azure/phi-4-mini", + "modelRef": "ai21-labs/jamba-1-7-large", "score": 10.9 }, { "modelRef": "ibm/granite-4-0-h-small", "score": 10.8 }, - { - "modelRef": "ibm/granite-3-3-8b-instruct", - "score": 10.8 - }, { "modelRef": "alibaba/qwen3-omni-30b-a3b-instruct", "score": 10.7 @@ -1255,32 +1431,36 @@ "modelRef": "ai2/olmo-2-32b", "score": 10.6 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 10.5 + }, { "modelRef": "google/gemini-1-5-flash-may-2024", "score": 10.5 }, { - "modelRef": "aws/nova-micro", - "score": 10.3 + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 10.5 }, { - "modelRef": "anthropic/claude-3-sonnet", - "score": 10.3 + "modelRef": "azure/phi-4", + "score": 10.4 }, { - "modelRef": "ai21-labs/jamba-reasoning-3b", + "modelRef": "google/gemma-3-27b", "score": 10.3 }, { - "modelRef": "mistral/mistral-small", - "score": 10.2 + "modelRef": "aws/nova-micro", + "score": 10.3 }, { - "modelRef": "meta/llama-3-instruct-70b", - "score": 10.2 + "modelRef": "anthropic/claude-3-sonnet", + "score": 10.3 }, { - "modelRef": "google/gemma-3-27b", + "modelRef": "mistral/mistral-small", "score": 10.2 }, { @@ -1311,6 +1491,10 @@ "modelRef": "mistral/mistral-large", "score": 9.9 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 9.9 + }, { "modelRef": "mistral/mistral-8x22b-instruct", "score": 9.8 @@ -1324,44 +1508,40 @@ "score": 9.7 }, { - "modelRef": "google/gemma-3n-e2b", - "score": 9.7 + "modelRef": "alibaba/qwen3-vl-4b-instruct", + "score": 9.6 }, { - "modelRef": "alibaba/qwen3-vl-4b-instruct", - "score": 9.5 + "modelRef": "ai21-labs/jamba-reasoning-3b", + "score": 9.6 }, { - "modelRef": "alibaba/qwen1-5-110b-chat", + "modelRef": "reka-ai/reka-flash-3", "score": 9.5 }, { - "modelRef": "ai2/olmo-3-7b-think", + "modelRef": "alibaba/qwen1-5-110b-chat", "score": 9.5 }, { - "modelRef": "anthropic/claude-3-haiku", - "score": 9.3 + "modelRef": "ai2/olmo-3-7b-think", + "score": 9.4 }, { "modelRef": "anthropic/claude-21", "score": 9.3 }, - { - "modelRef": "ai21-labs/jamba-1-7-large", - "score": 9.3 - }, { "modelRef": "ai2/olmo-2-7b", "score": 9.3 }, { - "modelRef": "ai2/molmo-7b-d", + "modelRef": "inclusionai/ling-mini-2-0", "score": 9.2 }, { - "modelRef": "meta/llama-3-2-instruct-1b", - "score": 9.1 + "modelRef": "ai2/molmo-7b-d", + "score": 9.2 }, { "modelRef": "deepseek/deepseek-v2", @@ -1387,6 +1567,10 @@ "modelRef": "mistral/mistral-medium", "score": 9.0 }, + { + "modelRef": "meta/llama-3-instruct-70b", + "score": 8.9 + }, { "modelRef": "snowflake/arctic-instruct", "score": 8.8 @@ -1404,17 +1588,13 @@ "score": 8.8 }, { - "modelRef": "meta/llama-3-instruct-8b", + "modelRef": "meta/llama-3-2-instruct-11b-vision", "score": 8.7 }, { "modelRef": "google/palm-2", "score": 8.6 }, - { - "modelRef": "google/gemma-3-1b", - "score": 8.6 - }, { "modelRef": "google/gemini-1-0-pro", "score": 8.5 @@ -1423,6 +1603,10 @@ "modelRef": "deepseek/deepseek-coder-v2-lite", "score": 8.5 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 8.4 + }, { "modelRef": "meta/llama-2-chat-70b", "score": 8.4 @@ -1432,11 +1616,11 @@ "score": 8.4 }, { - "modelRef": "google/gemma-3-270m", + "modelRef": "deepseek/deepseek-llm-67b-chat", "score": 8.4 }, { - "modelRef": "deepseek/deepseek-llm-67b-chat", + "modelRef": "azure/phi-4-mini", "score": 8.4 }, { @@ -1455,6 +1639,10 @@ "modelRef": "cohere/command-r-plus-04-2024", "score": 8.3 }, + { + "modelRef": "ai2/olmo-3-7b-instruct", + "score": 8.2 + }, { "modelRef": "liquidai/lfm2-5-1-2b-thinking", "score": 8.1 @@ -1464,7 +1652,7 @@ "score": 8.1 }, { - "modelRef": "ai2/olmo-3-7b-instruct", + "modelRef": "ai21-labs/jamba-1-7-mini", "score": 8.1 }, { @@ -1472,25 +1660,29 @@ "score": 8.0 }, { - "modelRef": "ibm/granite-4-0-h-nano-1b", + "modelRef": "liquidai/lfm2-2-6b", "score": 8.0 }, { - "modelRef": "ai21-labs/jamba-1-5-mini", + "modelRef": "ibm/granite-4-0-h-nano-1b", "score": 8.0 }, { - "modelRef": "liquidai/lfm2-2-6b", - "score": 7.9 + "modelRef": "alibaba/qwen3-1-7b-instruct-reasoning", + "score": 8.0 }, { - "modelRef": "alibaba/qwen3-1-7b-instruct-reasoning", - "score": 7.9 + "modelRef": "ai21-labs/jamba-1-5-mini", + "score": 8.0 }, { "modelRef": "ai21-labs/jamba-1-6-mini", "score": 7.9 }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 7.7 + }, { "modelRef": "mistral/mixtral-8x7b-instruct", "score": 7.7 @@ -1499,6 +1691,10 @@ "modelRef": "ibm/granite-4-0-micro", "score": 7.7 }, + { + "modelRef": "google/gemma-3-270m", + "score": 7.7 + }, { "modelRef": "nous-research/deephermes-3-llama-3-1-8b-preview", "score": 7.6 @@ -1528,31 +1724,39 @@ "score": 7.3 }, { - "modelRef": "ai21-labs/jamba-1-7-mini", + "modelRef": "ai2/molmo2-8b", "score": 7.3 }, { "modelRef": "liquidai/lfm2-8b-a1b", - "score": 6.8 + "score": 7.0 + }, + { + "modelRef": "ibm/granite-3-3-8b-instruct", + "score": 7.0 }, { "modelRef": "alibaba/qwen3-1-7b-instruct", "score": 6.8 }, { - "modelRef": "ibm/granite-4-0-350m", - "score": 6.6 + "modelRef": "alibaba/qwen3-0-6b-instruct-reasoning", + "score": 6.5 }, { - "modelRef": "liquidai/lfm2-1-2b", + "modelRef": "meta/llama-3-instruct-8b", "score": 6.4 }, { - "modelRef": "alibaba/qwen3-0-6b-instruct-reasoning", + "modelRef": "google/gemma-3n-e4b", "score": 6.4 }, { - "modelRef": "google/gemma-3n-e4b", + "modelRef": "meta/llama-3-2-instruct-1b", + "score": 6.3 + }, + { + "modelRef": "liquidai/lfm2-1-2b", "score": 6.3 }, { @@ -1561,15 +1765,35 @@ }, { "modelRef": "liquidai/lfm2-5-vl-1-6b", + "score": 6.2 + }, + { + "modelRef": "ibm/granite-4-0-350m", "score": 6.1 }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 5.9 + }, { "modelRef": "alibaba/qwen3-0-6b-instruct", - "score": 5.6 + "score": 5.7 + }, + { + "modelRef": "google/gemma-3-1b", + "score": 5.5 }, { "modelRef": "ibm/granite-4-0-h-350m", - "score": 5.3 + "score": 5.4 + }, + { + "modelRef": "google/gemma-3n-e2b", + "score": 4.8 + }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 4.7 } ] } \ No newline at end of file diff --git a/src/content/benchmarks/gpqa.json b/src/content/benchmarks/gpqa.json index b3ac076..9114e0e 100644 --- a/src/content/benchmarks/gpqa.json +++ b/src/content/benchmarks/gpqa.json @@ -7,7 +7,7 @@ "tags": [ "Knowledge" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -19,6 +19,18 @@ "initialWeight": 1000 }, "snapshot": [ + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 94.1 + }, + { + "modelRef": "openai/gpt-5-4", + "score": 92.0 + }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 91.5 + }, { "modelRef": "google/gemini-3-pro", "score": 90.8 @@ -39,10 +51,18 @@ "modelRef": "anthropic/claude-opus-4-6-adaptive", "score": 89.6 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 89.3 + }, { "modelRef": "google/gemini-3-pro-low", "score": 88.7 }, + { + "modelRef": "xai/grok-4-20", + "score": 88.5 + }, { "modelRef": "kimi/kimi-k2-5", "score": 87.9 @@ -51,6 +71,18 @@ "modelRef": "xai/grok-4", "score": 87.7 }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 87.5 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 87.5 + }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 87.4 + }, { "modelRef": "openai/gpt-5-1", "score": 87.3 @@ -59,6 +91,10 @@ "modelRef": "deepseek/deepseek-v3-2-speciale", "score": 87.1 }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 87.0 + }, { "modelRef": "anthropic/claude-opus-4-5-thinking", "score": 86.6 @@ -71,6 +107,10 @@ "modelRef": "alibaba/qwen3-max-thinking", "score": 86.1 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 86.1 + }, { "modelRef": "openai/gpt-5-1-codex", "score": 86.0 @@ -79,6 +119,14 @@ "modelRef": "zai/glm-4-7", "score": 85.9 }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 85.8 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 85.7 + }, { "modelRef": "openai/gpt-5", "score": 85.4 @@ -87,6 +135,18 @@ "modelRef": "xai/grok-4-1-fast-reasoning", "score": 85.3 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 84.9 + }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 84.8 + }, + { + "modelRef": "zai/glm-5-turbo", + "score": 84.7 + }, { "modelRef": "xai/grok-4-fast-reasoning", "score": 84.7 @@ -99,6 +159,10 @@ "modelRef": "openai/o3-pro", "score": 84.5 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 84.5 + }, { "modelRef": "google/gemini-2-5-pro", "score": 84.4 @@ -107,6 +171,10 @@ "modelRef": "openai/gpt-5-medium", "score": 84.2 }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 84.2 + }, { "modelRef": "deepseek/deepseek-v3-2-reasoning", "score": 84.0 @@ -135,10 +203,18 @@ "modelRef": "anthropic/claude-4-5-sonnet-thinking", "score": 83.4 }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 83.1 + }, { "modelRef": "minimax/minimax-m2-1", "score": 83.0 }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 82.8 + }, { "modelRef": "openai/gpt-5-mini", "score": 82.8 @@ -147,6 +223,18 @@ "modelRef": "openai/o3", "score": 82.7 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 82.7 + }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 82.3 + }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 82.2 + }, { "modelRef": "google/gemini-2-5-pro-05-06", "score": 82.2 @@ -155,6 +243,14 @@ "modelRef": "zai/glm-5", "score": 82.0 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 81.9 + }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 81.7 + }, { "modelRef": "openai/gpt-5-1-codex-mini", "score": 81.3 @@ -183,14 +279,30 @@ "modelRef": "openai/gpt-5-low", "score": 80.8 }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 80.6 + }, { "modelRef": "openai/gpt-5-mini-medium", "score": 80.3 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 80.0 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 79.9 + }, { "modelRef": "deepseek/deepseek-v3-2-reasoning-0925", "score": 79.7 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 79.7 + }, { "modelRef": "anthropic/claude-4-opus-thinking", "score": 79.6 @@ -219,6 +331,14 @@ "modelRef": "kimi/kimi-k2-5-non-reasoning", "score": 78.9 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 78.6 + }, + { + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 78.5 + }, { "modelRef": "aws/nova-2-0-pro-reasoning-medium", "score": 78.5 @@ -279,6 +399,18 @@ "modelRef": "alibaba/qwen3-vl-235b-a22b-reasoning", "score": 77.2 }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 77.1 + }, + { + "modelRef": "inception/mercury-2", + "score": 77.0 + }, + { + "modelRef": "mistral/mistral-small-4", + "score": 76.9 + }, { "modelRef": "deepcogito/cogito-v2-1-reasoning", "score": 76.8 @@ -319,6 +451,10 @@ "modelRef": "prime-intellect/intellect-3", "score": 76.1 }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 76.1 + }, { "modelRef": "aws/nova-2-0-omni-reasoning-medium", "score": 76.0 @@ -351,6 +487,10 @@ "modelRef": "openai/o3-mini", "score": 74.8 }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 74.8 + }, { "modelRef": "nvidia/llama-nemotron-super-49b-v1-5-reasoning", "score": 74.8 @@ -367,6 +507,10 @@ "modelRef": "lg/exaone-4-0-32b-reasoning", "score": 73.9 }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 73.8 + }, { "modelRef": "deepseek/deepseek-v3-2-0925", "score": 73.8 @@ -455,6 +599,10 @@ "modelRef": "alibaba/qwen3-vl-235b-a22b-instruct", "score": 71.2 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 71.2 + }, { "modelRef": "google/gemini-2-5-flash-lite-preview-09-2025-reasoning", "score": 70.9 @@ -520,7 +668,7 @@ "score": 69.3 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 69.0 }, { @@ -603,6 +751,10 @@ "modelRef": "alibaba/qwen3-4b-2507-instruct-reasoning", "score": 66.7 }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 66.6 + }, { "modelRef": "openai/gpt-4-1", "score": 66.6 @@ -675,6 +827,10 @@ "modelRef": "xai/grok-4-1-fast", "score": 63.7 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 63.6 + }, { "modelRef": "google/gemini-2-0-flash-experimental", "score": 63.6 @@ -683,6 +839,10 @@ "modelRef": "aws/nova-2-0-pro", "score": 63.6 }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 63.3 + }, { "modelRef": "zai/glm-4-6", "score": 63.2 @@ -747,6 +907,10 @@ "modelRef": "xai/grok-4-fast", "score": 60.6 }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 60.6 + }, { "modelRef": "alibaba/qwen3-14b-instruct-reasoning", "score": 60.4 @@ -759,6 +923,10 @@ "modelRef": "aws/nova-2-0-lite", "score": 60.3 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 60.1 + }, { "modelRef": "anthropic/claude-35-sonnet", "score": 59.9 @@ -839,6 +1007,10 @@ "modelRef": "mistral/ministral-3-14b", "score": 57.2 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 57.1 + }, { "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2-reasoning", "score": 57.0 @@ -863,6 +1035,10 @@ "modelRef": "anthropic/claude-35-sonnet-june-24", "score": 56.0 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 55.8 + }, { "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2", "score": 55.7 @@ -899,6 +1075,10 @@ "modelRef": "ai2/olmo-3-1-32b-instruct", "score": 53.9 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 53.8 + }, { "modelRef": "nous-research/hermes-4-llama-3-1-405b", "score": 53.6 @@ -967,6 +1147,10 @@ "modelRef": "alibaba/qwen3-30b-a3b-instruct", "score": 51.5 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 51.3 + }, { "modelRef": "openai/gpt-4-1-nano", "score": 51.2 @@ -1027,6 +1211,10 @@ "modelRef": "nvidia/llama-nemotron-super-49b-v1-5", "score": 48.1 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 47.4 + }, { "modelRef": "google/gemini-2-5-flash-lite", "score": 47.4 @@ -1067,6 +1255,10 @@ "modelRef": "mistral/mistral-small-3", "score": 46.2 }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 45.6 + }, { "modelRef": "mistral/mistral-small-3-1", "score": 45.4 @@ -1083,6 +1275,10 @@ "modelRef": "nvidia/nvidia-nemotron-nano-12b-v2-vl", "score": 43.9 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 43.8 + }, { "modelRef": "mistral/devstral-small-2505", "score": 43.4 @@ -1131,6 +1327,10 @@ "modelRef": "alibaba/qwen2-5-coder-32b-instruct", "score": 41.7 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 41.6 + }, { "modelRef": "ibm/granite-4-0-h-small", "score": 41.6 @@ -1203,6 +1403,10 @@ "modelRef": "meta/llama-3-instruct-70b", "score": 37.9 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 37.4 + }, { "modelRef": "google/gemini-1-5-pro-may-2024", "score": 37.1 @@ -1339,6 +1543,10 @@ "modelRef": "liquidai/lfm2-2-6b", "score": 30.6 }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 30.5 + }, { "modelRef": "mistral/mistral-small-2402", "score": 30.2 @@ -1407,6 +1615,10 @@ "modelRef": "google/gemini-1-0-pro", "score": 27.7 }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 27.2 + }, { "modelRef": "nous-research/deephermes-3-llama-3-1-8b-preview", "score": 27.0 @@ -1427,6 +1639,10 @@ "modelRef": "ibm/granite-4-0-h-350m", "score": 25.7 }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 25.6 + }, { "modelRef": "meta/llama-3-2-instruct-3b", "score": 25.5 @@ -1443,6 +1659,10 @@ "modelRef": "google/gemma-3-1b", "score": 23.7 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 23.6 + }, { "modelRef": "alibaba/qwen3-0-6b-instruct", "score": 23.1 @@ -1479,6 +1699,10 @@ "modelRef": "mistral/mistral-7b-instruct", "score": 17.7 }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 11.1 + }, { "modelRef": "deepseek/deepseek-r1-distill-qwen-1-5b", "score": 9.8 diff --git a/src/content/benchmarks/hle.json b/src/content/benchmarks/hle.json index e2860b4..5d66d63 100644 --- a/src/content/benchmarks/hle.json +++ b/src/content/benchmarks/hle.json @@ -8,7 +8,7 @@ "Knowledge", "Multi-Modal" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -20,6 +20,18 @@ "initialWeight": 1000 }, "snapshot": [ + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 44.7 + }, + { + "modelRef": "openai/gpt-5-4", + "score": 41.6 + }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 39.9 + }, { "modelRef": "google/gemini-3-pro", "score": 37.2 @@ -44,6 +56,14 @@ "modelRef": "kwaikat/kat-coder-pro-v1", "score": 33.4 }, + { + "modelRef": "xai/grok-4-20", + "score": 30.0 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 30.0 + }, { "modelRef": "kimi/kimi-k2-5", "score": 29.4 @@ -52,14 +72,34 @@ "modelRef": "anthropic/claude-opus-4-5-thinking", "score": 28.4 }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 28.3 + }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 28.1 + }, { "modelRef": "google/gemini-3-pro-low", "score": 27.6 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 27.3 + }, { "modelRef": "zai/glm-5", "score": 27.2 }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 26.6 + }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 26.5 + }, { "modelRef": "openai/gpt-5-1", "score": 26.5 @@ -80,6 +120,10 @@ "modelRef": "openai/gpt-5-codex", "score": 25.6 }, + { + "modelRef": "zai/glm-5-turbo", + "score": 25.4 + }, { "modelRef": "zai/glm-4-7", "score": 25.1 @@ -100,6 +144,14 @@ "modelRef": "openai/gpt-5-1-codex", "score": 23.4 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 23.4 + }, + { + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 22.5 + }, { "modelRef": "kimi/kimi-k2-thinking", "score": 22.3 @@ -112,6 +164,10 @@ "modelRef": "deepseek/deepseek-v3-2-reasoning", "score": 22.2 }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 22.2 + }, { "modelRef": "xiaomi/mimo-v2-flash-reasoning", "score": 21.1 @@ -128,10 +184,34 @@ "modelRef": "openai/o3", "score": 20.0 }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 19.9 + }, { "modelRef": "openai/gpt-5-mini", "score": 19.7 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 19.7 + }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 19.2 + }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 19.1 + }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 19.1 + }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 18.8 + }, { "modelRef": "anthropic/claude-opus-4-6", "score": 18.6 @@ -156,6 +236,10 @@ "modelRef": "anthropic/claude-4-5-sonnet-thinking", "score": 17.3 }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 17.1 + }, { "modelRef": "google/gemini-2-5-pro-03-25", "score": 17.1 @@ -168,6 +252,14 @@ "modelRef": "openai/gpt-5-1-codex-mini", "score": 16.9 }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 16.2 + }, + { + "modelRef": "inception/mercury-2", + "score": 15.5 + }, { "modelRef": "google/gemini-2-5-pro-05-06", "score": 15.4 @@ -184,6 +276,14 @@ "modelRef": "deepseek/deepseek-r1", "score": 14.9 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 14.8 + }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 14.7 + }, { "modelRef": "openai/gpt-5-mini-medium", "score": 14.6 @@ -204,6 +304,18 @@ "modelRef": "bytedance-seed/doubao-seed-code", "score": 13.3 }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 13.3 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 13.2 + }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 13.2 + }, { "modelRef": "lg/k-exaone", "score": 13.1 @@ -216,6 +328,10 @@ "modelRef": "anthropic/claude-opus-4-5", "score": 12.9 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 12.8 + }, { "modelRef": "google/gemini-2-5-flash-preview-09-2025-reasoning", "score": 12.7 @@ -292,6 +408,14 @@ "modelRef": "tii-uae/falcon-h1r-7b", "score": 10.8 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 10.8 + }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 10.6 + }, { "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507", "score": 10.6 @@ -313,7 +437,7 @@ "score": 10.3 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 10.2 }, { @@ -324,10 +448,18 @@ "modelRef": "inclusionai/ring-1t", "score": 10.2 }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 10.1 + }, { "modelRef": "alibaba/qwen3-vl-235b-a22b-reasoning", "score": 10.1 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 10.0 + }, { "modelRef": "servicenow/apriel-v1-6-15b-thinker", "score": 9.8 @@ -360,6 +492,10 @@ "modelRef": "alibaba/qwen3-vl-32b-reasoning", "score": 9.6 }, + { + "modelRef": "mistral/mistral-small-4", + "score": 9.5 + }, { "modelRef": "mistral/magistral-medium", "score": 9.5 @@ -420,6 +556,10 @@ "modelRef": "aws/nova-2-0-lite-reasoning-medium", "score": 8.6 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 8.6 + }, { "modelRef": "deepseek/deepseek-v3-1-terminus", "score": 8.4 @@ -464,6 +604,10 @@ "modelRef": "google/gemini-2-5-flash-preview-09-2025", "score": 7.8 }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 7.8 + }, { "modelRef": "openai/o1", "score": 7.7 @@ -484,6 +628,10 @@ "modelRef": "minimax/minimax-m1-40k", "score": 7.5 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 7.5 + }, { "modelRef": "perplexity/sonar", "score": 7.3 @@ -500,6 +648,10 @@ "modelRef": "alibaba/qwen3-next-80b-a3b-instruct", "score": 7.3 }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 7.2 + }, { "modelRef": "mistral/magistral-small", "score": 7.2 @@ -524,6 +676,10 @@ "modelRef": "upstage/solar-pro-2-reasoning", "score": 7.0 }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 7.0 + }, { "modelRef": "kimi/kimi-k2", "score": 7.0 @@ -604,6 +760,10 @@ "modelRef": "zai/glm-4-7-non-reasoning", "score": 6.1 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 6.1 + }, { "modelRef": "mistral/magistral-small-2509", "score": 6.1 @@ -616,6 +776,10 @@ "modelRef": "deepseek/deepseek-r1-distill-llama-70b", "score": 6.1 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 6.0 + }, { "modelRef": "ai2/olmo-3-1-32b-think", "score": 6.0 @@ -660,6 +824,14 @@ "modelRef": "upstage/solar-pro-2-preview-reasoning", "score": 5.7 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 5.7 + }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 5.7 + }, { "modelRef": "liquidai/lfm2-1-2b", "score": 5.7 @@ -680,6 +852,10 @@ "modelRef": "deepseek/deepseek-r1-qwen3-8b", "score": 5.6 }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 5.5 + }, { "modelRef": "naver/hyperclova-x-seed-think-32b", "score": 5.5 @@ -756,6 +932,10 @@ "modelRef": "deepseek/deepseek-v3-0324", "score": 5.2 }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 5.2 + }, { "modelRef": "aws/nova-2-0-pro-reasoning-low", "score": 5.2 @@ -832,6 +1012,10 @@ "modelRef": "xai/grok-4-1-fast", "score": 5.0 }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 5.0 + }, { "modelRef": "openai/gpt-5-mini-minimal", "score": 5.0 @@ -888,6 +1072,14 @@ "modelRef": "google/gemini-1-5-pro", "score": 4.9 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 4.9 + }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 4.9 + }, { "modelRef": "ai2/olmo-3-1-32b-instruct", "score": 4.9 @@ -896,6 +1088,10 @@ "modelRef": "openchat/openchat-35", "score": 4.8 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 4.8 + }, { "modelRef": "mistral/mistral-small-3-1", "score": 4.8 @@ -1056,6 +1252,10 @@ "modelRef": "mbzuai/k2-v2-medium", "score": 4.4 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 4.4 + }, { "modelRef": "google/gemma-3n-e4b", "score": 4.4 @@ -1132,6 +1332,10 @@ "modelRef": "alibaba/qwen3-14b-instruct-reasoning", "score": 4.3 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 4.2 + }, { "modelRef": "nous-research/hermes-4-llama-3-1-405b", "score": 4.2 @@ -1284,6 +1488,10 @@ "modelRef": "anthropic/claude-35-sonnet", "score": 3.9 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 3.9 + }, { "modelRef": "xai/grok-2-1212", "score": 3.8 @@ -1328,6 +1536,10 @@ "modelRef": "openai/gpt-4o-chatgpt", "score": 3.7 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 3.7 + }, { "modelRef": "mistral/devstral-small", "score": 3.7 @@ -1420,6 +1632,10 @@ "modelRef": "aws/nova-pro", "score": 3.4 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 3.3 + }, { "modelRef": "openai/gpt-4o", "score": 3.3 @@ -1467,6 +1683,14 @@ { "modelRef": "kimi/kimi-linear-48b-a3b-instruct", "score": 2.7 + }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 2.1 + }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 1.2 } ] } \ No newline at end of file diff --git a/src/content/benchmarks/ifbench.json b/src/content/benchmarks/ifbench.json index 5138909..2388aec 100644 --- a/src/content/benchmarks/ifbench.json +++ b/src/content/benchmarks/ifbench.json @@ -7,7 +7,7 @@ "tags": [ "Agent" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -19,6 +19,10 @@ "initialWeight": 1000 }, "snapshot": [ + { + "modelRef": "xai/grok-4-20", + "score": 82.9 + }, { "modelRef": "aws/nova-2-0-pro-reasoning-low", "score": 79.6 @@ -27,6 +31,10 @@ "modelRef": "aws/nova-2-0-pro-reasoning-medium", "score": 79.0 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 78.8 + }, { "modelRef": "google/gemini-3-flash-reasoning", "score": 78.0 @@ -35,10 +43,38 @@ "modelRef": "openai/gpt-5-2-codex", "score": 77.6 }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 77.2 + }, + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 77.1 + }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 75.9 + }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 75.7 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 75.7 + }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 75.6 + }, { "modelRef": "openai/gpt-5-mini", "score": 75.4 }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 75.4 + }, { "modelRef": "openai/gpt-5-2", "score": 75.4 @@ -47,6 +83,18 @@ "modelRef": "openai/gpt-5-codex", "score": 74.1 }, + { + "modelRef": "openai/gpt-5-4", + "score": 73.9 + }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 73.3 + }, + { + "modelRef": "zai/glm-5-turbo", + "score": 73.2 + }, { "modelRef": "openai/gpt-5", "score": 73.1 @@ -55,6 +103,10 @@ "modelRef": "openai/gpt-5-1", "score": 72.9 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 72.5 + }, { "modelRef": "zai/glm-5", "score": 72.3 @@ -67,6 +119,14 @@ "modelRef": "xiaomi/mimo-v2-0206", "score": 71.8 }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 71.6 + }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 71.5 + }, { "modelRef": "openai/o3", "score": 71.4 @@ -107,6 +167,10 @@ "modelRef": "minimax/minimax-m2-1", "score": 69.9 }, + { + "modelRef": "inception/mercury-2", + "score": 69.8 + }, { "modelRef": "servicenow/apriel-v1-6-15b-thinker", "score": 69.1 @@ -115,6 +179,10 @@ "modelRef": "openai/gpt-oss-120b", "score": 69.0 }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 68.8 + }, { "modelRef": "openai/o4-mini", "score": 68.7 @@ -147,6 +215,10 @@ "modelRef": "openai/o3-mini-high", "score": 67.1 }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 66.7 + }, { "modelRef": "openai/gpt-5-low", "score": 66.6 @@ -171,10 +243,22 @@ "modelRef": "openai/gpt-oss-20b", "score": 65.1 }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 64.8 + }, { "modelRef": "lg/k-exaone", "score": 64.7 }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 64.6 + }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 64.4 + }, { "modelRef": "xiaomi/mimo-v2-flash-reasoning", "score": 64.2 @@ -223,6 +307,10 @@ "modelRef": "openai/gpt-oss-120b-low", "score": 58.3 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 58.2 + }, { "modelRef": "anthropic/claude-opus-4-5-thinking", "score": 58.0 @@ -247,6 +335,10 @@ "modelRef": "deepseek/deepseek-v3-1-terminus-reasoning", "score": 57.0 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 56.6 + }, { "modelRef": "alibaba/qwen3-vl-235b-a22b-reasoning", "score": 56.5 @@ -255,6 +347,10 @@ "modelRef": "anthropic/claude-4-1-opus-thinking", "score": 55.4 }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 55.2 + }, { "modelRef": "mbzuai/k2-v2-medium", "score": 55.1 @@ -271,6 +367,10 @@ "modelRef": "zai/glm-4-7-non-reasoning", "score": 54.6 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 54.6 + }, { "modelRef": "tii-uae/falcon-h1r-7b", "score": 54.4 @@ -295,6 +395,10 @@ "modelRef": "anthropic/claude-4-opus-thinking", "score": 53.7 }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 53.5 + }, { "modelRef": "anthropic/claude-opus-4-6-adaptive", "score": 53.1 @@ -319,6 +423,14 @@ "modelRef": "aws/nova-2-0-pro", "score": 52.0 }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 52.0 + }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 51.6 + }, { "modelRef": "bytedance-seed/doubao-seed-code", "score": 51.4 @@ -327,6 +439,10 @@ "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507-reasoning", "score": 51.2 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 50.8 + }, { "modelRef": "alibaba/qwen3-30b-a3b-2507-reasoning", "score": 50.7 @@ -340,7 +456,7 @@ "score": 50.3 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 50.2 }, { @@ -371,18 +487,34 @@ "modelRef": "google/gemini-2-5-pro", "score": 48.7 }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 48.4 + }, { "modelRef": "anthropic/claude-3-7-sonnet-thinking", "score": 48.3 }, + { + "modelRef": "mistral/mistral-small-4", + "score": 48.2 + }, { "modelRef": "alibaba/qwen3-max-preview", "score": 48.0 }, + { + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 47.8 + }, { "modelRef": "openai/gpt-5-2-non-reasoning", "score": 47.4 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 47.1 + }, { "modelRef": "meta/llama-3-3-instruct-70b", "score": 47.1 @@ -391,6 +523,10 @@ "modelRef": "xai/grok-3", "score": 46.9 }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 46.9 + }, { "modelRef": "zai/glm-4-7-flash-non-reasoning", "score": 46.3 @@ -407,6 +543,10 @@ "modelRef": "xai/grok-3-mini-reasoning", "score": 45.9 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 45.9 + }, { "modelRef": "openai/gpt-5-minimal", "score": 45.6 @@ -439,6 +579,10 @@ "modelRef": "anthropic/claude-opus-4-6", "score": 44.6 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 44.5 + }, { "modelRef": "mistral/magistral-small-2509", "score": 44.4 @@ -487,6 +631,10 @@ "modelRef": "openai/gpt-5-1-non-reasoning", "score": 43.2 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 43.1 + }, { "modelRef": "deepseek/deepseek-v3-2-0925", "score": 43.1 @@ -519,6 +667,10 @@ "modelRef": "alibaba/qwen3-vl-235b-a22b-instruct", "score": 42.7 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 42.4 + }, { "modelRef": "anthropic/claude-4-5-haiku", "score": 42.0 @@ -575,6 +727,10 @@ "modelRef": "deepseek/deepseek-v3-1-terminus", "score": 41.2 }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 41.2 + }, { "modelRef": "aws/nova-2-0-omni", "score": 41.1 @@ -663,6 +819,10 @@ "modelRef": "deepseek/deepseek-r1-0120", "score": 39.0 }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 38.8 + }, { "modelRef": "alibaba/qwq-32b", "score": 38.8 @@ -703,6 +863,10 @@ "modelRef": "deepseek/deepseek-v3-1", "score": 37.8 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 37.8 + }, { "modelRef": "xai/grok-4-fast", "score": 37.7 @@ -771,10 +935,18 @@ "modelRef": "aws/nova-premier", "score": 36.2 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 36.1 + }, { "modelRef": "openai/gpt-4o-2024-08-06", "score": 36.0 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 35.4 + }, { "modelRef": "alibaba/qwen3-coder-next", "score": 35.2 @@ -803,6 +975,10 @@ "modelRef": "mistral/pixtral-large-2411", "score": 34.5 }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 34.4 + }, { "modelRef": "meta/llama-3-1-instruct-70b", "score": 34.4 @@ -847,6 +1023,10 @@ "modelRef": "alibaba/qwen3-4b-2507-instruct", "score": 33.5 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 33.3 + }, { "modelRef": "liquidai/lfm2-5-vl-1-6b", "score": 33.1 @@ -863,10 +1043,18 @@ "modelRef": "nvidia/llama-nemotron-super-49b-v1-5", "score": 32.9 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 32.8 + }, { "modelRef": "ai2/olmo-3-7b-instruct", "score": 32.8 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 32.7 + }, { "modelRef": "nous-research/hermes-4-llama-3-1-405b-reasoning", "score": 32.7 @@ -903,6 +1091,10 @@ "modelRef": "alibaba/qwen3-30b-a3b-instruct", "score": 31.9 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 31.8 + }, { "modelRef": "google/gemma-3-27b", "score": 31.8 @@ -927,6 +1119,10 @@ "modelRef": "google/gemini-2-5-flash-lite", "score": 31.5 }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 31.5 + }, { "modelRef": "alibaba/qwen3-32b-instruct", "score": 31.5 @@ -987,6 +1183,10 @@ "modelRef": "mistral/ministral-3-8b", "score": 29.1 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 29.1 + }, { "modelRef": "nous-research/hermes-4-llama-3-1-70b", "score": 29.0 @@ -1043,6 +1243,10 @@ "modelRef": "mistral/ministral-3-3b", "score": 26.8 }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 26.5 + }, { "modelRef": "mistral/mistral-small-3", "score": 26.4 @@ -1059,6 +1263,10 @@ "modelRef": "ibm/granite-4-0-h-nano-1b", "score": 26.2 }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 25.9 + }, { "modelRef": "nvidia/nvidia-nemotron-nano-12b-v2-vl", "score": 25.9 @@ -1123,6 +1331,10 @@ "modelRef": "meta/llama-3-2-instruct-1b", "score": 22.8 }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 22.4 + }, { "modelRef": "ibm/granite-3-3-8b-instruct", "score": 22.4 @@ -1143,6 +1355,14 @@ "modelRef": "alibaba/qwen3-0-6b-instruct", "score": 21.9 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 21.6 + }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 21.5 + }, { "modelRef": "azure/phi-4-mini", "score": 21.1 @@ -1155,6 +1375,10 @@ "modelRef": "ibm/granite-4-0-nano-1b", "score": 20.5 }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 20.1 + }, { "modelRef": "mistral/mistral-7b-instruct", "score": 19.9 diff --git a/src/content/benchmarks/lcr.json b/src/content/benchmarks/lcr.json index 4bdc529..b191cf0 100644 --- a/src/content/benchmarks/lcr.json +++ b/src/content/benchmarks/lcr.json @@ -8,7 +8,7 @@ "Long-Context", "Reasoning" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -32,6 +32,14 @@ "modelRef": "openai/gpt-5-1", "score": 75.0 }, + { + "modelRef": "openai/gpt-5-4", + "score": 74.0 + }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 74.0 + }, { "modelRef": "kwaikat/kat-coder-pro-v1", "score": 74.0 @@ -48,10 +56,18 @@ "modelRef": "openai/gpt-5-2", "score": 72.7 }, + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 72.7 + }, { "modelRef": "google/gemini-3-pro", "score": 70.7 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 70.7 + }, { "modelRef": "anthropic/claude-opus-4-6-adaptive", "score": 70.7 @@ -64,6 +80,10 @@ "modelRef": "openai/o3", "score": 69.3 }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 69.3 + }, { "modelRef": "openai/gpt-5-codex", "score": 69.0 @@ -72,6 +92,10 @@ "modelRef": "deepseek/deepseek-v3-2-reasoning-0925", "score": 69.0 }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 68.7 + }, { "modelRef": "xai/grok-4-1-fast-reasoning", "score": 68.0 @@ -92,10 +116,22 @@ "modelRef": "google/gemini-3-pro-low", "score": 67.3 }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 67.3 + }, { "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507-reasoning", "score": 67.0 }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 66.7 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 66.7 + }, { "modelRef": "kimi/kimi-k2-thinking", "score": 66.3 @@ -112,6 +148,14 @@ "modelRef": "openai/gpt-5-mini-medium", "score": 66.0 }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 66.0 + }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 66.0 + }, { "modelRef": "google/gemini-2-5-pro", "score": 66.0 @@ -124,10 +168,18 @@ "modelRef": "anthropic/claude-4-5-sonnet-thinking", "score": 65.7 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 65.7 + }, { "modelRef": "kimi/kimi-k2-5", "score": 65.3 }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 65.3 + }, { "modelRef": "bytedance-seed/doubao-seed-code", "score": 65.3 @@ -184,6 +236,10 @@ "modelRef": "openai/gpt-5-1-codex-mini", "score": 62.7 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 62.7 + }, { "modelRef": "google/gemini-2-5-flash-reasoning", "score": 61.7 @@ -192,6 +248,10 @@ "modelRef": "aws/nova-2-0-pro-reasoning-low", "score": 61.7 }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 61.3 + }, { "modelRef": "openai/gpt-4-1", "score": 61.0 @@ -200,6 +260,14 @@ "modelRef": "minimax/minimax-m2", "score": 61.0 }, + { + "modelRef": "zai/glm-5-turbo", + "score": 60.7 + }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 60.7 + }, { "modelRef": "anthropic/claude-3-7-sonnet-thinking", "score": 60.7 @@ -208,6 +276,10 @@ "modelRef": "alibaba/qwen3-next-80b-a3b-reasoning", "score": 60.3 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 60.0 + }, { "modelRef": "openai/o1", "score": 59.3 @@ -216,6 +288,10 @@ "modelRef": "deepseek/deepseek-v3-2-speciale", "score": 59.3 }, + { + "modelRef": "xai/grok-4-20", + "score": 59.0 + }, { "modelRef": "minimax/minimax-m2-1", "score": 59.0 @@ -228,6 +304,10 @@ "modelRef": "google/gemini-2-5-flash-lite-preview-09-2025-reasoning", "score": 59.0 }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 59.0 + }, { "modelRef": "alibaba/qwen3-30b-a3b-2507-reasoning", "score": 59.0 @@ -236,6 +316,10 @@ "modelRef": "openai/gpt-5-low", "score": 58.7 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 58.7 + }, { "modelRef": "alibaba/qwen3-vl-235b-a22b-reasoning", "score": 58.7 @@ -248,26 +332,54 @@ "modelRef": "anthropic/claude-opus-4-6", "score": 58.3 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 58.0 + }, { "modelRef": "bytedance-seed/seed-oss-36b-instruct", "score": 57.7 }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 57.7 + }, { "modelRef": "alibaba/qwen3-max-thinking-preview", "score": 57.7 }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 57.3 + }, { "modelRef": "google/gemini-2-5-flash-preview-09-2025", "score": 56.7 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 56.0 + }, { "modelRef": "lg/k-exaone", "score": 55.7 }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 55.7 + }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 55.7 + }, { "modelRef": "alibaba/qwen3-vl-32b-reasoning", "score": 55.3 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 55.3 + }, { "modelRef": "openai/o4-mini", "score": 55.0 @@ -380,6 +492,10 @@ "modelRef": "google/gemini-2-5-flash-lite-preview-09-2025", "score": 48.0 }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 47.3 + }, { "modelRef": "lg/k-exaone-non-reasoning", "score": 47.0 @@ -404,6 +520,10 @@ "modelRef": "deepseek/deepseek-v3-1", "score": 45.0 }, + { + "modelRef": "mistral/mistral-small-4", + "score": 44.7 + }, { "modelRef": "anthropic/claude-4-sonnet", "score": 44.3 @@ -428,6 +548,10 @@ "modelRef": "deepseek/deepseek-v3-1-terminus", "score": 43.3 }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 43.0 + }, { "modelRef": "deepseek/deepseek-v3-2-0925", "score": 43.0 @@ -484,14 +608,26 @@ "modelRef": "openai/gpt-5-2-non-reasoning", "score": 38.0 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 38.0 + }, { "modelRef": "alibaba/qwen3-4b-2507-instruct-reasoning", "score": 37.7 }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 37.0 + }, { "modelRef": "zai/glm-4-7-non-reasoning", "score": 36.3 }, + { + "modelRef": "inception/mercury-2", + "score": 36.3 + }, { "modelRef": "upstage/solar-open-100b-reasoning", "score": 36.0 @@ -572,6 +708,10 @@ "modelRef": "openai/gpt-oss-20b", "score": 30.7 }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 30.3 + }, { "modelRef": "mistral/devstral-2", "score": 30.0 @@ -600,6 +740,10 @@ "modelRef": "aws/nova-2-0-pro", "score": 28.3 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 28.3 + }, { "modelRef": "mistral/mistral-medium-3", "score": 28.0 @@ -620,6 +764,10 @@ "modelRef": "meta/llama-4-scout", "score": 25.8 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 25.7 + }, { "modelRef": "kimi/kimi-linear-48b-a3b-instruct", "score": 25.7 @@ -632,6 +780,10 @@ "modelRef": "alibaba/qwq-32b", "score": 25.0 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 24.7 + }, { "modelRef": "meta/llama-3-1-instruct-405b", "score": 24.3 @@ -648,6 +800,10 @@ "modelRef": "alibaba/qwen3-vl-30b-a3b-instruct", "score": 23.7 }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 23.7 + }, { "modelRef": "anthropic/claude-3-5-haiku", "score": 23.3 @@ -680,6 +836,10 @@ "modelRef": "deepcogito/cogito-v2-1-reasoning", "score": 21.7 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 21.3 + }, { "modelRef": "alibaba/qwen3-vl-4b-reasoning", "score": 21.3 @@ -692,6 +852,10 @@ "modelRef": "inclusionai/ring-flash-2-0", "score": 21.0 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 21.0 + }, { "modelRef": "nous-research/hermes-4-llama-3-1-405b-reasoning", "score": 20.7 @@ -732,6 +896,10 @@ "modelRef": "aws/nova-pro", "score": 19.0 }, + { + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 18.0 + }, { "modelRef": "cohere/command-a", "score": 18.0 @@ -768,6 +936,10 @@ "modelRef": "mistral/devstral-small", "score": 17.0 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 16.7 + }, { "modelRef": "mistral/magistral-small-2509", "score": 16.3 @@ -792,6 +964,10 @@ "modelRef": "zai/glm-4-7-flash-non-reasoning", "score": 14.7 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 14.7 + }, { "modelRef": "lg/exaone-4-0-32b-reasoning", "score": 14.0 @@ -800,6 +976,10 @@ "modelRef": "azure/phi-4-mini", "score": 13.7 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 13.7 + }, { "modelRef": "motif-technologies/motif-2-12-7b", "score": 13.0 @@ -836,6 +1016,10 @@ "modelRef": "nvidia/llama-3-3-nemotron-super-49b", "score": 11.3 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 11.0 + }, { "modelRef": "korea-telecom/midm-250-pro-rsnsft", "score": 11.0 @@ -912,6 +1096,10 @@ "modelRef": "baidu/ernie-5-0-thinking-preview", "score": 6.7 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 6.7 + }, { "modelRef": "meta/llama-3-1-instruct-70b", "score": 6.3 @@ -932,6 +1120,10 @@ "modelRef": "mistral/mistral-large-2", "score": 5.3 }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 5.3 + }, { "modelRef": "meta/llama-3-2-instruct-1b", "score": 5.0 @@ -989,7 +1181,27 @@ "score": 0.0 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 0.0 + }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 0.0 + }, + { + "modelRef": "stepfun/step-3-vl-10b", + "score": 0.0 + }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 0.0 + }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 0.0 + }, + { + "modelRef": "sarvam/sarvam-105b", "score": 0.0 }, { @@ -1004,6 +1216,10 @@ "modelRef": "nvidia/llama-3-1-nemotron-nano-4b-reasoning", "score": 0.0 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 0.0 + }, { "modelRef": "mistral/mistral-small-3", "score": 0.0 @@ -1044,6 +1260,10 @@ "modelRef": "liquidai/lfm2-5-1-2b-instruct", "score": 0.0 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 0.0 + }, { "modelRef": "liquidai/lfm2-2-6b", "score": 0.0 @@ -1088,6 +1308,10 @@ "modelRef": "deepseek/deepseek-r1-distill-llama-8b", "score": 0.0 }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 0.0 + }, { "modelRef": "azure/phi-4", "score": 0.0 diff --git a/src/content/benchmarks/livecodebench.json b/src/content/benchmarks/livecodebench.json index 897d1c8..3e0073f 100644 --- a/src/content/benchmarks/livecodebench.json +++ b/src/content/benchmarks/livecodebench.json @@ -7,7 +7,7 @@ "tags": [ "Coding" ], - "lastUpdated": "2026-01-28", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -903,6 +903,10 @@ "modelRef": "meta/llama-4-scout", "score": 29.9 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 29.5 + }, { "modelRef": "perplexity/sonar", "score": 29.5 diff --git a/src/content/benchmarks/math_500.json b/src/content/benchmarks/math_500.json index e6fb578..8b3cb9d 100644 --- a/src/content/benchmarks/math_500.json +++ b/src/content/benchmarks/math_500.json @@ -7,7 +7,7 @@ "tags": [ "Reasoning" ], - "lastUpdated": "2026-01-25", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -403,6 +403,10 @@ "modelRef": "openai/gpt-4-1-nano", "score": 84.8 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 84.7 + }, { "modelRef": "meta/llama-4-scout", "score": 84.4 diff --git a/src/content/benchmarks/mmlu_pro.json b/src/content/benchmarks/mmlu_pro.json index dfe86f8..66d2e14 100644 --- a/src/content/benchmarks/mmlu_pro.json +++ b/src/content/benchmarks/mmlu_pro.json @@ -7,7 +7,7 @@ "tags": [ "Knowledge" ], - "lastUpdated": "2026-01-28", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -907,6 +907,10 @@ "modelRef": "alibaba/qwen2-5-32b-instruct", "score": 69.7 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 69.6 + }, { "modelRef": "anthropic/claude-3-opus", "score": 69.6 diff --git a/src/content/benchmarks/scicode.json b/src/content/benchmarks/scicode.json index 063fdfe..31e2371 100644 --- a/src/content/benchmarks/scicode.json +++ b/src/content/benchmarks/scicode.json @@ -8,7 +8,7 @@ "Reasoning", "Knowledge" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -20,6 +20,14 @@ "initialWeight": 1000 }, "snapshot": [ + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 58.9 + }, + { + "modelRef": "openai/gpt-5-4", + "score": 56.6 + }, { "modelRef": "google/gemini-3-pro", "score": 56.1 @@ -28,6 +36,10 @@ "modelRef": "openai/gpt-5-2-codex", "score": 54.6 }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 53.2 + }, { "modelRef": "openai/gpt-5-2", "score": 52.1 @@ -40,6 +52,10 @@ "modelRef": "google/gemini-3-flash-reasoning", "score": 50.6 }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 49.9 + }, { "modelRef": "google/gemini-3-pro-low", "score": 49.9 @@ -56,10 +72,30 @@ "modelRef": "kimi/kimi-k2-5", "score": 49.0 }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 47.1 + }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 47.0 + }, { "modelRef": "anthropic/claude-opus-4-5", "score": 47.0 }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 46.9 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 46.9 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 46.8 + }, { "modelRef": "openai/o4-mini", "score": 46.5 @@ -84,6 +120,10 @@ "modelRef": "zai/glm-4-7", "score": 45.1 }, + { + "modelRef": "xai/grok-4-20", + "score": 44.7 + }, { "modelRef": "anthropic/claude-4-5-sonnet-thinking", "score": 44.7 @@ -96,10 +136,22 @@ "modelRef": "xai/grok-4-1-fast-reasoning", "score": 44.2 }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 44.2 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 44.1 + }, { "modelRef": "deepseek/deepseek-v3-2-speciale", "score": 44.0 }, + { + "modelRef": "zai/glm-5-turbo", + "score": 43.6 + }, { "modelRef": "openai/gpt-5-1", "score": 43.3 @@ -132,6 +184,14 @@ "modelRef": "openai/gpt-5-1-codex-mini", "score": 42.6 }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 42.6 + }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 42.5 + }, { "modelRef": "kimi/kimi-k2-thinking", "score": 42.4 @@ -140,6 +200,18 @@ "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507-reasoning", "score": 42.4 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 42.0 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 42.0 + }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 41.9 + }, { "modelRef": "google/gemini-2-5-pro-05-06", "score": 41.6 @@ -148,6 +220,10 @@ "modelRef": "openai/gpt-5-medium", "score": 41.1 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 41.1 + }, { "modelRef": "openai/o3", "score": 41.0 @@ -192,6 +268,10 @@ "modelRef": "google/gemini-2-5-flash-preview-09-2025-reasoning", "score": 40.5 }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 40.4 + }, { "modelRef": "openai/gpt-5-2-non-reasoning", "score": 40.4 @@ -240,6 +320,10 @@ "modelRef": "anthropic/claude-4-opus-thinking", "score": 39.8 }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 39.6 + }, { "modelRef": "kimi/kimi-k2-5-non-reasoning", "score": 39.6 @@ -248,6 +332,10 @@ "modelRef": "google/gemini-2-5-pro-03-25", "score": 39.5 }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 39.5 + }, { "modelRef": "xiaomi/mimo-v2-flash-reasoning", "score": 39.4 @@ -292,6 +380,10 @@ "modelRef": "alibaba/qwen3-next-80b-a3b-reasoning", "score": 38.8 }, + { + "modelRef": "inception/mercury-2", + "score": 38.7 + }, { "modelRef": "deepseek/deepseek-v3-2", "score": 38.7 @@ -308,6 +400,14 @@ "modelRef": "zai/glm-4-6-reasoning", "score": 38.4 }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 38.4 + }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 38.3 + }, { "modelRef": "xiaomi/mimo-v2-0206", "score": 38.3 @@ -320,6 +420,10 @@ "modelRef": "openai/gpt-4-1", "score": 38.1 }, + { + "modelRef": "mistral/mistral-small-4", + "score": 38.0 + }, { "modelRef": "openai/gpt-5-chatgpt", "score": 37.8 @@ -332,6 +436,10 @@ "modelRef": "deepseek/deepseek-v3-2-reasoning-0925", "score": 37.7 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 37.7 + }, { "modelRef": "deepseek/deepseek-r1-distill-qwen-32b", "score": 37.6 @@ -376,6 +484,10 @@ "modelRef": "aws/nova-2-0-lite-reasoning-medium", "score": 36.8 }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 36.7 + }, { "modelRef": "inclusionai/ring-1t", "score": 36.7 @@ -384,6 +496,10 @@ "modelRef": "deepseek/deepseek-v3-1", "score": 36.7 }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 36.7 + }, { "modelRef": "openai/gpt-5-nano", "score": 36.6 @@ -428,6 +544,10 @@ "modelRef": "openai/gpt-oss-120b-low", "score": 36.0 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 36.0 + }, { "modelRef": "alibaba/qwen3-235b-a22b-instruct-2507", "score": 36.0 @@ -464,6 +584,10 @@ "modelRef": "lg/k-exaone", "score": 35.6 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 35.6 + }, { "modelRef": "zai/glm-4-7-non-reasoning", "score": 35.4 @@ -476,6 +600,10 @@ "modelRef": "alibaba/qwen3-32b-instruct-reasoning", "score": 35.4 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 35.2 + }, { "modelRef": "mistral/magistral-small-2509", "score": 35.2 @@ -616,6 +744,10 @@ "modelRef": "alibaba/qwen3-coder-next", "score": 32.3 }, + { + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 32.2 + }, { "modelRef": "deepseek/deepseek-v3-1-terminus", "score": 32.1 @@ -645,7 +777,7 @@ "score": 31.2 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 31.1 }, { @@ -728,6 +860,10 @@ "modelRef": "mistral/devstral-medium", "score": 29.4 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 29.3 + }, { "modelRef": "ai2/olmo-3-1-32b-think", "score": 29.3 @@ -792,6 +928,10 @@ "modelRef": "naver/hyperclova-x-seed-think-32b", "score": 28.4 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 28.4 + }, { "modelRef": "nvidia/llama-3-3-nemotron-super-49b-reasoning", "score": 28.2 @@ -800,6 +940,10 @@ "modelRef": "motif-technologies/motif-2-12-7b", "score": 28.2 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 28.1 + }, { "modelRef": "cohere/command-a", "score": 28.1 @@ -828,6 +972,14 @@ "modelRef": "nous-research/hermes-4-llama-3-1-70b", "score": 27.7 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 27.7 + }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 27.5 + }, { "modelRef": "google/gemini-1-5-pro-may-2024", "score": 27.4 @@ -876,6 +1028,10 @@ "modelRef": "alibaba/qwen2-5-72b-instruct", "score": 26.7 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 26.6 + }, { "modelRef": "mistral/mistral-small-3-1", "score": 26.5 @@ -884,6 +1040,10 @@ "modelRef": "alibaba/qwen3-14b-instruct", "score": 26.5 }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 26.4 + }, { "modelRef": "mistral/mistral-small-3-2", "score": 26.4 @@ -1108,6 +1268,10 @@ "modelRef": "google/gemini-2-5-flash-lite-reasoning", "score": 19.3 }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 19.2 + }, { "modelRef": "meta/llama-3-instruct-70b", "score": 18.9 @@ -1140,6 +1304,10 @@ "modelRef": "ai21-labs/jamba-1-6-large", "score": 18.4 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 18.3 + }, { "modelRef": "google/gemini-1-5-flash-may-2024", "score": 18.1 @@ -1148,6 +1316,14 @@ "modelRef": "alibaba/qwen3-4b-2507-instruct", "score": 18.1 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 17.8 + }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 17.8 + }, { "modelRef": "google/gemini-2-5-flash-lite", "score": 17.7 @@ -1156,6 +1332,10 @@ "modelRef": "nvidia/nvidia-nemotron-nano-12b-v2-vl", "score": 17.6 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 17.4 + }, { "modelRef": "google/gemma-3-12b", "score": 17.4 @@ -1192,10 +1372,18 @@ "modelRef": "upstage/solar-pro-2-preview-reasoning", "score": 16.4 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 16.4 + }, { "modelRef": "ai21-labs/jamba-1-5-large", "score": 16.3 }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 16.1 + }, { "modelRef": "mistral/mistral-small", "score": 15.6 @@ -1280,6 +1468,10 @@ "modelRef": "azure/phi-4-multimodal", "score": 11.0 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 10.9 + }, { "modelRef": "azure/phi-4-mini", "score": 10.8 @@ -1352,6 +1544,10 @@ "modelRef": "google/gemma-3-4b", "score": 7.3 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 7.2 + }, { "modelRef": "liquidai/lfm-40b", "score": 7.1 @@ -1376,6 +1572,10 @@ "modelRef": "ai21-labs/jamba-reasoning-3b", "score": 5.9 }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 5.7 + }, { "modelRef": "meta/llama-3-2-instruct-3b", "score": 5.2 @@ -1392,6 +1592,10 @@ "modelRef": "liquidai/lfm2-5-1-2b-thinking", "score": 4.2 }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 4.1 + }, { "modelRef": "alibaba/qwen3-0-6b-instruct", "score": 4.1 @@ -1404,6 +1608,10 @@ "modelRef": "ai2/olmo-2-7b", "score": 3.7 }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 3.6 + }, { "modelRef": "ai2/molmo-7b-d", "score": 3.6 @@ -1416,10 +1624,18 @@ "modelRef": "liquidai/lfm2-5-vl-1-6b", "score": 3.0 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 2.9 + }, { "modelRef": "mistral/mixtral-8x7b-instruct", "score": 2.8 }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 2.8 + }, { "modelRef": "alibaba/qwen3-0-6b-instruct-reasoning", "score": 2.8 @@ -1463,6 +1679,10 @@ { "modelRef": "google/gemma-3-270m", "score": 0.0 + }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 0.0 } ] } \ No newline at end of file diff --git a/src/content/benchmarks/tau2.json b/src/content/benchmarks/tau2.json index 44f9153..7f4e115 100644 --- a/src/content/benchmarks/tau2.json +++ b/src/content/benchmarks/tau2.json @@ -8,7 +8,7 @@ "Reasoning", "Knowledge" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -24,10 +24,22 @@ "modelRef": "zai/glm-4-7-flash", "score": 98.8 }, + { + "modelRef": "zai/glm-5-turbo", + "score": 98.5 + }, { "modelRef": "zai/glm-5", "score": 98.2 }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 97.4 + }, + { + "modelRef": "xai/grok-4-20", + "score": 96.5 + }, { "modelRef": "zai/glm-4-7", "score": 95.9 @@ -36,14 +48,42 @@ "modelRef": "kimi/kimi-k2-5", "score": 95.9 }, + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 95.6 + }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 95.6 + }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 95.3 + }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 95.0 + }, { "modelRef": "xiaomi/mimo-v2-flash-reasoning", "score": 95.0 }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 94.4 + }, { "modelRef": "zai/glm-4-7-non-reasoning", "score": 94.2 }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 93.9 + }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 93.6 + }, { "modelRef": "xiaomi/mimo-v2-0206", "score": 93.3 @@ -52,6 +92,10 @@ "modelRef": "xai/grok-4-1-fast-reasoning", "score": 93.3 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 93.3 + }, { "modelRef": "kimi/kimi-k2-thinking", "score": 93.0 @@ -68,10 +112,26 @@ "modelRef": "anthropic/claude-opus-4-6-adaptive", "score": 92.1 }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 92.1 + }, { "modelRef": "zai/glm-4-7-flash-non-reasoning", "score": 91.8 }, + { + "modelRef": "openai/gpt-5-4", + "score": 91.5 + }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 91.2 + }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 90.9 + }, { "modelRef": "deepseek/deepseek-v3-2-reasoning", "score": 90.6 @@ -88,10 +148,18 @@ "modelRef": "anthropic/claude-opus-4-5-thinking", "score": 89.5 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 89.2 + }, { "modelRef": "kwaikat/kat-coder-pro-v1", "score": 88.6 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 87.7 + }, { "modelRef": "naver/hyperclova-x-seed-think-32b", "score": 87.4 @@ -100,6 +168,10 @@ "modelRef": "google/gemini-3-pro", "score": 87.1 }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 87.1 + }, { "modelRef": "openai/gpt-5-codex", "score": 86.8 @@ -108,6 +180,10 @@ "modelRef": "minimax/minimax-m2", "score": 86.8 }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 86.8 + }, { "modelRef": "openai/gpt-5-medium", "score": 86.5 @@ -120,10 +196,18 @@ "modelRef": "anthropic/claude-opus-4-5", "score": 86.3 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 86.3 + }, { "modelRef": "minimax/minimax-m2-1", "score": 85.4 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 85.1 + }, { "modelRef": "openai/gpt-5-2", "score": 84.8 @@ -132,10 +216,18 @@ "modelRef": "openai/gpt-5", "score": 84.8 }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 84.8 + }, { "modelRef": "anthropic/claude-opus-4-6", "score": 84.8 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 84.5 + }, { "modelRef": "openai/gpt-5-low", "score": 84.2 @@ -148,6 +240,10 @@ "modelRef": "baidu/ernie-5-0-thinking-preview", "score": 83.9 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 83.9 + }, { "modelRef": "alibaba/qwen3-max-thinking-preview", "score": 83.6 @@ -164,10 +260,22 @@ "modelRef": "openai/gpt-5-1", "score": 81.9 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 81.6 + }, { "modelRef": "kimi/kimi-k2-5-non-reasoning", "score": 81.3 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 81.0 + }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 81.0 + }, { "modelRef": "openai/o3", "score": 80.7 @@ -180,6 +288,14 @@ "modelRef": "aws/nova-2-0-omni-reasoning-medium", "score": 80.4 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 79.5 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 79.5 + }, { "modelRef": "alibaba/qwen3-coder-next", "score": 79.5 @@ -188,6 +304,10 @@ "modelRef": "deepseek/deepseek-v3-2", "score": 78.9 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 78.9 + }, { "modelRef": "anthropic/claude-4-5-sonnet-thinking", "score": 78.1 @@ -204,6 +324,10 @@ "modelRef": "aws/nova-2-0-lite-reasoning-medium", "score": 75.7 }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 75.7 + }, { "modelRef": "xai/grok-4", "score": 74.9 @@ -220,10 +344,18 @@ "modelRef": "alibaba/qwen3-max", "score": 74.3 }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 73.7 + }, { "modelRef": "kimi/kimi-k2-0905", "score": 73.4 }, + { + "modelRef": "anthropic/claude-4-opus-thinking", + "score": 73.4 + }, { "modelRef": "aws/nova-2-0-lite-reasoning-low", "score": 71.9 @@ -241,21 +373,29 @@ "score": 71.1 }, { - "modelRef": "zai/glm-4-6-reasoning", - "score": 70.5 + "modelRef": "inception/mercury-2", + "score": 70.8 }, { - "modelRef": "anthropic/claude-4-opus-thinking", + "modelRef": "zai/glm-4-6-reasoning", "score": 70.5 }, { "modelRef": "anthropic/claude-4-5-sonnet", "score": 70.5 }, + { + "modelRef": "xai/grok-4-20-non-reasoning", + "score": 69.6 + }, { "modelRef": "servicenow/apriel-v1-6-15b-thinker", "score": 69.3 }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 69.0 + }, { "modelRef": "servicenow/apriel-v1-5-15b-thinker", "score": 68.4 @@ -268,6 +408,10 @@ "modelRef": "google/gemini-3-pro-low", "score": 68.1 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 67.8 + }, { "modelRef": "aws/nova-2-0-omni-reasoning-low", "score": 67.8 @@ -284,6 +428,10 @@ "modelRef": "openai/gpt-oss-120b", "score": 65.8 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 65.2 + }, { "modelRef": "anthropic/claude-4-sonnet-thinking", "score": 64.6 @@ -352,6 +500,10 @@ "modelRef": "openai/gpt-4-1-mini", "score": 52.9 }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 52.6 + }, { "modelRef": "anthropic/claude-4-sonnet", "score": 52.3 @@ -384,6 +536,10 @@ "modelRef": "upstage/solar-open-100b-reasoning", "score": 48.2 }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 47.7 + }, { "modelRef": "openai/gpt-4-1", "score": 47.1 @@ -392,6 +548,10 @@ "modelRef": "deepseek/deepseek-v3-0324", "score": 47.1 }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 46.8 + }, { "modelRef": "zai/glm-4-5-air", "score": 46.5 @@ -472,6 +632,10 @@ "modelRef": "openai/gpt-5-nano", "score": 36.5 }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 36.5 + }, { "modelRef": "mistral/pixtral-large-2411", "score": 36.5 @@ -480,14 +644,26 @@ "modelRef": "deepseek/deepseek-r1", "score": 36.5 }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 35.1 + }, { "modelRef": "alibaba/qwen3-vl-235b-a22b-instruct", "score": 35.1 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 34.8 + }, { "modelRef": "deepseek/deepseek-v3-1", "score": 34.8 }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 34.5 + }, { "modelRef": "alibaba/qwen3-coder-30b-a3b-instruct", "score": 34.5 @@ -560,6 +736,14 @@ "modelRef": "openai/o3-mini-high", "score": 31.3 }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 31.3 + }, + { + "modelRef": "mistral/mistral-small-4", + "score": 31.0 + }, { "modelRef": "zai/glm-4-6v", "score": 30.7 @@ -620,6 +804,10 @@ "modelRef": "upstage/solar-pro-2-reasoning", "score": 28.1 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 28.1 + }, { "modelRef": "nvidia/llama-nemotron-super-49b-v1-5-reasoning", "score": 28.1 @@ -652,6 +840,10 @@ "modelRef": "alibaba/qwen3-235b-a22b-instruct", "score": 27.2 }, + { + "modelRef": "nvidia/llama-3-3-nemotron-super-49b-reasoning", + "score": 26.9 + }, { "modelRef": "prime-intellect/intellect-3", "score": 26.6 @@ -748,6 +940,10 @@ "modelRef": "alibaba/qwen3-235b-a22b-instruct-reasoning", "score": 24.0 }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 23.4 + }, { "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2", "score": 23.4 @@ -808,6 +1004,10 @@ "modelRef": "nous-research/hermes-4-llama-3-1-70b", "score": 21.6 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 21.6 + }, { "modelRef": "alibaba/qwen3-next-80b-a3b-instruct", "score": 21.6 @@ -832,6 +1032,10 @@ "modelRef": "meta/llama-3-2-instruct-3b", "score": 21.1 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 21.1 + }, { "modelRef": "alibaba/qwen3-0-6b-instruct-reasoning", "score": 21.1 @@ -896,6 +1100,10 @@ "modelRef": "alibaba/qwen3-4b-instruct-reasoning", "score": 19.0 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 18.4 + }, { "modelRef": "google/gemini-2-5-flash-lite-reasoning", "score": 18.4 @@ -933,7 +1141,7 @@ "score": 16.4 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 16.1 }, { @@ -996,6 +1204,10 @@ "modelRef": "ibm/granite-4-0-350m", "score": 13.2 }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 12.9 + }, { "modelRef": "liquidai/lfm2-1-2b", "score": 12.6 @@ -1012,14 +1224,14 @@ "modelRef": "ai2/olmo-3-7b-instruct", "score": 12.6 }, - { - "modelRef": "meta/llama-3-2-instruct-1b", - "score": 12.3 - }, { "modelRef": "nvidia/llama-3-1-nemotron-nano-4b-reasoning", "score": 11.7 }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 11.4 + }, { "modelRef": "nvidia/llama-3-1-nemotron-ultra-253b-v1-reasoning", "score": 11.4 @@ -1028,6 +1240,10 @@ "modelRef": "deepseek/deepseek-r1-0120", "score": 11.4 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 11.1 + }, { "modelRef": "liquidai/lfm2-5-1-2b-instruct", "score": 10.8 @@ -1048,6 +1264,10 @@ "modelRef": "google/gemma-3-27b", "score": 10.5 }, + { + "modelRef": "google/gemma-3-1b", + "score": 10.5 + }, { "modelRef": "alibaba/qwen3-30b-a3b-2507", "score": 10.2 @@ -1076,6 +1296,10 @@ "modelRef": "lg/exaone-4-0-32b", "score": 4.1 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 0.0 + }, { "modelRef": "reka-ai/reka-flash-3", "score": 0.0 @@ -1088,10 +1312,18 @@ "modelRef": "mistral/mistral-7b-instruct", "score": 0.0 }, + { + "modelRef": "meta/llama-3-instruct-8b", + "score": 0.0 + }, { "modelRef": "meta/llama-3-instruct-70b", "score": 0.0 }, + { + "modelRef": "meta/llama-3-2-instruct-1b", + "score": 0.0 + }, { "modelRef": "kimi/kimi-linear-48b-a3b-instruct", "score": 0.0 @@ -1105,15 +1337,15 @@ "score": 0.0 }, { - "modelRef": "google/gemma-3-1b", + "modelRef": "deepseek/deepseek-v3-2-speciale", "score": 0.0 }, { - "modelRef": "deepseek/deepseek-v3-2-speciale", + "modelRef": "deepseek/deepseek-r1-qwen3-8b", "score": 0.0 }, { - "modelRef": "deepseek/deepseek-r1-qwen3-8b", + "modelRef": "cohere/tiny-aya-global", "score": 0.0 }, { diff --git a/src/content/benchmarks/terminalbench_hard.json b/src/content/benchmarks/terminalbench_hard.json index 0ed4fb7..2991e6e 100644 --- a/src/content/benchmarks/terminalbench_hard.json +++ b/src/content/benchmarks/terminalbench_hard.json @@ -8,7 +8,7 @@ "Agent", "Coding" ], - "lastUpdated": "2026-02-12", + "lastUpdated": "2026-03-25", "metrics": { "unit": "pass@1(%)", "isBetterHigher": true @@ -20,6 +20,26 @@ "initialWeight": 1000 }, "snapshot": [ + { + "modelRef": "openai/gpt-5-4", + "score": 57.6 + }, + { + "modelRef": "google/gemini-3-1-pro-preview", + "score": 53.8 + }, + { + "modelRef": "openai/gpt-5-3-codex", + "score": 53.0 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6-adaptive", + "score": 53.0 + }, + { + "modelRef": "openai/gpt-5-4-mini", + "score": 52.3 + }, { "modelRef": "anthropic/claude-opus-4-6", "score": 48.5 @@ -32,6 +52,10 @@ "modelRef": "anthropic/claude-opus-4-5-thinking", "score": 47.0 }, + { + "modelRef": "anthropic/claude-sonnet-4-6", + "score": 46.2 + }, { "modelRef": "anthropic/claude-opus-4-6-adaptive", "score": 46.2 @@ -48,14 +72,42 @@ "modelRef": "openai/gpt-5-2-medium", "score": 43.2 }, + { + "modelRef": "openai/gpt-5-4-nano", + "score": 42.4 + }, + { + "modelRef": "anthropic/claude-sonnet-4-6-non-reasoning-low-effort", + "score": 42.4 + }, { "modelRef": "google/gemini-3-pro", "score": 41.7 }, + { + "modelRef": "xiaomi/mimo-v2-pro", + "score": 40.9 + }, + { + "modelRef": "xai/grok-4-20", + "score": 40.9 + }, { "modelRef": "anthropic/claude-opus-4-5", "score": 40.9 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b", + "score": 40.9 + }, + { + "modelRef": "zai/glm-5-non-reasoning", + "score": 39.4 + }, + { + "modelRef": "minimax/minimax-m2-7", + "score": 39.4 + }, { "modelRef": "google/gemini-3-flash-reasoning", "score": 38.6 @@ -72,6 +124,10 @@ "modelRef": "openai/gpt-5-codex", "score": 37.9 }, + { + "modelRef": "openai/gpt-5-4-non-reasoning", + "score": 37.9 + }, { "modelRef": "openai/o3", "score": 37.1 @@ -88,10 +144,22 @@ "modelRef": "anthropic/claude-4-5-sonnet-thinking", "score": 35.6 }, + { + "modelRef": "alibaba/qwen3-5-397b-a17b-non-reasoning", + "score": 35.6 + }, + { + "modelRef": "xiaomi/mimo-v2-omni", + "score": 34.8 + }, { "modelRef": "openai/gpt-5-1-codex", "score": 34.8 }, + { + "modelRef": "minimax/minimax-m2-5", + "score": 34.8 + }, { "modelRef": "kimi/kimi-k2-5", "score": 34.8 @@ -104,14 +172,26 @@ "modelRef": "anthropic/claude-4-1-opus-thinking", "score": 34.3 }, + { + "modelRef": "openai/gpt-5-4-mini-medium", + "score": 34.1 + }, { "modelRef": "google/gemini-3-pro-low", "score": 34.1 }, + { + "modelRef": "zai/glm-5-turbo", + "score": 33.3 + }, { "modelRef": "openai/gpt-5-mini", "score": 33.3 }, + { + "modelRef": "openai/gpt-5-4-nano-medium", + "score": 33.3 + }, { "modelRef": "openai/gpt-5-1-codex-mini", "score": 33.3 @@ -124,6 +204,10 @@ "modelRef": "deepseek/deepseek-v3-2", "score": 32.6 }, + { + "modelRef": "alibaba/qwen3-5-27b", + "score": 32.6 + }, { "modelRef": "zai/glm-4-7", "score": 31.8 @@ -140,6 +224,10 @@ "modelRef": "deepseek/deepseek-v3-1-terminus", "score": 31.8 }, + { + "modelRef": "alibaba/qwen3-5-27b-non-reasoning", + "score": 31.8 + }, { "modelRef": "xiaomi/mimo-v2-0206", "score": 31.1 @@ -160,6 +248,10 @@ "modelRef": "anthropic/claude-4-opus-thinking", "score": 31.1 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b", + "score": 31.1 + }, { "modelRef": "zai/glm-4-7-non-reasoning", "score": 30.3 @@ -168,6 +260,10 @@ "modelRef": "deepseek/deepseek-v3-1-terminus-reasoning", "score": 30.3 }, + { + "modelRef": "alibaba/qwen3-5-122b-a10b-non-reasoning", + "score": 29.5 + }, { "modelRef": "zai/glm-4-6", "score": 28.8 @@ -176,6 +272,10 @@ "modelRef": "openai/gpt-5-mini-medium", "score": 28.8 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-super-120b-a12b", + "score": 28.8 + }, { "modelRef": "minimax/minimax-m2-1", "score": 28.8 @@ -188,6 +288,10 @@ "modelRef": "xiaomi/mimo-v2-flash-reasoning", "score": 28.0 }, + { + "modelRef": "stepfun/step-3-5-flash", + "score": 27.3 + }, { "modelRef": "anthropic/claude-4-sonnet", "score": 27.3 @@ -204,6 +308,10 @@ "modelRef": "openai/gpt-5-low", "score": 26.5 }, + { + "modelRef": "inception/mercury-2", + "score": 26.5 + }, { "modelRef": "google/gemini-2-5-pro", "score": 26.5 @@ -212,6 +320,10 @@ "modelRef": "bytedance-seed/doubao-seed-code", "score": 26.5 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b", + "score": 26.5 + }, { "modelRef": "xiaomi/mimo-v2-flash", "score": 25.8 @@ -240,6 +352,14 @@ "modelRef": "xai/grok-4-1-fast-reasoning", "score": 24.2 }, + { + "modelRef": "openai/gpt-5-4-nano-non-reasoning", + "score": 24.2 + }, + { + "modelRef": "google/gemini-3-1-flash-lite-preview", + "score": 24.2 + }, { "modelRef": "deepseek/deepseek-v3-1", "score": 24.2 @@ -252,6 +372,10 @@ "modelRef": "alibaba/qwen3-max-thinking", "score": 24.2 }, + { + "modelRef": "alibaba/qwen3-5-9b", + "score": 24.2 + }, { "modelRef": "openai/gpt-oss-120b", "score": 23.5 @@ -277,7 +401,7 @@ "score": 22.0 }, { - "modelRef": "bytedance-seed/doubao-seed-1-8", + "modelRef": "xai/grok-4-20-non-reasoning", "score": 22.0 }, { @@ -320,10 +444,22 @@ "modelRef": "openai/gpt-5-minimal", "score": 18.2 }, + { + "modelRef": "openai/gpt-5-4-mini-non-reasoning", + "score": 18.2 + }, { "modelRef": "alibaba/qwen3-coder-next", "score": 18.2 }, + { + "modelRef": "alibaba/qwen3-5-9b-non-reasoning", + "score": 18.2 + }, + { + "modelRef": "alibaba/qwen3-5-4b", + "score": 18.2 + }, { "modelRef": "xai/grok-code-fast-1", "score": 17.4 @@ -336,6 +472,10 @@ "modelRef": "openai/gpt-5-nano-medium", "score": 17.4 }, + { + "modelRef": "mistral/mistral-small-4", + "score": 17.4 + }, { "modelRef": "aws/nova-2-0-pro-reasoning-low", "score": 17.4 @@ -480,6 +620,10 @@ "modelRef": "alibaba/qwen3-vl-235b-a22b-reasoning", "score": 11.4 }, + { + "modelRef": "alibaba/qwen3-5-4b-non-reasoning", + "score": 11.4 + }, { "modelRef": "servicenow/apriel-v1-5-15b-thinker", "score": 10.6 @@ -488,10 +632,18 @@ "modelRef": "openai/gpt-oss-20b", "score": 10.6 }, + { + "modelRef": "mistral/mistral-small-4-non-reasoning", + "score": 10.6 + }, { "modelRef": "mistral/mistral-medium-3-1", "score": 10.6 }, + { + "modelRef": "longcat/longcat-flash-lite", + "score": 10.6 + }, { "modelRef": "inclusionai/ling-flash-2-0", "score": 10.6 @@ -500,6 +652,10 @@ "modelRef": "inclusionai/ling-1t", "score": 10.6 }, + { + "modelRef": "alibaba/qwen3-5-35b-a3b-non-reasoning", + "score": 10.6 + }, { "modelRef": "nous-research/hermes-4-llama-3-1-405b", "score": 9.8 @@ -580,6 +736,10 @@ "modelRef": "openai/gpt-5-nano-minimal", "score": 6.8 }, + { + "modelRef": "nvidia/nvidia-nemotron-3-nano-4b", + "score": 6.8 + }, { "modelRef": "mistral/mistral-small-3-2", "score": 6.8 @@ -681,7 +841,7 @@ "score": 5.3 }, { - "modelRef": "stepfun/step3-vl-10b", + "modelRef": "stepfun/step-3-vl-10b", "score": 5.3 }, { @@ -812,6 +972,14 @@ "modelRef": "alibaba/qwen3-omni-30b-a3b-reasoning", "score": 3.8 }, + { + "modelRef": "alibaba/qwen3-5-2b-non-reasoning", + "score": 3.8 + }, + { + "modelRef": "alibaba/qwen3-5-2b", + "score": 3.8 + }, { "modelRef": "alibaba/qwen3-14b-instruct-reasoning", "score": 3.8 @@ -848,10 +1016,22 @@ "modelRef": "upstage/solar-open-100b-reasoning", "score": 2.3 }, + { + "modelRef": "trillionlabs/tri-21b-think-preview", + "score": 2.3 + }, { "modelRef": "tii-uae/falcon-h1r-7b", "score": 2.3 }, + { + "modelRef": "sarvam/sarvam-m-reasoning", + "score": 2.3 + }, + { + "modelRef": "sarvam/sarvam-30b", + "score": 2.3 + }, { "modelRef": "nvidia/llama-3-1-nemotron-ultra-253b-v1-reasoning", "score": 2.3 @@ -900,6 +1080,10 @@ "modelRef": "ai21-labs/jamba-1-7-large", "score": 2.3 }, + { + "modelRef": "sarvam/sarvam-105b", + "score": 1.5 + }, { "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2-reasoning", "score": 1.5 @@ -944,6 +1128,10 @@ "modelRef": "ai2/olmo-3-32b-think", "score": 1.5 }, + { + "modelRef": "trillionlabs/tri-21b-think-v0-5", + "score": 0.8 + }, { "modelRef": "nvidia/nvidia-nemotron-nano-9b-v2", "score": 0.8 @@ -988,6 +1176,10 @@ "modelRef": "aws/nova-lite", "score": 0.8 }, + { + "modelRef": "anthropic/claude-3-haiku", + "score": 0.8 + }, { "modelRef": "ai21-labs/jamba-reasoning-3b", "score": 0.8 @@ -996,6 +1188,14 @@ "modelRef": "ai2/olmo-3-7b-think", "score": 0.8 }, + { + "modelRef": "swiss-ai-initiative/apertus-8b-instruct", + "score": 0.0 + }, + { + "modelRef": "swiss-ai-initiative/apertus-70b-instruct", + "score": 0.0 + }, { "modelRef": "reka-ai/reka-flash-3", "score": 0.0 @@ -1016,6 +1216,10 @@ "modelRef": "nous-research/hermes-4-llama-3-1-70b", "score": 0.0 }, + { + "modelRef": "nanbeige/nanbeige4-1-3b", + "score": 0.0 + }, { "modelRef": "mistral/ministral-3-3b", "score": 0.0 @@ -1044,6 +1248,10 @@ "modelRef": "liquidai/lfm2-5-1-2b-instruct", "score": 0.0 }, + { + "modelRef": "liquidai/lfm2-24b-a2b", + "score": 0.0 + }, { "modelRef": "liquidai/lfm2-1-2b", "score": 0.0 @@ -1084,6 +1292,10 @@ "modelRef": "google/gemma-3-1b", "score": 0.0 }, + { + "modelRef": "cohere/tiny-aya-global", + "score": 0.0 + }, { "modelRef": "azure/phi-4-mini", "score": 0.0 @@ -1096,6 +1308,14 @@ "modelRef": "alibaba/qwen3-vl-4b-instruct", "score": 0.0 }, + { + "modelRef": "alibaba/qwen3-5-0-8b-non-reasoning", + "score": 0.0 + }, + { + "modelRef": "alibaba/qwen3-5-0-8b", + "score": 0.0 + }, { "modelRef": "alibaba/qwen3-1-7b-instruct-reasoning", "score": 0.0 diff --git a/src/content/models/alibaba/qwen3-5-0-8b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-0-8b-non-reasoning.json new file mode 100644 index 0000000..d660344 --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-0-8b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 0.8B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-0-8b.json b/src/content/models/alibaba/qwen3-5-0-8b.json new file mode 100644 index 0000000..6e16b12 --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-0-8b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 0.8B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-122b-a10b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-122b-a10b-non-reasoning.json new file mode 100644 index 0000000..489cb65 --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-122b-a10b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 122B A10B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-24", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-122b-a10b.json b/src/content/models/alibaba/qwen3-5-122b-a10b.json new file mode 100644 index 0000000..de4558d --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-122b-a10b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 122B A10B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-24", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-27b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-27b-non-reasoning.json new file mode 100644 index 0000000..f2cee5f --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-27b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 27B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-24", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-27b.json b/src/content/models/alibaba/qwen3-5-27b.json new file mode 100644 index 0000000..a8d319c --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-27b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 27B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-24", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-2b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-2b-non-reasoning.json new file mode 100644 index 0000000..a391aec --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-2b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 2B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-2b.json b/src/content/models/alibaba/qwen3-5-2b.json new file mode 100644 index 0000000..5f65472 --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-2b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 2B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-35b-a3b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-35b-a3b-non-reasoning.json new file mode 100644 index 0000000..757e1b6 --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-35b-a3b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 35B A3B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-24", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-35b-a3b.json b/src/content/models/alibaba/qwen3-5-35b-a3b.json new file mode 100644 index 0000000..fc7ea4d --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-35b-a3b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 35B A3B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-24", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-397b-a17b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-397b-a17b-non-reasoning.json new file mode 100644 index 0000000..301ce0b --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-397b-a17b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 397B A17B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-16", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-397b-a17b.json b/src/content/models/alibaba/qwen3-5-397b-a17b.json new file mode 100644 index 0000000..732b8c0 --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-397b-a17b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 397B A17B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-02-16", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-4b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-4b-non-reasoning.json new file mode 100644 index 0000000..f3f0dfc --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-4b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 4B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-4b.json b/src/content/models/alibaba/qwen3-5-4b.json new file mode 100644 index 0000000..ec26a2a --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-4b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 4B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-9b-non-reasoning.json b/src/content/models/alibaba/qwen3-5-9b-non-reasoning.json new file mode 100644 index 0000000..cfd09a0 --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-9b-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 9B (Non-reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-5-9b.json b/src/content/models/alibaba/qwen3-5-9b.json new file mode 100644 index 0000000..866cd4f --- /dev/null +++ b/src/content/models/alibaba/qwen3-5-9b.json @@ -0,0 +1,9 @@ +{ + "name": "Qwen3.5 9B (Reasoning)", + "publisher": "alibaba", + "releaseDate": "2026-03-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/alibaba/qwen3-coder-next.json b/src/content/models/alibaba/qwen3-coder-next.json index 7872689..28cdfc0 100644 --- a/src/content/models/alibaba/qwen3-coder-next.json +++ b/src/content/models/alibaba/qwen3-coder-next.json @@ -1,5 +1,5 @@ { - "name": "Qwen3-Coder-Next", + "name": "Qwen3 Coder Next", "publisher": "alibaba", "releaseDate": "2026-02-03", "params": "", diff --git a/src/content/models/anthropic/claude-opus-4-6-adaptive.json b/src/content/models/anthropic/claude-opus-4-6-adaptive.json index d7284b4..5d75780 100644 --- a/src/content/models/anthropic/claude-opus-4-6-adaptive.json +++ b/src/content/models/anthropic/claude-opus-4-6-adaptive.json @@ -1,5 +1,5 @@ { - "name": "Claude Opus 4.6 (Adaptive Reasoning)", + "name": "Claude Opus 4.6 (Adaptive Reasoning, Max Effort)", "publisher": "anthropic", "releaseDate": "2026-02-05", "params": "", diff --git a/src/content/models/anthropic/claude-opus-4-6.json b/src/content/models/anthropic/claude-opus-4-6.json index 823aa54..71ea14a 100644 --- a/src/content/models/anthropic/claude-opus-4-6.json +++ b/src/content/models/anthropic/claude-opus-4-6.json @@ -1,5 +1,5 @@ { - "name": "Claude Opus 4.6 (Non-reasoning)", + "name": "Claude Opus 4.6 (Non-reasoning, High Effort)", "publisher": "anthropic", "releaseDate": "2026-02-05", "params": "", diff --git a/src/content/models/anthropic/claude-sonnet-4-6-adaptive.json b/src/content/models/anthropic/claude-sonnet-4-6-adaptive.json new file mode 100644 index 0000000..88a6ddc --- /dev/null +++ b/src/content/models/anthropic/claude-sonnet-4-6-adaptive.json @@ -0,0 +1,9 @@ +{ + "name": "Claude Sonnet 4.6 (Adaptive Reasoning, Max Effort)", + "publisher": "anthropic", + "releaseDate": "2026-02-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/anthropic/claude-sonnet-4-6-non-reasoning-low-effort.json b/src/content/models/anthropic/claude-sonnet-4-6-non-reasoning-low-effort.json new file mode 100644 index 0000000..29daa4e --- /dev/null +++ b/src/content/models/anthropic/claude-sonnet-4-6-non-reasoning-low-effort.json @@ -0,0 +1,9 @@ +{ + "name": "Claude Sonnet 4.6 (Non-reasoning, Low Effort)", + "publisher": "anthropic", + "releaseDate": "2026-02-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/anthropic/claude-sonnet-4-6.json b/src/content/models/anthropic/claude-sonnet-4-6.json new file mode 100644 index 0000000..3eb48e0 --- /dev/null +++ b/src/content/models/anthropic/claude-sonnet-4-6.json @@ -0,0 +1,9 @@ +{ + "name": "Claude Sonnet 4.6 (Non-reasoning, High Effort)", + "publisher": "anthropic", + "releaseDate": "2026-02-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/cohere/tiny-aya-global.json b/src/content/models/cohere/tiny-aya-global.json new file mode 100644 index 0000000..ef6f0f0 --- /dev/null +++ b/src/content/models/cohere/tiny-aya-global.json @@ -0,0 +1,9 @@ +{ + "name": "Tiny Aya Global", + "publisher": "cohere", + "releaseDate": "2026-02-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/google/gemini-3-1-flash-lite-preview.json b/src/content/models/google/gemini-3-1-flash-lite-preview.json new file mode 100644 index 0000000..6293a9f --- /dev/null +++ b/src/content/models/google/gemini-3-1-flash-lite-preview.json @@ -0,0 +1,9 @@ +{ + "name": "Gemini 3.1 Flash-Lite Preview", + "publisher": "google", + "releaseDate": "2026-03-03", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/google/gemini-3-1-pro-preview.json b/src/content/models/google/gemini-3-1-pro-preview.json new file mode 100644 index 0000000..0d51424 --- /dev/null +++ b/src/content/models/google/gemini-3-1-pro-preview.json @@ -0,0 +1,9 @@ +{ + "name": "Gemini 3.1 Pro Preview", + "publisher": "google", + "releaseDate": "2026-02-19", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/inception/mercury-2.json b/src/content/models/inception/mercury-2.json new file mode 100644 index 0000000..012da74 --- /dev/null +++ b/src/content/models/inception/mercury-2.json @@ -0,0 +1,9 @@ +{ + "name": "Mercury 2", + "publisher": "inception", + "releaseDate": "2026-02-20", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/liquidai/lfm2-24b-a2b.json b/src/content/models/liquidai/lfm2-24b-a2b.json new file mode 100644 index 0000000..2a58c7f --- /dev/null +++ b/src/content/models/liquidai/lfm2-24b-a2b.json @@ -0,0 +1,9 @@ +{ + "name": "LFM2 24B A2B", + "publisher": "liquidai", + "releaseDate": "2026-02-25", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/longcat/longcat-flash-lite.json b/src/content/models/longcat/longcat-flash-lite.json new file mode 100644 index 0000000..e7af753 --- /dev/null +++ b/src/content/models/longcat/longcat-flash-lite.json @@ -0,0 +1,9 @@ +{ + "name": "LongCat Flash Lite", + "publisher": "longcat", + "releaseDate": "2026-01-28", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/minimax/minimax-m2-5.json b/src/content/models/minimax/minimax-m2-5.json new file mode 100644 index 0000000..43310a3 --- /dev/null +++ b/src/content/models/minimax/minimax-m2-5.json @@ -0,0 +1,9 @@ +{ + "name": "MiniMax-M2.5", + "publisher": "minimax", + "releaseDate": "2026-02-12", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/minimax/minimax-m2-7.json b/src/content/models/minimax/minimax-m2-7.json new file mode 100644 index 0000000..e38e49d --- /dev/null +++ b/src/content/models/minimax/minimax-m2-7.json @@ -0,0 +1,9 @@ +{ + "name": "MiniMax-M2.7", + "publisher": "minimax", + "releaseDate": "2026-03-18", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/mistral/mistral-small-4-non-reasoning.json b/src/content/models/mistral/mistral-small-4-non-reasoning.json new file mode 100644 index 0000000..49ef147 --- /dev/null +++ b/src/content/models/mistral/mistral-small-4-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Mistral Small 4 (Non-reasoning)", + "publisher": "mistral", + "releaseDate": "2026-03-16", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/mistral/mistral-small-4.json b/src/content/models/mistral/mistral-small-4.json new file mode 100644 index 0000000..229dbb7 --- /dev/null +++ b/src/content/models/mistral/mistral-small-4.json @@ -0,0 +1,9 @@ +{ + "name": "Mistral Small 4 (Reasoning)", + "publisher": "mistral", + "releaseDate": "2026-03-16", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/nanbeige/nanbeige4-1-3b.json b/src/content/models/nanbeige/nanbeige4-1-3b.json new file mode 100644 index 0000000..b1ed3b9 --- /dev/null +++ b/src/content/models/nanbeige/nanbeige4-1-3b.json @@ -0,0 +1,9 @@ +{ + "name": "Nanbeige4.1-3B", + "publisher": "nanbeige", + "releaseDate": "2026-02-11", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/nvidia/nvidia-nemotron-3-nano-4b.json b/src/content/models/nvidia/nvidia-nemotron-3-nano-4b.json new file mode 100644 index 0000000..67d43c0 --- /dev/null +++ b/src/content/models/nvidia/nvidia-nemotron-3-nano-4b.json @@ -0,0 +1,9 @@ +{ + "name": "NVIDIA Nemotron 3 Nano 4B", + "publisher": "nvidia", + "releaseDate": "2026-03-16", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/nvidia/nvidia-nemotron-3-super-120b-a12b.json b/src/content/models/nvidia/nvidia-nemotron-3-super-120b-a12b.json new file mode 100644 index 0000000..001bfb9 --- /dev/null +++ b/src/content/models/nvidia/nvidia-nemotron-3-super-120b-a12b.json @@ -0,0 +1,9 @@ +{ + "name": "NVIDIA Nemotron 3 Super 120B A12B (Reasoning)", + "publisher": "nvidia", + "releaseDate": "2026-03-11", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-3-codex.json b/src/content/models/openai/gpt-5-3-codex.json new file mode 100644 index 0000000..21e6901 --- /dev/null +++ b/src/content/models/openai/gpt-5-3-codex.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.3 Codex (xhigh)", + "publisher": "openai", + "releaseDate": "2026-02-05", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-mini-medium.json b/src/content/models/openai/gpt-5-4-mini-medium.json new file mode 100644 index 0000000..61315e1 --- /dev/null +++ b/src/content/models/openai/gpt-5-4-mini-medium.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 mini (medium)", + "publisher": "openai", + "releaseDate": "2026-03-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-mini-non-reasoning.json b/src/content/models/openai/gpt-5-4-mini-non-reasoning.json new file mode 100644 index 0000000..3ba6c88 --- /dev/null +++ b/src/content/models/openai/gpt-5-4-mini-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 mini (Non-Reasoning)", + "publisher": "openai", + "releaseDate": "2026-03-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-mini.json b/src/content/models/openai/gpt-5-4-mini.json new file mode 100644 index 0000000..a5fffb1 --- /dev/null +++ b/src/content/models/openai/gpt-5-4-mini.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 mini (xhigh)", + "publisher": "openai", + "releaseDate": "2026-03-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-nano-medium.json b/src/content/models/openai/gpt-5-4-nano-medium.json new file mode 100644 index 0000000..adcd981 --- /dev/null +++ b/src/content/models/openai/gpt-5-4-nano-medium.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 nano (medium)", + "publisher": "openai", + "releaseDate": "2026-03-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-nano-non-reasoning.json b/src/content/models/openai/gpt-5-4-nano-non-reasoning.json new file mode 100644 index 0000000..0918aa2 --- /dev/null +++ b/src/content/models/openai/gpt-5-4-nano-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 nano (Non-Reasoning)", + "publisher": "openai", + "releaseDate": "2026-03-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-nano.json b/src/content/models/openai/gpt-5-4-nano.json new file mode 100644 index 0000000..75555fc --- /dev/null +++ b/src/content/models/openai/gpt-5-4-nano.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 nano (xhigh)", + "publisher": "openai", + "releaseDate": "2026-03-17", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-non-reasoning.json b/src/content/models/openai/gpt-5-4-non-reasoning.json new file mode 100644 index 0000000..9f3af84 --- /dev/null +++ b/src/content/models/openai/gpt-5-4-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 (Non-reasoning)", + "publisher": "openai", + "releaseDate": "2026-03-05", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4-pro.json b/src/content/models/openai/gpt-5-4-pro.json new file mode 100644 index 0000000..3ef8cf0 --- /dev/null +++ b/src/content/models/openai/gpt-5-4-pro.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 Pro (xhigh)", + "publisher": "openai", + "releaseDate": "2026-03-05", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/openai/gpt-5-4.json b/src/content/models/openai/gpt-5-4.json new file mode 100644 index 0000000..c15d3bd --- /dev/null +++ b/src/content/models/openai/gpt-5-4.json @@ -0,0 +1,9 @@ +{ + "name": "GPT-5.4 (xhigh)", + "publisher": "openai", + "releaseDate": "2026-03-05", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/sarvam/sarvam-105b.json b/src/content/models/sarvam/sarvam-105b.json new file mode 100644 index 0000000..5f88972 --- /dev/null +++ b/src/content/models/sarvam/sarvam-105b.json @@ -0,0 +1,9 @@ +{ + "name": "Sarvam 105B (Reasoning)", + "publisher": "sarvam", + "releaseDate": "2026-03-06", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/sarvam/sarvam-30b.json b/src/content/models/sarvam/sarvam-30b.json new file mode 100644 index 0000000..ecad40d --- /dev/null +++ b/src/content/models/sarvam/sarvam-30b.json @@ -0,0 +1,9 @@ +{ + "name": "Sarvam 30B (Reasoning)", + "publisher": "sarvam", + "releaseDate": "2026-03-06", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/sarvam/sarvam-m-reasoning.json b/src/content/models/sarvam/sarvam-m-reasoning.json new file mode 100644 index 0000000..b277ad7 --- /dev/null +++ b/src/content/models/sarvam/sarvam-m-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Sarvam M (Reasoning)", + "publisher": "sarvam", + "releaseDate": "2025-05-23", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/stepfun/step-3-5-flash.json b/src/content/models/stepfun/step-3-5-flash.json new file mode 100644 index 0000000..6f77a9c --- /dev/null +++ b/src/content/models/stepfun/step-3-5-flash.json @@ -0,0 +1,9 @@ +{ + "name": "Step 3.5 Flash", + "publisher": "stepfun", + "releaseDate": "2026-02-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/stepfun/step-3-vl-10b.json b/src/content/models/stepfun/step-3-vl-10b.json new file mode 100644 index 0000000..1b6c063 --- /dev/null +++ b/src/content/models/stepfun/step-3-vl-10b.json @@ -0,0 +1,9 @@ +{ + "name": "Step3 VL 10B", + "publisher": "stepfun", + "releaseDate": "2026-01-20", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/swiss-ai-initiative/apertus-70b-instruct.json b/src/content/models/swiss-ai-initiative/apertus-70b-instruct.json new file mode 100644 index 0000000..aefa41f --- /dev/null +++ b/src/content/models/swiss-ai-initiative/apertus-70b-instruct.json @@ -0,0 +1,9 @@ +{ + "name": "Apertus 70B Instruct", + "publisher": "swiss-ai-initiative", + "releaseDate": "2025-09-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/swiss-ai-initiative/apertus-8b-instruct.json b/src/content/models/swiss-ai-initiative/apertus-8b-instruct.json new file mode 100644 index 0000000..7a357f9 --- /dev/null +++ b/src/content/models/swiss-ai-initiative/apertus-8b-instruct.json @@ -0,0 +1,9 @@ +{ + "name": "Apertus 8B Instruct", + "publisher": "swiss-ai-initiative", + "releaseDate": "2025-09-02", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/trillionlabs/tri-21b-think-preview.json b/src/content/models/trillionlabs/tri-21b-think-preview.json new file mode 100644 index 0000000..780402c --- /dev/null +++ b/src/content/models/trillionlabs/tri-21b-think-preview.json @@ -0,0 +1,9 @@ +{ + "name": "Tri-21B-think Preview", + "publisher": "trillionlabs", + "releaseDate": "2026-02-10", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/trillionlabs/tri-21b-think-v0-5.json b/src/content/models/trillionlabs/tri-21b-think-v0-5.json new file mode 100644 index 0000000..a40854f --- /dev/null +++ b/src/content/models/trillionlabs/tri-21b-think-v0-5.json @@ -0,0 +1,9 @@ +{ + "name": "Tri-21B-Think", + "publisher": "trillionlabs", + "releaseDate": "2026-02-10", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/xai/grok-4-20-non-reasoning.json b/src/content/models/xai/grok-4-20-non-reasoning.json new file mode 100644 index 0000000..c60e6c1 --- /dev/null +++ b/src/content/models/xai/grok-4-20-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "Grok 4.20 Beta 0309 (Non-reasoning)", + "publisher": "xai", + "releaseDate": "2026-03-10", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/xai/grok-4-20.json b/src/content/models/xai/grok-4-20.json new file mode 100644 index 0000000..e184c6c --- /dev/null +++ b/src/content/models/xai/grok-4-20.json @@ -0,0 +1,9 @@ +{ + "name": "Grok 4.20 Beta 0309 (Reasoning)", + "publisher": "xai", + "releaseDate": "2026-03-10", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/xiaomi/mimo-v2-omni.json b/src/content/models/xiaomi/mimo-v2-omni.json new file mode 100644 index 0000000..825c637 --- /dev/null +++ b/src/content/models/xiaomi/mimo-v2-omni.json @@ -0,0 +1,9 @@ +{ + "name": "mimo-v2-omni", + "publisher": "xiaomi", + "releaseDate": "2026-03-19", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/xiaomi/mimo-v2-pro.json b/src/content/models/xiaomi/mimo-v2-pro.json new file mode 100644 index 0000000..85c7bf5 --- /dev/null +++ b/src/content/models/xiaomi/mimo-v2-pro.json @@ -0,0 +1,9 @@ +{ + "name": "MiMo-V2-Pro", + "publisher": "xiaomi", + "releaseDate": "2026-03-18", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/zai/glm-5-non-reasoning.json b/src/content/models/zai/glm-5-non-reasoning.json new file mode 100644 index 0000000..fbfad3d --- /dev/null +++ b/src/content/models/zai/glm-5-non-reasoning.json @@ -0,0 +1,9 @@ +{ + "name": "GLM-5 (Non-reasoning)", + "publisher": "zai", + "releaseDate": "2026-02-11", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/models/zai/glm-5-turbo.json b/src/content/models/zai/glm-5-turbo.json new file mode 100644 index 0000000..ecbf870 --- /dev/null +++ b/src/content/models/zai/glm-5-turbo.json @@ -0,0 +1,9 @@ +{ + "name": "GLM 5 Turbo", + "publisher": "zai", + "releaseDate": "2026-03-15", + "params": "", + "license": "", + "website": "", + "discussionId": "" +} \ No newline at end of file diff --git a/src/content/publishers/inception.json b/src/content/publishers/inception.json new file mode 100644 index 0000000..17292d9 --- /dev/null +++ b/src/content/publishers/inception.json @@ -0,0 +1,6 @@ +{ + "name": "Inception", + "color": "#94a3b8", + "logo": "/logos/unknown.svg", + "website": "" +} \ No newline at end of file diff --git a/src/content/publishers/longcat.json b/src/content/publishers/longcat.json new file mode 100644 index 0000000..e7a64ae --- /dev/null +++ b/src/content/publishers/longcat.json @@ -0,0 +1,6 @@ +{ + "name": "LongCat", + "color": "#94a3b8", + "logo": "/logos/unknown.svg", + "website": "" +} \ No newline at end of file diff --git a/src/content/publishers/nanbeige.json b/src/content/publishers/nanbeige.json new file mode 100644 index 0000000..1eb9c0a --- /dev/null +++ b/src/content/publishers/nanbeige.json @@ -0,0 +1,6 @@ +{ + "name": "Nanbeige", + "color": "#94a3b8", + "logo": "/logos/unknown.svg", + "website": "" +} \ No newline at end of file diff --git a/src/content/publishers/sarvam.json b/src/content/publishers/sarvam.json new file mode 100644 index 0000000..e21542e --- /dev/null +++ b/src/content/publishers/sarvam.json @@ -0,0 +1,6 @@ +{ + "name": "Sarvam", + "color": "#94a3b8", + "logo": "/logos/unknown.svg", + "website": "" +} \ No newline at end of file diff --git a/src/content/publishers/swiss-ai-initiative.json b/src/content/publishers/swiss-ai-initiative.json new file mode 100644 index 0000000..8fd3c83 --- /dev/null +++ b/src/content/publishers/swiss-ai-initiative.json @@ -0,0 +1,6 @@ +{ + "name": "Swiss AI Initiative", + "color": "#94a3b8", + "logo": "/logos/unknown.svg", + "website": "" +} \ No newline at end of file diff --git a/src/content/publishers/trillionlabs.json b/src/content/publishers/trillionlabs.json new file mode 100644 index 0000000..c221b34 --- /dev/null +++ b/src/content/publishers/trillionlabs.json @@ -0,0 +1,6 @@ +{ + "name": "Trillion Labs", + "color": "#94a3b8", + "logo": "/logos/unknown.svg", + "website": "" +} \ No newline at end of file