-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathomnigirl_leaderboard.html
More file actions
209 lines (185 loc) · 10.4 KB
/
omnigirl_leaderboard.html
File metadata and controls
209 lines (185 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
<!doctype html>
<html lang="en">
<head>
  <!-- Character encoding is declared first so it is seen before any other head content. -->
  <meta charset="UTF-8">
  <title>OmniGIRL Leaderboard</title>

  <!-- Web font: the preconnect hints and the font stylesheet live inside <head>,
       where metadata content is expected, rather than before it. -->
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@100;400&display=swap" rel="stylesheet">

  <!-- PapaParse is not referenced by the inline script on this page, so it is
       deferred to keep it from blocking HTML parsing. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js" defer></script>
  <link rel="icon" href="https://images.emojiterra.com/google/noto-emoji/unicode-15/color/1024px/1f4da.png">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0/dist/css/bootstrap.min.css">

  <style>
    /* Page-wide typography and colours. */
    body {
      font-family: "JetBrains Mono", monospace;
      background: #fff;
      color: #000;
    }

    /* Main column: 85% wide by default, full width at or below 1400px (see media query). */
    #content { width: 85%; }

    /* Leaderboard table cells. */
    th { background: #f2f2f2; text-align: left; }
    td { text-align: left; vertical-align: middle; }

    /* Informational sections rendered below the table. */
    #notes { font-size: 1em; }
    #notes h3 { margin-top: 1em; font-size: 2em; text-align: center; }
    #notes li { font-size: 1.2em; font-weight: 300; margin: 1em; }

    .table-striped tbody tr:nth-of-type(odd) { background: #f8f9fa; }

    /* Viewports up to 1400px wide: viewport-relative text, full-width content. */
    @media (max-width: 1400px) {
      body { font-size: 1.6vw; }
      #content { width: 100%; }
      h1 { font-size: 2em; }
      h3 { font-size: 1.2em; }
      table { font-size: small; }
    }

    /* Icon sizing for the Org / Site columns. */
    .icon-cell { text-align: center; }
    .icon-cell img { height: 1.1em; }
    .icon-cell a {
      display: inline-block;
      font-size: 1.1em;
      line-height: 1;
      text-decoration: none;
    }
  </style>
</head>
<body>
<div id="content" class="container-fluid d-flex flex-column align-items-center gap-3">
<h1 class="text-nowrap mt-5">🏆 OmniGIRL Leaderboard 🏆</h1>
<h3 class="fw-light text-nowrap">
<small id="warning">A Multilingual & Multimodal Benchmark for GitHub Issue Resolution<br></small>
</h3>
<!-- Badges: GitHub / Paper / arXiv / Hugging Face.
     Every badge opens in a new tab, so each link carries rel="noopener" and an
     alt text that describes the link destination. -->
<div class="d-flex flex-row justify-content-center gap-3">
  <a href="https://github.com/DeepSoftwareAnalytics/OmniGIRL" target="_blank" rel="noopener">
    <img src="https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white" alt="OmniGIRL repository on GitHub" class="img-fluid">
  </a>
  <a href="https://conf.researchr.org/details/issta-2025/issta-2025-papers/2/OmniGIRL-A-Multilingual-and-Multimodal-Benchmark-for-GitHub-Issue-Resolution" target="_blank" rel="noopener">
    <img src="https://img.shields.io/badge/Paper-ISSTA%2725-a55fed.svg?style=for-the-badge" alt="OmniGIRL paper at ISSTA 2025" class="img-fluid">
  </a>
  <!-- arXiv preprint (linked over HTTPS) -->
  <a href="https://arxiv.org/abs/2505.04606" target="_blank" rel="noopener">
    <img src="https://img.shields.io/badge/arXiv-2505.04606-B31B1B.svg?style=for-the-badge&logo=arxiv&logoColor=white" alt="OmniGIRL preprint on arXiv (2505.04606)" class="img-fluid">
  </a>
  <a href="https://huggingface.co/datasets/Deep-Software-Analytics/OmniGIRL" target="_blank" rel="noopener">
    <img src="https://img.shields.io/badge/HuggingFace-%23f9ac00.svg?style=for-the-badge&logo=huggingface" alt="OmniGIRL dataset on Hugging Face" class="img-fluid">
  </a>
</div>
<!-- Language switch: one radio per benchmark subset. The page script reads the
     radios by name ("langradio") and uses each value to pick the score column.
     autocomplete="off" keeps the browser from restoring a previously selected
     radio on reload, so the markup's "checked" default stays authoritative. -->
<div class="btn-group mt-3" role="group" id="Language" aria-label="Benchmark language subset">
  <input type="radio" class="btn-check" name="langradio" id="Full" value="full" autocomplete="off" checked>
  <label class="btn btn-outline-primary" for="Full">Full</label>
  <input type="radio" class="btn-check" name="langradio" id="Python" value="python" autocomplete="off">
  <label class="btn btn-outline-primary" for="Python">Python</label>
  <input type="radio" class="btn-check" name="langradio" id="Java" value="java" autocomplete="off">
  <label class="btn btn-outline-primary" for="Java">Java</label>
  <input type="radio" class="btn-check" name="langradio" id="JavaScript" value="javascript" autocomplete="off">
  <label class="btn btn-outline-primary" for="JavaScript">JavaScript</label>
  <input type="radio" class="btn-check" name="langradio" id="TypeScript" value="typescript" autocomplete="off">
  <label class="btn btn-outline-primary" for="TypeScript">TypeScript</label>
</div>
<!-- Leaderboard table. The body starts empty; rows are inserted into
     #leaderboard-body by the page script. -->
<table id="origin" class="table table-striped table-bordered border border-primary border-3 mt-4 w-100">
  <!-- Caption is visually hidden: it names the table for assistive technology
       without changing the layout. -->
  <caption class="visually-hidden">OmniGIRL leaderboard: percentage of issues resolved per method and model</caption>
  <thead>
    <tr>
      <th scope="col" style="width:40%">Method</th>
      <th scope="col" style="width:25%">Model</th>
      <th scope="col" style="width:10%" class="text-center">%Resolved</th>
      <th scope="col" style="width:5%" class="text-center">Org</th>
      <th scope="col" style="width:5%" class="text-center">Site</th>
      <th scope="col" style="width:15%" class="text-center">Date</th>
    </tr>
  </thead>
  <tbody id="leaderboard-body"></tbody>
</table>
<!-- 📝 Notes -->
<div id="notes" class="w-100">
<h3>📝 Notes</h3>
<div class="inline-block mt-3">
<ol>
<li><strong>OmniGIRL</strong> is a multilingual & multimodal GitHub-issue-resolution benchmark with <strong>959 tasks</strong> spanning four programming languages. Inputs may include text, screenshots, rendered web pages and other modalities.</li>
<li>For realistic evaluation, <em>we recommend</em> that methods automatically examine each task’s raw input to detect available modalities (e.g., embedded webpages, images), retrieve the relevant content by themselves, and invoke the appropriate tools — instead of relying on manual hints. Doing so better assesses a solver’s <strong>general-purpose issue-resolution ability in real-world scenarios</strong>.</li>
<li>Our baseline system is released <em>for research purposes only</em>; please cite OmniGIRL if you use it.</li>
</ol>
</div>
</div>
<!-- 📨 How to Submit -->
<div id="notes" class="w-100">
<h3>📨 How to Submit</h3>
<div class="inline-block mt-3">
<ol>
<li>Prepare a <code>.json</code> or <code>.jsonl</code> file. Each record must contain at least the keys <code>instance_id</code>, <code>model_name_or_path</code>, and <code>model_patch</code>.</li>
<li>Email the file to <a href="mailto:guolh8@mail2.sysu.edu.cn?subject=OmniGIRL%20Submission">guolh8@mail2.sysu.edu.cn</a>.</li>
<li>We will evaluate your submission locally and update the leaderboard once the results are verified.</li>
</ol>
</div>
</div>
<!-- 🤗 More Leaderboards -->
<div id="notes" class="w-100">
<h3>🤗 More Leaderboards</h3>
<div class="inline-block mt-3">
<ol>
<li><a href="https://bigcode-bench.github.io/">BigCodeBench</a></li>
<li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">Big Code Models</a></li>
<li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">Chatbot Arena</a></li>
<li><a href="https://github.com/amazon-science/cceval">CrossCodeEval</a></li>
<li><a href="https://fudanselab-classeval.github.io/">ClassEval</a></li>
<li><a href="https://crux-eval.github.io/leaderboard.html">CRUXEval</a></li>
<li><a href="https://codetlingua.github.io/leaderboard.html">Code Lingua</a></li>
<li><a href="https://evo-eval.github.io/">Evo-Eval</a></li>
<li><a href="https://huggingface.co/spaces/EffiBench/effibench-leaderboard">EffiBench</a></li>
<li><a href="https://github.com/01-ai/HumanEval.jl">HumanEval.jl</a></li>
<li><a href="https://livecodebench.github.io/leaderboard.html">LiveCodeBench</a></li>
<li><a href="https://sparksofagi.github.io/MHPP/">MHPP</a></li>
<li><a href="https://github.com/THUDM/NaturalCodeBench">NaturalCodeBench</a></li>
<li><a href="https://github.com/Leolty/repobench">RepoBench</a></li>
<li><a href="https://www.swebench.com/">SWE-bench</a></li>
<li><a href="https://leaderboard.tabbyml.com/">TabbyML</a></li>
<li><a href="https://llm4softwaretesting.github.io/">TestEval</a></li>
</ol>
</div>
</div>
<!-- 🙏 Acknowledgements -->
<div id="notes" class="w-100 mb-5">
<h3>🙏 Acknowledgements</h3>
<div class="inline-block mt-3">
<ol>
<li>We build on prior work — <strong><a href="https://arxiv.org/abs/2310.06770" target="_blank">SWE-bench</a></strong>, <strong><a href="https://arxiv.org/abs/2407.01489" target="_blank">Agentless</a></strong>, and <strong><a href="https://arxiv.org/abs/2404.05427" target="_blank">AutoCodeRover</a></strong> — which laid the groundwork for this study.</li>
<li>We thank the <strong><a href="https://github.com/evalplus/evalplus" target="_blank">EvalPlus leaderboard</a></strong> team for releasing the elegant page template that inspired this site.</li>
<li>Finally, we are grateful to the <strong>open-source developer community</strong> for their invaluable contributions.</li>
</ol>
</div>
</div>
</div><!-- /#content -->
<!-- Rendering script: loads the results and fills the leaderboard table -->
<script>
/*
 * Leaderboard renderer.
 *
 * Loads results/results.json, then fills #leaderboard-body with one row per
 * entry that has a score for the selected language subset, sorted by that
 * score in descending order. The table is re-rendered whenever one of the
 * "langradio" radios changes.
 *
 * Rows are built as DOM nodes rather than HTML strings, so values taken from
 * the JSON file (method, model, date, URLs) are always treated as plain
 * text / attribute values and can never be parsed as markup.
 */
(async () => {
  // Radio value -> key in each results.json entry that holds that subset's score.
  const keyMap = {
    full: '%resolved_full',
    python: '%resolved_python',
    java: '%resolved_java',
    javascript: '%resolved_javascript',
    typescript: '%resolved_typescript'
  };
  const tbody = document.getElementById('leaderboard-body');
  const radios = document.querySelectorAll('input[name="langradio"]');

  // Load the data. Network errors, non-2xx responses and malformed JSON are
  // all reported the same way instead of surfacing as an unhandled rejection.
  let raw;
  try {
    const res = await fetch('results/results.json');
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    raw = Object.values(await res.json());
  } catch (err) {
    console.error('Failed to load results.json', err);
    alert('Failed to load results.json');
    return;
  }

  /**
   * Formats a score as a percentage with one decimal place.
   * Values below 1 are treated as fractions and scaled by 100; other values
   * are taken to be percentages already. Missing or non-numeric values
   * render as '--'.
   */
  const toPercent = v => {
    if (v == null || v === '') return '--';
    const n = Number(v); // accept numeric strings as well as numbers
    if (Number.isNaN(n)) return '--';
    return (n < 1 ? n * 100 : n).toFixed(1) + '%';
  };

  // URLs are assigned through DOM properties (not parsed as HTML), so an
  // HTML-encoded ampersand in the data has to be decoded explicitly.
  const cleanUrl = u => String(u || '').replace(/&amp;/g, '&').trim();

  // Returns an absolute http(s) URL, or '' when the value is empty, cannot be
  // parsed, or uses another scheme (e.g. "javascript:") that must not become
  // a clickable link.
  const toHttpUrl = u => {
    if (!u) return '';
    try {
      const parsed = new URL(u, document.baseURI);
      return /^https?:$/.test(parsed.protocol) ? parsed.href : '';
    } catch {
      return '';
    }
  };

  // Creates a <td> with an optional class; string children become text nodes.
  const cell = (className, ...children) => {
    const td = document.createElement('td');
    if (className) td.className = className;
    td.append(...children);
    return td;
  };

  /** Rebuilds the table body for the given radio value (a key of keyMap). */
  function render(lang) {
    const k = keyMap[lang];
    if (!k) return; // unknown subset: leave the current rows in place

    const frag = document.createDocumentFragment();
    raw
      .filter(r => r && r[k] != null)
      .sort((a, b) => Number(b[k]) - Number(a[k]))
      .forEach((r, i) => {
        const medal = i === 0 ? '🥇 ' : i === 1 ? '🥈 ' : i === 2 ? '🥉 ' : '';
        const method = r.method ?? '--';
        const orgUrl = cleanUrl(r.org);
        const siteUrl = toHttpUrl(cleanUrl(r.site));

        // Org column: logo image when a URL is present, otherwise a dash.
        let orgNode = '-';
        if (orgUrl) {
          orgNode = document.createElement('img');
          orgNode.alt = 'Organization logo';
          orgNode.src = orgUrl;
        }

        // Site column: link icon opening in a new tab, otherwise a dash.
        let siteNode = '-';
        if (siteUrl) {
          siteNode = document.createElement('a');
          siteNode.href = siteUrl;
          siteNode.target = '_blank';
          siteNode.rel = 'noopener noreferrer';
          // The visible content is only an emoji, so give the link a name.
          siteNode.setAttribute('aria-label', `Site for ${method} (opens in a new tab)`);
          siteNode.textContent = '🔗';
        }

        const tr = document.createElement('tr');
        tr.append(
          cell('', `${medal}${method}`),
          cell('', String(r.model ?? '--')),
          cell('text-center', toPercent(r[k])),
          cell('icon-cell', orgNode),
          cell('icon-cell', siteNode),
          cell('text-center', String(r.date ?? '--'))
        );
        frag.appendChild(tr);
      });

    // Swap the rows in one step.
    tbody.textContent = '';
    tbody.appendChild(frag);
  }

  // Start from whichever radio is actually checked, falling back to "full".
  const checked = document.querySelector('input[name="langradio"]:checked');
  render(checked ? checked.value : 'full');

  radios.forEach(radio => {
    radio.addEventListener('change', () => {
      if (radio.checked) render(radio.value);
    });
  });
})();
</script>
</body>
</html>