zhwang4ai commited on
Commit
f49345e
·
1 Parent(s): b816ec1

init leaderboard

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
Makefile DELETED
@@ -1,13 +0,0 @@
1
- .PHONY: style format
2
-
3
-
4
- style:
5
- python -m black --line-length 119 .
6
- python -m isort .
7
- ruff check --fix .
8
-
9
-
10
- quality:
11
- python -m black --check --line-length 119 .
12
- python -m isort --check-only .
13
- ruff check .
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,46 +1,22 @@
1
  ---
2
- title: GenerativeReasoningBenchmark
3
- emoji: 🥇
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
- short_description: Duplicate this leaderboard to initialize your own!
11
- sdk_version: 5.19.0
12
  ---
13
 
14
- # Start the configuration
15
 
16
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
17
-
18
- Results files should have the following format and be stored as json files:
19
- ```json
20
- {
21
- "config": {
22
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
23
- "model_name": "path of the model on the hub: org/model",
24
- "model_sha": "revision on the hub",
25
- },
26
- "results": {
27
- "task_name": {
28
- "metric_name": score,
29
- },
30
- "task_name2": {
31
- "metric_name": score,
32
- }
33
- }
34
- }
35
  ```
36
 
37
- Request files are created automatically by this tool.
38
-
39
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
40
-
41
- # Code logic for more complex edits
42
-
43
- You'll find
44
- - the main table' columns names and properties in `src/display/utils.py`
45
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
46
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
1
  ---
2
+ title: Reward Bench Leaderboard
3
+ emoji: 📐
4
+ colorFrom: pink
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.36.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
+ tags:
12
+ - leaderboard
13
  ---
14
 
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
 
17
+ To develop this app, it can be run with:
18
+ ```
19
+ gradio app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  ```
21
 
22
+ Paper: https://arxiv.org/abs/2504.02810
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,204 +1,146 @@
 
 
 
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
  demo = gr.Blocks(css=custom_css)
 
93
  with demo:
94
  gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
  )
 
 
 
 
 
 
 
 
 
190
 
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
 
 
 
 
 
197
  elem_id="citation-button",
198
  show_copy_button=True,
199
  )
200
 
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import json
2
+ from pathlib import Path
3
+
4
  import gradio as gr
 
5
  import pandas as pd
6
+
7
+ from texts import TITLE, DESCRIPTION
8
+ from process_data import load_average_data, load_hard_data, load_easy_data
9
+ from display import custom_css
10
+ BENCHMARKS_TO_SKIP = []
11
+
12
+ color_map = {
13
+ "Pretrained": "#7497db",
14
+ "RL": "#E8ECF2",
15
+ "Finetuned": "#ffcd75",
16
+ # "DPO": "#75809c",
17
+ }
18
+
19
# Lookup table from the short model identifiers (the values that
# prep_leaderboard_df passes in from the DataFrame index) to the
# "organisation/model" names shown in the "Model" column.
model_name_map = {
    "qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
    "qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
    "qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
    "llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
    "llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
    "llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
    "llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
    "mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
    "gemma-2-27b-it": "google/gemma-2-27b-it",
    "gemma-2-9b-it": "google/gemma-2-9b-it",
    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
    "deepseek-r1": "deepseek-ai/DeepSeek-R1",
    "qwq-32b": "Qwen/QwQ-32B",
    "yi-lightning": "Yi/Yi-Lightning",
    "gpt-3.5-turbo": "openai/gpt-3.5-turbo",
    "gpt-4o": "openai/gpt-4o",
    "gpt-4o-mini": "openai/gpt-4o-mini",
    "o1-mini": "openai/o1-mini",
    "claude-3.5-haiku": "anthropic/claude-3.5-haiku",
    "claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
}


def map_model_name(model_id):
    """Return the display name registered for ``model_id``.

    Identifiers that have no entry in ``model_name_map`` are returned
    unchanged, so unknown models still show up under their raw id.
    """
    return model_name_map.get(model_id, model_id)
49
+
50
# Helper that turns a model name into an HTML link.
def model_hyperlink(link, model_name):
    """Build an ``<a>`` element that opens ``link`` in a new tab.

    ``model_name`` is used as the visible link text. Both arguments are
    interpolated into the markup as-is (no HTML escaping is applied).
    """
    anchor = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    return anchor
53
+
54
def make_clickable_model(model_name):
    """Return ``model_name`` rendered as a link to ``https://huggingface.co/<model_name>``.

    The markup itself is produced by ``model_hyperlink``.
    """
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
57
+
58
# Model identifiers that are labelled "RL" in the "Model Type" column.
rl_models = ['deepseek-r1', 'o1-mini']


def map_model_type(model_name):
    """Return ``"RL"`` for identifiers listed in ``rl_models``, else ``"Pretrained"``."""
    return "RL" if model_name in rl_models else "Pretrained"
64
+
65
+
66
def prep_leaderboard_df():
    """Assemble the DataFrame displayed by the leaderboard.

    The frames returned by ``load_easy_data``, ``load_hard_data`` and
    ``load_average_data`` are concatenated column-wise (in that order).
    A "Model" column (index value mapped through ``map_model_name``) and a
    "Model Type" column (index value mapped through ``map_model_type``) are
    inserted at positions 0 and 1, and the result is rounded to two decimals.
    """
    average_scores = load_average_data()
    hard_scores = load_hard_data()
    easy_scores = load_easy_data()

    combined = pd.concat([easy_scores, hard_scores, average_scores], axis=1)

    # Prepend the two descriptive columns, both derived from the index labels.
    combined.insert(0, "Model", [map_model_name(model_id) for model_id in combined.index])
    combined.insert(1, "Model Type", [map_model_type(model_id) for model_id in combined.index])

    # Turning the "Model" column into hyperlinks is currently disabled:
    # combined['Model'] = combined['Model'].apply(make_clickable_model)
    return combined.round(2)
78
+
79
+ leaderboard_df = prep_leaderboard_df()
80
+
81
+
82
# Callback that rebuilds the leaderboard table from the search box and the
# column selector.
def filter_and_search(cols: list[str], search_query: str, agg: str = ""):
    """Return the leaderboard rows/columns matching the current UI state.

    Args:
        cols: Column names ticked in the column selector. When empty, every
            column is shown and the rows keep their original order.
        search_query: Free text from the search box. Several terms may be
            separated with ``;``; a row is kept when its "Model" value
            contains any term (case-insensitive, literal substring match).
        agg: Unused. It now has a default because the event bindings in this
            file pass only two inputs (``[cols_bar, search_bar]``); a required
            third parameter made every callback invocation fail.

    Returns:
        A new ``pandas.DataFrame``. When ``cols`` is non-empty it holds the
        first leaderboard column plus the selected columns, sorted in
        descending order by the selected numeric columns, with the selected
        columns converted to strings for display.
    """
    import re  # local import: only needed here, keeps the file's import block untouched

    df = leaderboard_df

    # Terms are matched literally: model names contain regex metacharacters
    # such as "." and a stray "(" must not raise. Empty terms (e.g. after a
    # trailing ";") are dropped because an empty alternative matches every row.
    raw_terms = (term.strip().lower() for term in (search_query or "").split(";"))
    terms = [re.escape(term) for term in raw_terms if term]
    if terms:
        matches = df["Model"].str.lower().str.contains("|".join(terms), regex=True, na=False)
        df = df[matches]

    # Drop columns that are entirely NaN. Skipped for an empty result, where
    # every column is vacuously "all NaN" and would be removed (including
    # "Model", which the column selection below relies on).
    if not df.empty:
        df = df.dropna(how="all", axis=1)

    if cols:
        # Ignore selected columns that are absent (e.g. just dropped above).
        selected = [col for col in cols if col in df.columns]
        index_cols = list(leaderboard_df.columns[:1])
        df = df[index_cols + selected].copy()

        if selected:
            df = df.dropna(how="all", axis=0, subset=selected)

            # Coerce only columns that actually hold numbers; a text column
            # such as "Model Type" would otherwise be wiped to NaN.
            sort_cols = []
            for col in selected:
                coerced = pd.to_numeric(df[col], errors="coerce")
                if coerced.notna().any() or df[col].isna().all():
                    df[col] = coerced
                    sort_cols.append(col)

            if sort_cols:
                df = df.sort_values(by=sort_cols, ascending=False, na_position="last")

            for col in selected:
                df[col] = df[col].astype(str)

    return df
105
 
106
 
107
  demo = gr.Blocks(css=custom_css)
108
+
109
  with demo:
110
  gr.HTML(TITLE)
111
+ with gr.Column():
112
+ gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
113
+ with gr.Row():
114
+ search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
115
+
116
+ cols_bar = gr.CheckboxGroup(
117
+ choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
118
+ show_label=False,
119
+ # info="Select columns to display",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  )
121
+ with gr.Group():
122
+ leaderboard_table = gr.Dataframe(
123
+ value=leaderboard_df,
124
+ wrap=True,
125
+ # column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
126
+ )
127
+
128
+ cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
129
+ search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
130
 
131
  with gr.Row():
132
+ with gr.Accordion("📚 Citation", open=False):
133
  citation_button = gr.Textbox(
134
+ value=r"""@article{lin2025generative,
135
+ title={Generative Evaluation of Complex Reasoning in Large Language Models},
136
+ author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao},
137
+ journal={arXiv preprint arXiv:2504.02810},
138
+ year={2025}
139
+ }""",
140
+ lines=7,
141
+ label="Copy the following to cite these results.",
142
  elem_id="citation-button",
143
  show_copy_button=True,
144
  )
145
 
146
+ demo.launch()
 
 
 
data/action_count_0402.json ADDED
@@ -0,0 +1,1819 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "RelicEnv": {
3
+ "qwen2.5-3b-instruct": 0.576,
4
+ "qwen2.5-7b-instruct": 0.5228666666666666,
5
+ "qwen2.5-14b-instruct": 0.3816,
6
+ "qwen2.5-32b-instruct": 0.4269333333333333,
7
+ "qwen2.5-72b-instruct": 0.3848666666666667,
8
+ "llama-3.1-8b-instruct": 0.6459999999999999,
9
+ "llama-3.1-70b-instruct": 0.41696190476190476,
10
+ "llama-3.2-3b-instruct": 0.5766095238095238,
11
+ "llama-3.3-70b-instruct": 0.33466666666666656,
12
+ "mistral-large-instruct-2411": 0.492,
13
+ "gemma-2-27b-it": 0.48513333333333336,
14
+ "gemma-2-9b-it": 0.668695238095238,
15
+ "deepseek-v3": 0.5289999999999999,
16
+ "deepseek-r1": 0.523295238095238,
17
+ "qwq-32b": 0.5080190476190476,
18
+ "Average": 0.49817650793650786
19
+ },
20
+ "HerbEnv": {
21
+ "qwen2.5-3b-instruct": 0.6345333333333334,
22
+ "qwen2.5-7b-instruct": 0.6564,
23
+ "qwen2.5-14b-instruct": 0.4304,
24
+ "qwen2.5-32b-instruct": 0.4564666666666667,
25
+ "qwen2.5-72b-instruct": 0.43260000000000004,
26
+ "llama-3.1-8b-instruct": 0.7072,
27
+ "llama-3.1-70b-instruct": 0.4986,
28
+ "llama-3.2-3b-instruct": 0.7223333333333333,
29
+ "llama-3.3-70b-instruct": 0.49833333333333335,
30
+ "mistral-large-instruct-2411": 0.5494,
31
+ "gemma-2-27b-it": 0.5511999999999999,
32
+ "gemma-2-9b-it": 0.7503333333333334,
33
+ "deepseek-v3": 0.42873333333333336,
34
+ "deepseek-r1": 0.5064666666666666,
35
+ "qwq-32b": 0.5062666666666666,
36
+ "Average": 0.5552844444444445
37
+ },
38
+ "TransdimensionalEnv": {
39
+ "qwen2.5-3b-instruct": 0.8419333333333332,
40
+ "qwen2.5-7b-instruct": 0.7645333333333333,
41
+ "qwen2.5-14b-instruct": 0.5994666666666667,
42
+ "qwen2.5-32b-instruct": 0.5703333333333334,
43
+ "qwen2.5-72b-instruct": 0.5725333333333333,
44
+ "llama-3.1-8b-instruct": 0.8210666666666666,
45
+ "llama-3.1-70b-instruct": 0.5205333333333333,
46
+ "llama-3.2-3b-instruct": 0.7700666666666665,
47
+ "llama-3.3-70b-instruct": 0.5580666666666667,
48
+ "mistral-large-instruct-2411": 0.6012666666666666,
49
+ "gemma-2-27b-it": 0.7089999999999999,
50
+ "gemma-2-9b-it": 0.9037333333333333,
51
+ "deepseek-v3": 0.6178,
52
+ "deepseek-r1": 0.5913999999999999,
53
+ "qwq-32b": 0.653,
54
+ "Average": 0.6729822222222223
55
+ },
56
+ "SorcererEnv": {
57
+ "qwen2.5-3b-instruct": 1.0192666666666665,
58
+ "qwen2.5-7b-instruct": 1.0209333333333332,
59
+ "qwen2.5-14b-instruct": 0.7593333333333334,
60
+ "qwen2.5-32b-instruct": 0.8110666666666667,
61
+ "qwen2.5-72b-instruct": 0.7878666666666667,
62
+ "llama-3.1-8b-instruct": 1.0720666666666667,
63
+ "llama-3.1-70b-instruct": 0.7602666666666668,
64
+ "llama-3.2-3b-instruct": 1.0208666666666666,
65
+ "llama-3.3-70b-instruct": 0.7425333333333334,
66
+ "mistral-large-instruct-2411": 0.8440666666666667,
67
+ "gemma-2-27b-it": 0.8615333333333333,
68
+ "gemma-2-9b-it": 1.1598666666666666,
69
+ "deepseek-v3": 0.8091333333333333,
70
+ "deepseek-r1": 0.8958,
71
+ "qwq-32b": 0.8321999999999999,
72
+ "Average": 0.89312
73
+ },
74
+ "QuantumEnv": {
75
+ "qwen2.5-3b-instruct": 1.0699999999999998,
76
+ "qwen2.5-7b-instruct": 0.8955333333333334,
77
+ "qwen2.5-14b-instruct": 0.7378666666666667,
78
+ "qwen2.5-32b-instruct": 0.7390666666666666,
79
+ "qwen2.5-72b-instruct": 0.649,
80
+ "llama-3.1-8b-instruct": 1.083,
81
+ "llama-3.1-70b-instruct": 0.7020666666666668,
82
+ "llama-3.2-3b-instruct": 1.0911333333333335,
83
+ "llama-3.3-70b-instruct": 0.6975333333333333,
84
+ "mistral-large-instruct-2411": 0.7345333333333333,
85
+ "gemma-2-27b-it": 0.7445333333333334,
86
+ "gemma-2-9b-it": 1.1341999999999999,
87
+ "deepseek-v3": 0.7477333333333334,
88
+ "deepseek-r1": 0.7415333333333334,
89
+ "qwq-32b": 0.7549333333333333,
90
+ "Average": 0.8348444444444444
91
+ },
92
+ "AstronomyEnv": {
93
+ "qwen2.5-3b-instruct": 0.8259333333333334,
94
+ "qwen2.5-7b-instruct": 0.8053333333333335,
95
+ "qwen2.5-14b-instruct": 0.4937333333333333,
96
+ "qwen2.5-32b-instruct": 0.5776666666666666,
97
+ "qwen2.5-72b-instruct": 0.4677999999999999,
98
+ "llama-3.1-8b-instruct": 0.836,
99
+ "llama-3.1-70b-instruct": 0.5228,
100
+ "llama-3.2-3b-instruct": 0.8694000000000001,
101
+ "llama-3.3-70b-instruct": 0.5525333333333332,
102
+ "mistral-large-instruct-2411": 0.4943999999999999,
103
+ "gemma-2-27b-it": 0.6376000000000001,
104
+ "gemma-2-9b-it": 0.7730666666666668,
105
+ "deepseek-v3": 0.5540666666666666,
106
+ "deepseek-r1": 0.4287333333333333,
107
+ "qwq-32b": 0.4580666666666667,
108
+ "Average": 0.6198088888888889
109
+ },
110
+ "MusicGenresEnv": {
111
+ "qwen2.5-3b-instruct": 0.6298666666666667,
112
+ "qwen2.5-7b-instruct": 0.5864666666666667,
113
+ "qwen2.5-14b-instruct": 0.3452,
114
+ "qwen2.5-32b-instruct": 0.37546666666666667,
115
+ "qwen2.5-72b-instruct": 0.398,
116
+ "llama-3.1-8b-instruct": 0.6799999999999999,
117
+ "llama-3.1-70b-instruct": 0.44333333333333336,
118
+ "llama-3.2-3b-instruct": 0.8452,
119
+ "llama-3.3-70b-instruct": 0.49539999999999995,
120
+ "mistral-large-instruct-2411": 0.3673333333333333,
121
+ "gemma-2-27b-it": 0.5542666666666667,
122
+ "gemma-2-9b-it": 0.6927333333333332,
123
+ "deepseek-v3": 0.3997333333333334,
124
+ "deepseek-r1": 0.3074,
125
+ "qwq-32b": 0.30833333333333335,
126
+ "Average": 0.49524888888888896
127
+ },
128
+ "CloudEnv": {
129
+ "qwen2.5-3b-instruct": 0.7101999999999999,
130
+ "qwen2.5-7b-instruct": 0.6398380952380952,
131
+ "qwen2.5-14b-instruct": 0.2948095238095238,
132
+ "qwen2.5-32b-instruct": 0.39837142857142854,
133
+ "qwen2.5-72b-instruct": 0.3368666666666667,
134
+ "llama-3.1-8b-instruct": 0.6846857142857142,
135
+ "llama-3.1-70b-instruct": 0.4453333333333333,
136
+ "llama-3.2-3b-instruct": 0.7733333333333332,
137
+ "llama-3.3-70b-instruct": 0.4490380952380952,
138
+ "mistral-large-instruct-2411": 0.23912380952380952,
139
+ "gemma-2-27b-it": 0.406047619047619,
140
+ "gemma-2-9b-it": 0.675342857142857,
141
+ "deepseek-v3": 0.3188952380952381,
142
+ "deepseek-r1": 0.16405714285714285,
143
+ "qwq-32b": 0.20542857142857143,
144
+ "Average": 0.44942476190476194
145
+ },
146
+ "CuisineEnv": {
147
+ "qwen2.5-3b-instruct": 1.0595999999999999,
148
+ "qwen2.5-7b-instruct": 1.085838095238095,
149
+ "qwen2.5-14b-instruct": 0.882352380952381,
150
+ "qwen2.5-32b-instruct": 0.9331333333333334,
151
+ "qwen2.5-72b-instruct": 0.873,
152
+ "llama-3.1-8b-instruct": 1.1925238095238095,
153
+ "llama-3.1-70b-instruct": 0.9360190476190476,
154
+ "llama-3.2-3b-instruct": 1.2040571428571427,
155
+ "llama-3.3-70b-instruct": 1.0072571428571429,
156
+ "mistral-large-instruct-2411": 0.9003428571428571,
157
+ "gemma-2-27b-it": 1.1492285714285715,
158
+ "gemma-2-9b-it": 1.2268285714285714,
159
+ "deepseek-v3": 0.8427809523809524,
160
+ "deepseek-r1": 0.8026761904761905,
161
+ "qwq-32b": 0.8055523809523809,
162
+ "Average": 0.9934126984126983
163
+ },
164
+ "PlantEnv": {
165
+ "qwen2.5-3b-instruct": 0.6316666666666666,
166
+ "qwen2.5-7b-instruct": 0.6612000000000001,
167
+ "qwen2.5-14b-instruct": 0.6797333333333333,
168
+ "qwen2.5-32b-instruct": 0.7276666666666667,
169
+ "qwen2.5-72b-instruct": 0.6846666666666665,
170
+ "llama-3.1-8b-instruct": 0.7318666666666667,
171
+ "llama-3.1-70b-instruct": 0.6868666666666666,
172
+ "llama-3.2-3b-instruct": 0.7314,
173
+ "llama-3.3-70b-instruct": 0.739,
174
+ "mistral-large-instruct-2411": 0.6728666666666665,
175
+ "gemma-2-27b-it": 0.7182666666666666,
176
+ "gemma-2-9b-it": 0.7471333333333334,
177
+ "deepseek-v3": 0.6415333333333332,
178
+ "deepseek-r1": 0.6114666666666666,
179
+ "qwq-32b": 0.6385333333333333,
180
+ "Average": 0.6869244444444446
181
+ },
182
+ "HistoricalEnv": {
183
+ "qwen2.5-3b-instruct": 0.5945333333333332,
184
+ "qwen2.5-7b-instruct": 0.5029999999999999,
185
+ "qwen2.5-14b-instruct": 0.41719999999999996,
186
+ "qwen2.5-32b-instruct": 0.49926666666666664,
187
+ "qwen2.5-72b-instruct": 0.4616666666666667,
188
+ "llama-3.1-8b-instruct": 0.6741999999999999,
189
+ "llama-3.1-70b-instruct": 0.43866666666666665,
190
+ "llama-3.2-3b-instruct": 0.6622666666666666,
191
+ "llama-3.3-70b-instruct": 0.44580000000000003,
192
+ "mistral-large-instruct-2411": 0.30566666666666664,
193
+ "gemma-2-27b-it": 0.43679999999999997,
194
+ "gemma-2-9b-it": 0.6955333333333333,
195
+ "deepseek-v3": 0.3064,
196
+ "deepseek-r1": 0.1416,
197
+ "qwq-32b": 0.19106666666666666,
198
+ "Average": 0.4515777777777778
199
+ },
200
+ "GadgetEnv": {
201
+ "qwen2.5-3b-instruct": 0.7405999999999999,
202
+ "qwen2.5-7b-instruct": 0.7083999999999999,
203
+ "qwen2.5-14b-instruct": 0.48,
204
+ "qwen2.5-32b-instruct": 0.5347999999999999,
205
+ "qwen2.5-72b-instruct": 0.48633333333333334,
206
+ "llama-3.1-8b-instruct": 0.7890666666666666,
207
+ "llama-3.1-70b-instruct": 0.4845999999999999,
208
+ "llama-3.2-3b-instruct": 0.7646,
209
+ "llama-3.3-70b-instruct": 0.5077999999999999,
210
+ "mistral-large-instruct-2411": 0.6042666666666665,
211
+ "gemma-2-27b-it": 0.6635333333333333,
212
+ "gemma-2-9b-it": 0.8321333333333332,
213
+ "deepseek-v3": 0.5766666666666667,
214
+ "deepseek-r1": 0.6070666666666666,
215
+ "qwq-32b": 0.6155333333333333,
216
+ "Average": 0.6263599999999999
217
+ },
218
+ "TimeTravelEnv": {
219
+ "qwen2.5-3b-instruct": 0.976,
220
+ "qwen2.5-7b-instruct": 0.8145999999999999,
221
+ "qwen2.5-14b-instruct": 0.6627333333333333,
222
+ "qwen2.5-32b-instruct": 0.6956666666666667,
223
+ "qwen2.5-72b-instruct": 0.6541333333333333,
224
+ "llama-3.1-8b-instruct": 0.8264666666666665,
225
+ "llama-3.1-70b-instruct": 0.6590666666666667,
226
+ "llama-3.2-3b-instruct": 0.8872666666666665,
227
+ "llama-3.3-70b-instruct": 0.7066000000000001,
228
+ "mistral-large-instruct-2411": 0.7033333333333334,
229
+ "gemma-2-27b-it": 0.8493333333333334,
230
+ "gemma-2-9b-it": 1.0604666666666667,
231
+ "deepseek-v3": 0.7296666666666667,
232
+ "deepseek-r1": 0.6510666666666667,
233
+ "qwq-32b": 0.6808666666666667,
234
+ "Average": 0.7704844444444444
235
+ },
236
+ "PollutionEnv": {
237
+ "qwen2.5-3b-instruct": 0.8957809523809523,
238
+ "qwen2.5-7b-instruct": 0.8026,
239
+ "qwen2.5-14b-instruct": 0.6021904761904762,
240
+ "qwen2.5-32b-instruct": 0.6871238095238095,
241
+ "qwen2.5-72b-instruct": 0.6281809523809524,
242
+ "llama-3.1-8b-instruct": 0.9049904761904761,
243
+ "llama-3.1-70b-instruct": 0.609295238095238,
244
+ "llama-3.2-3b-instruct": 0.9090761904761905,
245
+ "llama-3.3-70b-instruct": 0.615352380952381,
246
+ "mistral-large-instruct-2411": 0.595695238095238,
247
+ "gemma-2-27b-it": 0.7770761904761903,
248
+ "gemma-2-9b-it": 0.8730190476190476,
249
+ "deepseek-v3": 0.6199238095238095,
250
+ "deepseek-r1": 0.5457142857142857,
251
+ "qwq-32b": 0.5781333333333333,
252
+ "Average": 0.7096101587301588
253
+ },
254
+ "DemographicEnv": {
255
+ "qwen2.5-3b-instruct": 1.2349333333333334,
256
+ "qwen2.5-7b-instruct": 0.9282,
257
+ "qwen2.5-14b-instruct": 0.8947999999999998,
258
+ "qwen2.5-32b-instruct": 0.8493999999999999,
259
+ "qwen2.5-72b-instruct": 0.8458,
260
+ "llama-3.1-8b-instruct": 1.1641333333333332,
261
+ "llama-3.1-70b-instruct": 0.8899333333333332,
262
+ "llama-3.2-3b-instruct": 1.1756666666666669,
263
+ "llama-3.3-70b-instruct": 0.8181999999999998,
264
+ "mistral-large-instruct-2411": 0.8889333333333335,
265
+ "gemma-2-27b-it": 1.1206,
266
+ "gemma-2-9b-it": 1.2548,
267
+ "deepseek-v3": 0.937,
268
+ "deepseek-r1": 0.8470666666666669,
269
+ "qwq-32b": 0.8959333333333334,
270
+ "Average": 0.9830266666666666
271
+ },
272
+ "GeneticEnv": {
273
+ "qwen2.5-3b-instruct": 0.8742666666666669,
274
+ "qwen2.5-7b-instruct": 0.7093333333333331,
275
+ "qwen2.5-14b-instruct": 0.40293333333333337,
276
+ "qwen2.5-32b-instruct": 0.44313333333333327,
277
+ "qwen2.5-72b-instruct": 0.42733333333333323,
278
+ "llama-3.1-8b-instruct": 0.7788666666666665,
279
+ "llama-3.1-70b-instruct": 0.39159999999999995,
280
+ "llama-3.2-3b-instruct": 0.8340666666666667,
281
+ "llama-3.3-70b-instruct": 0.4035333333333334,
282
+ "mistral-large-instruct-2411": 0.4183333333333333,
283
+ "gemma-2-27b-it": 0.4676666666666667,
284
+ "gemma-2-9b-it": 0.8420000000000002,
285
+ "deepseek-v3": 0.39733333333333337,
286
+ "deepseek-r1": 0.3223333333333333,
287
+ "qwq-32b": 0.4328,
288
+ "Average": 0.5430355555555555
289
+ },
290
+ "CraftsmanEnv": {
291
+ "qwen2.5-3b-instruct": 0.8531238095238095,
292
+ "qwen2.5-7b-instruct": 0.8701333333333332,
293
+ "qwen2.5-14b-instruct": 0.636152380952381,
294
+ "qwen2.5-32b-instruct": 0.5899619047619048,
295
+ "qwen2.5-72b-instruct": 0.6157428571428571,
296
+ "llama-3.1-8b-instruct": 0.906847619047619,
297
+ "llama-3.1-70b-instruct": 0.6374285714285713,
298
+ "llama-3.2-3b-instruct": 0.9079333333333333,
299
+ "llama-3.3-70b-instruct": 0.7432857142857142,
300
+ "mistral-large-instruct-2411": 0.5945047619047619,
301
+ "gemma-2-27b-it": 0.8030285714285714,
302
+ "gemma-2-9b-it": 0.9558666666666668,
303
+ "deepseek-v3": 0.6411523809523809,
304
+ "deepseek-r1": 0.572504761904762,
305
+ "qwq-32b": 0.5707142857142857,
306
+ "Average": 0.7265587301587301
307
+ },
308
+ "StarConstellationEnv": {
309
+ "qwen2.5-3b-instruct": 0.9018,
310
+ "qwen2.5-7b-instruct": 0.8849999999999998,
311
+ "qwen2.5-14b-instruct": 0.6111333333333333,
312
+ "qwen2.5-32b-instruct": 0.6682,
313
+ "qwen2.5-72b-instruct": 0.6413333333333333,
314
+ "llama-3.1-8b-instruct": 0.8276666666666668,
315
+ "llama-3.1-70b-instruct": 0.6845333333333333,
316
+ "llama-3.2-3b-instruct": 0.8996666666666666,
317
+ "llama-3.3-70b-instruct": 0.7238666666666665,
318
+ "mistral-large-instruct-2411": 0.6063333333333333,
319
+ "gemma-2-27b-it": 0.6717333333333333,
320
+ "gemma-2-9b-it": 0.8695999999999999,
321
+ "deepseek-v3": 0.55,
322
+ "deepseek-r1": 0.4897333333333334,
323
+ "qwq-32b": 0.5618000000000001,
324
+ "Average": 0.7061600000000001
325
+ },
326
+ "MythicalCreatureEnv": {
327
+ "qwen2.5-3b-instruct": 0.9463333333333332,
328
+ "qwen2.5-7b-instruct": 1.0008666666666666,
329
+ "qwen2.5-14b-instruct": 0.8189999999999997,
330
+ "qwen2.5-32b-instruct": 0.7707333333333333,
331
+ "qwen2.5-72b-instruct": 0.8385333333333334,
332
+ "llama-3.1-8b-instruct": 1.0950666666666666,
333
+ "llama-3.1-70b-instruct": 0.7916,
334
+ "llama-3.2-3b-instruct": 1.1887333333333332,
335
+ "llama-3.3-70b-instruct": 0.7888666666666666,
336
+ "mistral-large-instruct-2411": 0.7903999999999999,
337
+ "gemma-2-27b-it": 0.9704666666666666,
338
+ "gemma-2-9b-it": 1.1304666666666665,
339
+ "deepseek-v3": 0.7574,
340
+ "deepseek-r1": 0.7734,
341
+ "qwq-32b": 0.7396,
342
+ "Average": 0.8934311111111112
343
+ },
344
+ "ArtStyleEnv": {
345
+ "qwen2.5-3b-instruct": 0.9593238095238095,
346
+ "qwen2.5-7b-instruct": 0.8611714285714285,
347
+ "qwen2.5-14b-instruct": 0.6572000000000001,
348
+ "qwen2.5-32b-instruct": 0.6888190476190477,
349
+ "qwen2.5-72b-instruct": 0.6664380952380953,
350
+ "llama-3.1-8b-instruct": 0.9826952380952381,
351
+ "llama-3.1-70b-instruct": 0.6773714285714286,
352
+ "llama-3.2-3b-instruct": 1.0108000000000001,
353
+ "llama-3.3-70b-instruct": 0.7458571428571428,
354
+ "mistral-large-instruct-2411": 0.627504761904762,
355
+ "gemma-2-27b-it": 0.8328380952380952,
356
+ "gemma-2-9b-it": 1.0002666666666666,
357
+ "deepseek-v3": 0.723047619047619,
358
+ "deepseek-r1": 0.666,
359
+ "qwq-32b": 0.6872952380952381,
360
+ "Average": 0.7857752380952383
361
+ },
362
+ "CookingEnv": {
363
+ "qwen2.5-3b-instruct": 0.9731333333333332,
364
+ "qwen2.5-7b-instruct": 0.8531999999999998,
365
+ "qwen2.5-14b-instruct": 0.6777333333333333,
366
+ "qwen2.5-32b-instruct": 0.6949333333333334,
367
+ "qwen2.5-72b-instruct": 0.6868666666666666,
368
+ "llama-3.1-8b-instruct": 0.9575333333333333,
369
+ "llama-3.1-70b-instruct": 0.6741333333333334,
370
+ "llama-3.2-3b-instruct": 0.9920666666666665,
371
+ "llama-3.3-70b-instruct": 0.7073999999999999,
372
+ "mistral-large-instruct-2411": 0.6852,
373
+ "gemma-2-27b-it": 0.8009999999999998,
374
+ "gemma-2-9b-it": 0.9400666666666666,
375
+ "deepseek-v3": 0.6910000000000001,
376
+ "deepseek-r1": 0.6202666666666665,
377
+ "qwq-32b": 0.5726666666666667,
378
+ "Average": 0.7684799999999999
379
+ },
380
+ "HistoricalBattleEnv": {
381
+ "qwen2.5-3b-instruct": 0.3906380952380952,
382
+ "qwen2.5-7b-instruct": 0.39269523809523804,
383
+ "qwen2.5-14b-instruct": 0.36508571428571424,
384
+ "qwen2.5-32b-instruct": 0.3839047619047619,
385
+ "qwen2.5-72b-instruct": 0.37189523809523806,
386
+ "llama-3.1-8b-instruct": 0.5100190476190476,
387
+ "llama-3.1-70b-instruct": 0.3623142857142857,
388
+ "llama-3.2-3b-instruct": 0.5539428571428571,
389
+ "llama-3.3-70b-instruct": 0.3407428571428571,
390
+ "mistral-large-instruct-2411": 0.26249523809523806,
391
+ "gemma-2-27b-it": 0.3749619047619047,
392
+ "gemma-2-9b-it": 0.4291904761904761,
393
+ "deepseek-v3": 0.2707428571428572,
394
+ "deepseek-r1": 0.12205714285714286,
395
+ "qwq-32b": 0.1069238095238095,
396
+ "Average": 0.34917396825396824
397
+ },
398
+ "FungalEnv": {
399
+ "qwen2.5-3b-instruct": 0.9867619047619047,
400
+ "qwen2.5-7b-instruct": 0.7690285714285714,
401
+ "qwen2.5-14b-instruct": 0.5497523809523809,
402
+ "qwen2.5-32b-instruct": 0.5654571428571428,
403
+ "qwen2.5-72b-instruct": 0.5338571428571429,
404
+ "llama-3.1-8b-instruct": 0.9299904761904761,
405
+ "llama-3.1-70b-instruct": 0.6940095238095237,
406
+ "llama-3.2-3b-instruct": 1.0453999999999999,
407
+ "llama-3.3-70b-instruct": 0.6804285714285714,
408
+ "mistral-large-instruct-2411": 0.5070380952380952,
409
+ "gemma-2-27b-it": 0.6260571428571428,
410
+ "gemma-2-9b-it": 1.0525142857142857,
411
+ "deepseek-v3": 0.4512380952380953,
412
+ "deepseek-r1": 0.41535238095238086,
413
+ "qwq-32b": 0.4612952380952381,
414
+ "Average": 0.6845453968253967
415
+ },
416
+ "CryptographyEnv": {
417
+ "qwen2.5-3b-instruct": 0.7157333333333333,
418
+ "qwen2.5-7b-instruct": 0.7917428571428571,
419
+ "qwen2.5-14b-instruct": 0.5772095238095238,
420
+ "qwen2.5-32b-instruct": 0.5362666666666666,
421
+ "qwen2.5-72b-instruct": 0.5816380952380952,
422
+ "llama-3.1-8b-instruct": 0.7762666666666667,
423
+ "llama-3.1-70b-instruct": 0.5880761904761905,
424
+ "llama-3.2-3b-instruct": 0.9222476190476192,
425
+ "llama-3.3-70b-instruct": 0.6200666666666668,
426
+ "mistral-large-instruct-2411": 0.43243809523809523,
427
+ "gemma-2-27b-it": 0.6965333333333332,
428
+ "gemma-2-9b-it": 0.9170952380952381,
429
+ "deepseek-v3": 0.4242190476190476,
430
+ "deepseek-r1": 0.31665714285714286,
431
+ "qwq-32b": 0.3307142857142857,
432
+ "Average": 0.6151269841269841
433
+ },
434
+ "StorageEnv": {
435
+ "qwen2.5-3b-instruct": 0.5999999999999999,
436
+ "qwen2.5-7b-instruct": 0.5174666666666666,
437
+ "qwen2.5-14b-instruct": 0.26799999999999996,
438
+ "qwen2.5-32b-instruct": 0.3171333333333333,
439
+ "qwen2.5-72b-instruct": 0.30706666666666665,
440
+ "llama-3.1-8b-instruct": 0.6547333333333333,
441
+ "llama-3.1-70b-instruct": 0.3390666666666667,
442
+ "llama-3.2-3b-instruct": 0.6575333333333333,
443
+ "llama-3.3-70b-instruct": 0.2899333333333334,
444
+ "mistral-large-instruct-2411": 0.28440000000000004,
445
+ "gemma-2-27b-it": 0.4133333333333333,
446
+ "gemma-2-9b-it": 0.5988666666666667,
447
+ "deepseek-v3": 0.34040000000000004,
448
+ "deepseek-r1": 0.3333333333333333,
449
+ "qwq-32b": 0.33946666666666664,
450
+ "Average": 0.41738222222222215
451
+ },
452
+ "RoverEnv": {
453
+ "qwen2.5-3b-instruct": 0.9546666666666667,
454
+ "qwen2.5-7b-instruct": 1.0193333333333334,
455
+ "qwen2.5-14b-instruct": 0.5934,
456
+ "qwen2.5-32b-instruct": 0.6414,
457
+ "qwen2.5-72b-instruct": 0.5923999999999999,
458
+ "llama-3.1-8b-instruct": 0.9858666666666667,
459
+ "llama-3.1-70b-instruct": 0.7111333333333333,
460
+ "llama-3.2-3b-instruct": 1.0410666666666666,
461
+ "llama-3.3-70b-instruct": 0.6332666666666666,
462
+ "mistral-large-instruct-2411": 0.7143999999999999,
463
+ "gemma-2-27b-it": 0.7877333333333333,
464
+ "gemma-2-9b-it": 1.0685333333333333,
465
+ "deepseek-v3": 0.7905333333333333,
466
+ "deepseek-r1": 0.7494,
467
+ "qwq-32b": 0.7479333333333333,
468
+ "Average": 0.8020711111111112
469
+ },
470
+ "FashionEnv": {
471
+ "qwen2.5-3b-instruct": 1.0357333333333334,
472
+ "qwen2.5-7b-instruct": 1.081152380952381,
473
+ "qwen2.5-14b-instruct": 0.7285238095238096,
474
+ "qwen2.5-32b-instruct": 0.7431523809523809,
475
+ "qwen2.5-72b-instruct": 0.7598666666666667,
476
+ "llama-3.1-8b-instruct": 1.0961333333333332,
477
+ "llama-3.1-70b-instruct": 0.8041333333333333,
478
+ "llama-3.2-3b-instruct": 1.1326571428571426,
479
+ "llama-3.3-70b-instruct": 0.8184666666666667,
480
+ "mistral-large-instruct-2411": 0.8105238095238094,
481
+ "gemma-2-27b-it": 0.9382190476190475,
482
+ "gemma-2-9b-it": 1.0972,
483
+ "deepseek-v3": 0.8063809523809524,
484
+ "deepseek-r1": 0.7738476190476191,
485
+ "qwq-32b": 0.8007333333333335,
486
+ "Average": 0.8951149206349207
487
+ },
488
+ "LicenseEnv": {
489
+ "qwen2.5-3b-instruct": 0.7847999999999999,
490
+ "qwen2.5-7b-instruct": 0.8215333333333333,
491
+ "qwen2.5-14b-instruct": 0.6174666666666666,
492
+ "qwen2.5-32b-instruct": 0.7098666666666666,
493
+ "qwen2.5-72b-instruct": 0.7198666666666667,
494
+ "llama-3.1-8b-instruct": 0.8523333333333334,
495
+ "llama-3.1-70b-instruct": 0.6513333333333332,
496
+ "llama-3.2-3b-instruct": 0.9648666666666668,
497
+ "llama-3.3-70b-instruct": 0.6662000000000001,
498
+ "mistral-large-instruct-2411": 0.6437333333333333,
499
+ "gemma-2-27b-it": 0.7512666666666666,
500
+ "gemma-2-9b-it": 0.8070666666666666,
501
+ "deepseek-v3": 0.6174666666666666,
502
+ "deepseek-r1": 0.5982666666666666,
503
+ "qwq-32b": 0.6115999999999999,
504
+ "Average": 0.7211777777777777
505
+ },
506
+ "VirusClassificationEnv": {
507
+ "qwen2.5-3b-instruct": 0.5887238095238095,
508
+ "qwen2.5-7b-instruct": 0.6255999999999999,
509
+ "qwen2.5-14b-instruct": 0.43513333333333326,
510
+ "qwen2.5-32b-instruct": 0.4164,
511
+ "qwen2.5-72b-instruct": 0.39893333333333336,
512
+ "llama-3.1-8b-instruct": 0.6247333333333334,
513
+ "llama-3.1-70b-instruct": 0.5219333333333334,
514
+ "llama-3.2-3b-instruct": 0.6386095238095237,
515
+ "llama-3.3-70b-instruct": 0.4547333333333333,
516
+ "mistral-large-instruct-2411": 0.3114571428571429,
517
+ "gemma-2-27b-it": 0.48719999999999997,
518
+ "gemma-2-9b-it": 0.6325809523809524,
519
+ "deepseek-v3": 0.30473333333333336,
520
+ "deepseek-r1": 0.2137142857142857,
521
+ "qwq-32b": 0.22217142857142855,
522
+ "Average": 0.45844380952380953
523
+ },
524
+ "TestingEnv": {
525
+ "qwen2.5-3b-instruct": 0.5297333333333333,
526
+ "qwen2.5-7b-instruct": 0.5164666666666667,
527
+ "qwen2.5-14b-instruct": 0.4224666666666666,
528
+ "qwen2.5-32b-instruct": 0.4540666666666667,
529
+ "qwen2.5-72b-instruct": 0.39493333333333325,
530
+ "llama-3.1-8b-instruct": 0.5270666666666667,
531
+ "llama-3.1-70b-instruct": 0.3365333333333333,
532
+ "llama-3.2-3b-instruct": 0.5638666666666666,
533
+ "llama-3.3-70b-instruct": 0.39473333333333327,
534
+ "mistral-large-instruct-2411": 0.3972,
535
+ "gemma-2-27b-it": 0.5658,
536
+ "gemma-2-9b-it": 0.6542,
537
+ "deepseek-v3": 0.37939999999999996,
538
+ "deepseek-r1": 0.25579999999999997,
539
+ "qwq-32b": 0.3352,
540
+ "Average": 0.44849777777777783
541
+ },
542
+ "NarrativeDetectEnv": {
543
+ "qwen2.5-3b-instruct": 1.0932666666666666,
544
+ "qwen2.5-7b-instruct": 0.9698666666666667,
545
+ "qwen2.5-14b-instruct": 0.8831333333333333,
546
+ "qwen2.5-32b-instruct": 0.7640666666666666,
547
+ "qwen2.5-72b-instruct": 0.8158000000000001,
548
+ "llama-3.1-8b-instruct": 1.0600666666666667,
549
+ "llama-3.1-70b-instruct": 0.8113999999999999,
550
+ "llama-3.2-3b-instruct": 1.2458666666666667,
551
+ "llama-3.3-70b-instruct": 0.8439333333333334,
552
+ "mistral-large-instruct-2411": 0.8011333333333333,
553
+ "gemma-2-27b-it": 1.0462666666666665,
554
+ "gemma-2-9b-it": 1.0814666666666668,
555
+ "deepseek-v3": 0.9039333333333334,
556
+ "deepseek-r1": 0.82,
557
+ "qwq-32b": 0.8263999999999999,
558
+ "Average": 0.9311066666666665
559
+ },
560
+ "RenewableEnergyEnv": {
561
+ "qwen2.5-3b-instruct": 1.2372,
562
+ "qwen2.5-7b-instruct": 1.209,
563
+ "qwen2.5-14b-instruct": 1.018,
564
+ "qwen2.5-32b-instruct": 0.8681999999999999,
565
+ "qwen2.5-72b-instruct": 0.8295333333333333,
566
+ "llama-3.1-8b-instruct": 1.3595333333333333,
567
+ "llama-3.1-70b-instruct": 0.8166666666666667,
568
+ "llama-3.2-3b-instruct": 1.2722666666666664,
569
+ "llama-3.3-70b-instruct": 0.9336000000000002,
570
+ "mistral-large-instruct-2411": 0.8966666666666667,
571
+ "gemma-2-27b-it": 1.2019333333333333,
572
+ "gemma-2-9b-it": 1.3341999999999998,
573
+ "deepseek-v3": 0.898,
574
+ "deepseek-r1": 0.8659333333333334,
575
+ "qwq-32b": 0.9053333333333334,
576
+ "Average": 1.043071111111111
577
+ },
578
+ "CelestialEnv": {
579
+ "qwen2.5-3b-instruct": 0.8438666666666667,
580
+ "qwen2.5-7b-instruct": 0.8887999999999998,
581
+ "qwen2.5-14b-instruct": 0.6514,
582
+ "qwen2.5-32b-instruct": 0.6459333333333334,
583
+ "qwen2.5-72b-instruct": 0.6326666666666666,
584
+ "llama-3.1-8b-instruct": 0.9612666666666666,
585
+ "llama-3.1-70b-instruct": 0.6192,
586
+ "llama-3.2-3b-instruct": 0.8965333333333334,
587
+ "llama-3.3-70b-instruct": 0.6164,
588
+ "mistral-large-instruct-2411": 0.6203333333333334,
589
+ "gemma-2-27b-it": 0.7378,
590
+ "gemma-2-9b-it": 0.9489333333333333,
591
+ "deepseek-v3": 0.6561999999999999,
592
+ "deepseek-r1": 0.5421333333333334,
593
+ "qwq-32b": 0.5784666666666667,
594
+ "Average": 0.7226622222222223
595
+ },
596
+ "SpiceEnv": {
597
+ "qwen2.5-3b-instruct": 0.6036476190476192,
598
+ "qwen2.5-7b-instruct": 0.6609809523809524,
599
+ "qwen2.5-14b-instruct": 0.40272380952380954,
600
+ "qwen2.5-32b-instruct": 0.5753809523809524,
601
+ "qwen2.5-72b-instruct": 0.46193333333333336,
602
+ "llama-3.1-8b-instruct": 0.6512666666666667,
603
+ "llama-3.1-70b-instruct": 0.5352666666666666,
604
+ "llama-3.2-3b-instruct": 0.6274,
605
+ "llama-3.3-70b-instruct": 0.594,
606
+ "mistral-large-instruct-2411": 0.45503809523809513,
607
+ "gemma-2-27b-it": 0.6168666666666667,
608
+ "gemma-2-9b-it": 0.7341333333333334,
609
+ "deepseek-v3": 0.31380952380952376,
610
+ "deepseek-r1": 0.36774285714285704,
611
+ "qwq-32b": 0.37498095238095236,
612
+ "Average": 0.5316780952380953
613
+ },
614
+ "WildlifeEnv": {
615
+ "qwen2.5-3b-instruct": 0.7888,
616
+ "qwen2.5-7b-instruct": 0.7621333333333333,
617
+ "qwen2.5-14b-instruct": 0.6147999999999999,
618
+ "qwen2.5-32b-instruct": 0.7297333333333332,
619
+ "qwen2.5-72b-instruct": 0.6115999999999999,
620
+ "llama-3.1-8b-instruct": 0.8686666666666666,
621
+ "llama-3.1-70b-instruct": 0.6302,
622
+ "llama-3.2-3b-instruct": 0.7972666666666667,
623
+ "llama-3.3-70b-instruct": 0.6359999999999999,
624
+ "mistral-large-instruct-2411": 0.6615333333333333,
625
+ "gemma-2-27b-it": 0.7294666666666667,
626
+ "gemma-2-9b-it": 0.8138666666666665,
627
+ "deepseek-v3": 0.6534000000000001,
628
+ "deepseek-r1": 0.7072666666666667,
629
+ "qwq-32b": 0.7271333333333333,
630
+ "Average": 0.7154577777777777
631
+ },
632
+ "VehicleEnv": {
633
+ "qwen2.5-3b-instruct": 0.9630666666666666,
634
+ "qwen2.5-7b-instruct": 0.9056000000000001,
635
+ "qwen2.5-14b-instruct": 0.7067333333333332,
636
+ "qwen2.5-32b-instruct": 0.5080666666666666,
637
+ "qwen2.5-72b-instruct": 0.4952666666666666,
638
+ "llama-3.1-8b-instruct": 1.0144666666666668,
639
+ "llama-3.1-70b-instruct": 0.6701333333333334,
640
+ "llama-3.2-3b-instruct": 1.0123333333333333,
641
+ "llama-3.3-70b-instruct": 0.6127333333333332,
642
+ "mistral-large-instruct-2411": 0.37593333333333334,
643
+ "gemma-2-27b-it": 0.7358666666666668,
644
+ "gemma-2-9b-it": 1.0004666666666666,
645
+ "deepseek-v3": 0.38853333333333334,
646
+ "deepseek-r1": 0.30946666666666667,
647
+ "qwq-32b": 0.3487333333333333,
648
+ "Average": 0.6698266666666667
649
+ },
650
+ "BeverageEnv": {
651
+ "qwen2.5-3b-instruct": 1.1309999999999998,
652
+ "qwen2.5-7b-instruct": 1.0566666666666666,
653
+ "qwen2.5-14b-instruct": 0.7231333333333334,
654
+ "qwen2.5-32b-instruct": 0.8653333333333334,
655
+ "qwen2.5-72b-instruct": 0.8098666666666666,
656
+ "llama-3.1-8b-instruct": 1.0646666666666664,
657
+ "llama-3.1-70b-instruct": 0.7819333333333334,
658
+ "llama-3.2-3b-instruct": 1.0867999999999998,
659
+ "llama-3.3-70b-instruct": 0.8621333333333332,
660
+ "mistral-large-instruct-2411": 0.8074666666666666,
661
+ "gemma-2-27b-it": 0.9830666666666668,
662
+ "gemma-2-9b-it": 1.0837999999999999,
663
+ "deepseek-v3": 0.7968,
664
+ "deepseek-r1": 0.7807333333333334,
665
+ "qwq-32b": 0.7677333333333334,
666
+ "Average": 0.9067422222222222
667
+ },
668
+ "ControlEnv": {
669
+ "qwen2.5-3b-instruct": 1.2409333333333332,
670
+ "qwen2.5-7b-instruct": 1.1129999999999998,
671
+ "qwen2.5-14b-instruct": 0.9396000000000001,
672
+ "qwen2.5-32b-instruct": 0.9234,
673
+ "qwen2.5-72b-instruct": 0.8239333333333333,
674
+ "llama-3.1-8b-instruct": 1.1922666666666668,
675
+ "llama-3.1-70b-instruct": 0.8639999999999999,
676
+ "llama-3.2-3b-instruct": 1.1910666666666665,
677
+ "llama-3.3-70b-instruct": 0.9382666666666666,
678
+ "mistral-large-instruct-2411": 0.9432666666666666,
679
+ "gemma-2-27b-it": 1.0350666666666668,
680
+ "gemma-2-9b-it": 1.3362,
681
+ "deepseek-v3": 0.8779333333333333,
682
+ "deepseek-r1": 0.8366666666666667,
683
+ "qwq-32b": 0.8624666666666666,
684
+ "Average": 1.007871111111111
685
+ },
686
+ "CurrencyEnv": {
687
+ "qwen2.5-3b-instruct": 1.1619999999999997,
688
+ "qwen2.5-7b-instruct": 1.1104666666666667,
689
+ "qwen2.5-14b-instruct": 1.0066666666666666,
690
+ "qwen2.5-32b-instruct": 0.9369333333333332,
691
+ "qwen2.5-72b-instruct": 0.9753999999999999,
692
+ "llama-3.1-8b-instruct": 1.2676666666666667,
693
+ "llama-3.1-70b-instruct": 0.8947999999999998,
694
+ "llama-3.2-3b-instruct": 1.261,
695
+ "llama-3.3-70b-instruct": 0.9674666666666665,
696
+ "mistral-large-instruct-2411": 0.8869999999999998,
697
+ "gemma-2-27b-it": 1.0257333333333334,
698
+ "gemma-2-9b-it": 1.3512,
699
+ "deepseek-v3": 0.9630666666666666,
700
+ "deepseek-r1": 0.8782666666666665,
701
+ "qwq-32b": 0.9007999999999999,
702
+ "Average": 1.0392311111111112
703
+ },
704
+ "MarketingEnv": {
705
+ "qwen2.5-3b-instruct": 0.7427333333333332,
706
+ "qwen2.5-7b-instruct": 0.6565333333333332,
707
+ "qwen2.5-14b-instruct": 0.6416000000000001,
708
+ "qwen2.5-32b-instruct": 0.5615333333333333,
709
+ "qwen2.5-72b-instruct": 0.5475333333333334,
710
+ "llama-3.1-8b-instruct": 0.7419999999999999,
711
+ "llama-3.1-70b-instruct": 0.5211333333333333,
712
+ "llama-3.2-3b-instruct": 0.7737333333333333,
713
+ "llama-3.3-70b-instruct": 0.5229999999999999,
714
+ "mistral-large-instruct-2411": 0.5469999999999999,
715
+ "gemma-2-27b-it": 0.7222000000000001,
716
+ "gemma-2-9b-it": 0.8039333333333334,
717
+ "deepseek-v3": 0.6286666666666666,
718
+ "deepseek-r1": 0.5670666666666666,
719
+ "qwq-32b": 0.5600666666666666,
720
+ "Average": 0.6359155555555555
721
+ },
722
+ "BotanicalEnv": {
723
+ "qwen2.5-3b-instruct": 1.3478666666666668,
724
+ "qwen2.5-7b-instruct": 1.3568,
725
+ "qwen2.5-14b-instruct": 0.7390000000000001,
726
+ "qwen2.5-32b-instruct": 0.9401333333333334,
727
+ "qwen2.5-72b-instruct": 0.8344666666666665,
728
+ "llama-3.1-8b-instruct": 1.3095999999999999,
729
+ "llama-3.1-70b-instruct": 0.8597999999999999,
730
+ "llama-3.2-3b-instruct": 1.2815999999999999,
731
+ "llama-3.3-70b-instruct": 0.9140666666666665,
732
+ "mistral-large-instruct-2411": 0.8783333333333333,
733
+ "gemma-2-27b-it": 1.0331333333333332,
734
+ "gemma-2-9b-it": 1.4613999999999998,
735
+ "deepseek-v3": 0.8467333333333332,
736
+ "deepseek-r1": 0.6417333333333334,
737
+ "qwq-32b": 0.6957333333333333,
738
+ "Average": 1.00936
739
+ },
740
+ "CircusActEnv": {
741
+ "qwen2.5-3b-instruct": 0.966,
742
+ "qwen2.5-7b-instruct": 0.8732666666666665,
743
+ "qwen2.5-14b-instruct": 0.7160666666666667,
744
+ "qwen2.5-32b-instruct": 0.6424000000000001,
745
+ "qwen2.5-72b-instruct": 0.6185333333333333,
746
+ "llama-3.1-8b-instruct": 1.0588000000000002,
747
+ "llama-3.1-70b-instruct": 0.6578,
748
+ "llama-3.2-3b-instruct": 1.0192666666666663,
749
+ "llama-3.3-70b-instruct": 0.6707333333333333,
750
+ "mistral-large-instruct-2411": 0.5906,
751
+ "gemma-2-27b-it": 0.8663333333333334,
752
+ "gemma-2-9b-it": 0.9901333333333333,
753
+ "deepseek-v3": 0.6095333333333334,
754
+ "deepseek-r1": 0.5542666666666667,
755
+ "qwq-32b": 0.612,
756
+ "Average": 0.7630488888888889
757
+ },
758
+ "AudioDialectEnv": {
759
+ "qwen2.5-3b-instruct": 1.1560666666666666,
760
+ "qwen2.5-7b-instruct": 1.1981333333333333,
761
+ "qwen2.5-14b-instruct": 0.9919333333333332,
762
+ "qwen2.5-32b-instruct": 0.9843999999999999,
763
+ "qwen2.5-72b-instruct": 1.0026,
764
+ "llama-3.1-8b-instruct": 1.1826,
765
+ "llama-3.1-70b-instruct": 1.049,
766
+ "llama-3.2-3b-instruct": 1.2086666666666668,
767
+ "llama-3.3-70b-instruct": 1.1178666666666666,
768
+ "mistral-large-instruct-2411": 0.9938666666666667,
769
+ "gemma-2-27b-it": 1.1272666666666669,
770
+ "gemma-2-9b-it": 1.255666666666667,
771
+ "deepseek-v3": 0.9454666666666667,
772
+ "deepseek-r1": 0.8542666666666667,
773
+ "qwq-32b": 0.9452,
774
+ "Average": 1.0675333333333334
775
+ },
776
+ "LeadershipEnv": {
777
+ "qwen2.5-3b-instruct": 1.1378,
778
+ "qwen2.5-7b-instruct": 1.1529333333333334,
779
+ "qwen2.5-14b-instruct": 0.9892285714285715,
780
+ "qwen2.5-32b-instruct": 1.036095238095238,
781
+ "qwen2.5-72b-instruct": 0.9751333333333333,
782
+ "llama-3.1-8b-instruct": 1.2512571428571426,
783
+ "llama-3.1-70b-instruct": 0.9471428571428572,
784
+ "llama-3.2-3b-instruct": 1.450133333333333,
785
+ "llama-3.3-70b-instruct": 1.0285999999999997,
786
+ "mistral-large-instruct-2411": 0.9162380952380952,
787
+ "gemma-2-27b-it": 1.2164380952380953,
788
+ "gemma-2-9b-it": 1.3166761904761903,
789
+ "deepseek-v3": 0.9108571428571428,
790
+ "deepseek-r1": 0.9028190476190476,
791
+ "qwq-32b": 0.8550952380952381,
792
+ "Average": 1.0724298412698412
793
+ },
794
+ "TransportEnv": {
795
+ "qwen2.5-3b-instruct": 0.7256761904761905,
796
+ "qwen2.5-7b-instruct": 0.6674,
797
+ "qwen2.5-14b-instruct": 0.425752380952381,
798
+ "qwen2.5-32b-instruct": 0.5705047619047618,
799
+ "qwen2.5-72b-instruct": 0.4020666666666667,
800
+ "llama-3.1-8b-instruct": 0.7760380952380952,
801
+ "llama-3.1-70b-instruct": 0.5022666666666666,
802
+ "llama-3.2-3b-instruct": 0.8045333333333333,
803
+ "llama-3.3-70b-instruct": 0.5512285714285714,
804
+ "mistral-large-instruct-2411": 0.4226571428571429,
805
+ "gemma-2-27b-it": 0.6612190476190476,
806
+ "gemma-2-9b-it": 0.7567999999999999,
807
+ "deepseek-v3": 0.4245333333333333,
808
+ "deepseek-r1": 0.35583809523809523,
809
+ "qwq-32b": 0.42556190476190475,
810
+ "Average": 0.5648050793650794
811
+ },
812
+ "EcologicalEnv": {
813
+ "qwen2.5-3b-instruct": 0.5565333333333333,
814
+ "qwen2.5-7b-instruct": 0.3861333333333333,
815
+ "qwen2.5-14b-instruct": 0.23986666666666662,
816
+ "qwen2.5-32b-instruct": 0.2175333333333333,
817
+ "qwen2.5-72b-instruct": 0.2650666666666667,
818
+ "llama-3.1-8b-instruct": 0.5481333333333334,
819
+ "llama-3.1-70b-instruct": 0.27026666666666666,
820
+ "llama-3.2-3b-instruct": 0.5608666666666666,
821
+ "llama-3.3-70b-instruct": 0.34073333333333333,
822
+ "mistral-large-instruct-2411": 0.18666666666666668,
823
+ "gemma-2-27b-it": 0.3159333333333333,
824
+ "gemma-2-9b-it": 0.49386666666666673,
825
+ "deepseek-v3": 0.25439999999999996,
826
+ "deepseek-r1": 0.13513333333333333,
827
+ "qwq-32b": 0.1812,
828
+ "Average": 0.3301555555555555
829
+ },
830
+ "MythicEnv": {
831
+ "qwen2.5-3b-instruct": 1.1101999999999999,
832
+ "qwen2.5-7b-instruct": 0.9876000000000001,
833
+ "qwen2.5-14b-instruct": 0.7183999999999999,
834
+ "qwen2.5-32b-instruct": 0.8451333333333334,
835
+ "qwen2.5-72b-instruct": 0.7776666666666666,
836
+ "llama-3.1-8b-instruct": 1.1285333333333334,
837
+ "llama-3.1-70b-instruct": 0.8145999999999999,
838
+ "llama-3.2-3b-instruct": 1.2702,
839
+ "llama-3.3-70b-instruct": 0.8547333333333332,
840
+ "mistral-large-instruct-2411": 0.7791333333333332,
841
+ "gemma-2-27b-it": 0.9578666666666666,
842
+ "gemma-2-9b-it": 1.1880000000000002,
843
+ "deepseek-v3": 0.7562,
844
+ "deepseek-r1": 0.628,
845
+ "qwq-32b": 0.7259333333333334,
846
+ "Average": 0.9028133333333334
847
+ },
848
+ "EnzymeEnv": {
849
+ "qwen2.5-3b-instruct": 0.5272666666666666,
850
+ "qwen2.5-7b-instruct": 0.5749999999999998,
851
+ "qwen2.5-14b-instruct": 0.45233333333333325,
852
+ "qwen2.5-32b-instruct": 0.44746666666666657,
853
+ "qwen2.5-72b-instruct": 0.4640666666666666,
854
+ "llama-3.1-8b-instruct": 0.6982666666666667,
855
+ "llama-3.1-70b-instruct": 0.4665333333333333,
856
+ "llama-3.2-3b-instruct": 0.7106666666666666,
857
+ "llama-3.3-70b-instruct": 0.4798,
858
+ "mistral-large-instruct-2411": 0.425,
859
+ "gemma-2-27b-it": 0.5391999999999999,
860
+ "gemma-2-9b-it": 0.6941333333333333,
861
+ "deepseek-v3": 0.37926666666666664,
862
+ "deepseek-r1": 0.38086666666666663,
863
+ "qwq-32b": 0.4045333333333333,
864
+ "Average": 0.5096266666666667
865
+ },
866
+ "OSKernelEnv": {
867
+ "qwen2.5-3b-instruct": 1.1656,
868
+ "qwen2.5-7b-instruct": 1.3032,
869
+ "qwen2.5-14b-instruct": 0.8570666666666666,
870
+ "qwen2.5-32b-instruct": 0.8768666666666667,
871
+ "qwen2.5-72b-instruct": 0.8728,
872
+ "llama-3.1-8b-instruct": 1.184,
873
+ "llama-3.1-70b-instruct": 0.8322666666666667,
874
+ "llama-3.2-3b-instruct": 1.3510000000000002,
875
+ "llama-3.3-70b-instruct": 0.8083333333333333,
876
+ "mistral-large-instruct-2411": 0.8513333333333332,
877
+ "gemma-2-27b-it": 1.0258666666666667,
878
+ "gemma-2-9b-it": 1.3057333333333332,
879
+ "deepseek-v3": 0.8445333333333332,
880
+ "deepseek-r1": 0.7511333333333332,
881
+ "qwq-32b": 0.8475333333333334,
882
+ "Average": 0.9918177777777777
883
+ },
884
+ "MineralClassificationEnv": {
885
+ "qwen2.5-3b-instruct": 1.0583333333333333,
886
+ "qwen2.5-7b-instruct": 1.0704666666666667,
887
+ "qwen2.5-14b-instruct": 0.7609999999999999,
888
+ "qwen2.5-32b-instruct": 0.7188666666666668,
889
+ "qwen2.5-72b-instruct": 0.6955333333333333,
890
+ "llama-3.1-8b-instruct": 1.1152000000000002,
891
+ "llama-3.1-70b-instruct": 0.6619333333333334,
892
+ "llama-3.2-3b-instruct": 1.1165333333333334,
893
+ "llama-3.3-70b-instruct": 0.6672666666666666,
894
+ "mistral-large-instruct-2411": 0.7607333333333333,
895
+ "gemma-2-27b-it": 0.9294,
896
+ "gemma-2-9b-it": 1.2250666666666667,
897
+ "deepseek-v3": 0.8092,
898
+ "deepseek-r1": 0.7899999999999999,
899
+ "qwq-32b": 0.7879333333333334,
900
+ "Average": 0.8778311111111109
901
+ },
902
+ "EconomicEnv": {
903
+ "qwen2.5-3b-instruct": 1.1286,
904
+ "qwen2.5-7b-instruct": 1.16,
905
+ "qwen2.5-14b-instruct": 0.8704666666666666,
906
+ "qwen2.5-32b-instruct": 0.8274666666666667,
907
+ "qwen2.5-72b-instruct": 0.7895333333333333,
908
+ "llama-3.1-8b-instruct": 1.1526666666666667,
909
+ "llama-3.1-70b-instruct": 0.7768,
910
+ "llama-3.2-3b-instruct": 1.1796666666666666,
911
+ "llama-3.3-70b-instruct": 0.8427333333333333,
912
+ "mistral-large-instruct-2411": 0.8880666666666667,
913
+ "gemma-2-27b-it": 1.0978666666666665,
914
+ "gemma-2-9b-it": 1.4352000000000003,
915
+ "deepseek-v3": 0.9310666666666668,
916
+ "deepseek-r1": 0.8318000000000001,
917
+ "qwq-32b": 0.8415333333333332,
918
+ "Average": 0.9835644444444444
919
+ },
920
+ "DetectiveEnv": {
921
+ "qwen2.5-3b-instruct": 0.9592666666666666,
922
+ "qwen2.5-7b-instruct": 0.8579333333333332,
923
+ "qwen2.5-14b-instruct": 0.5528666666666666,
924
+ "qwen2.5-32b-instruct": 0.6906666666666667,
925
+ "qwen2.5-72b-instruct": 0.6596,
926
+ "llama-3.1-8b-instruct": 1.0208666666666666,
927
+ "llama-3.1-70b-instruct": 0.7068,
928
+ "llama-3.2-3b-instruct": 1.0127333333333335,
929
+ "llama-3.3-70b-instruct": 0.7222666666666665,
930
+ "mistral-large-instruct-2411": 0.6398666666666667,
931
+ "gemma-2-27b-it": 1.0352000000000001,
932
+ "gemma-2-9b-it": 1.2517999999999998,
933
+ "deepseek-v3": 0.6811999999999999,
934
+ "deepseek-r1": 0.6839333333333333,
935
+ "qwq-32b": 0.6996,
936
+ "Average": 0.81164
937
+ },
938
+ "ChessEnv": {
939
+ "qwen2.5-3b-instruct": 0.9558666666666668,
940
+ "qwen2.5-7b-instruct": 1.0245333333333335,
941
+ "qwen2.5-14b-instruct": 0.8131333333333333,
942
+ "qwen2.5-32b-instruct": 0.8892,
943
+ "qwen2.5-72b-instruct": 0.8331999999999999,
944
+ "llama-3.1-8b-instruct": 1.1225333333333334,
945
+ "llama-3.1-70b-instruct": 0.7468666666666667,
946
+ "llama-3.2-3b-instruct": 1.1218,
947
+ "llama-3.3-70b-instruct": 0.8098666666666666,
948
+ "mistral-large-instruct-2411": 0.7781333333333335,
949
+ "gemma-2-27b-it": 1.01,
950
+ "gemma-2-9b-it": 1.2222666666666666,
951
+ "deepseek-v3": 0.7901333333333332,
952
+ "deepseek-r1": 0.7748,
953
+ "qwq-32b": 0.8231333333333334,
954
+ "Average": 0.9143644444444444
955
+ },
956
+ "MythicalEnv": {
957
+ "qwen2.5-3b-instruct": 0.9932000000000001,
958
+ "qwen2.5-7b-instruct": 1.0899333333333334,
959
+ "qwen2.5-14b-instruct": 0.7302,
960
+ "qwen2.5-32b-instruct": 0.8645999999999999,
961
+ "qwen2.5-72b-instruct": 0.8496666666666666,
962
+ "llama-3.1-8b-instruct": 1.0838,
963
+ "llama-3.1-70b-instruct": 0.8592000000000001,
964
+ "llama-3.2-3b-instruct": 1.0404666666666667,
965
+ "llama-3.3-70b-instruct": 0.8421333333333333,
966
+ "mistral-large-instruct-2411": 0.8762666666666666,
967
+ "gemma-2-27b-it": 0.8621333333333334,
968
+ "gemma-2-9b-it": 1.0032,
969
+ "deepseek-v3": 0.885,
970
+ "deepseek-r1": 0.7668000000000001,
971
+ "qwq-32b": 0.8260000000000002,
972
+ "Average": 0.90484
973
+ },
974
+ "ChemicalCompoundsEnv": {
975
+ "qwen2.5-3b-instruct": 0.8311047619047619,
976
+ "qwen2.5-7b-instruct": 0.8545238095238095,
977
+ "qwen2.5-14b-instruct": 0.6625238095238095,
978
+ "qwen2.5-32b-instruct": 0.7971619047619047,
979
+ "qwen2.5-72b-instruct": 0.7787238095238094,
980
+ "llama-3.1-8b-instruct": 0.9210666666666665,
981
+ "llama-3.1-70b-instruct": 0.8095619047619046,
982
+ "llama-3.2-3b-instruct": 0.870647619047619,
983
+ "llama-3.3-70b-instruct": 0.8861619047619047,
984
+ "mistral-large-instruct-2411": 0.665295238095238,
985
+ "gemma-2-27b-it": 0.8178285714285713,
986
+ "gemma-2-9b-it": 0.9217333333333333,
987
+ "deepseek-v3": 0.6765047619047617,
988
+ "deepseek-r1": 0.4232571428571429,
989
+ "qwq-32b": 0.41698095238095234,
990
+ "Average": 0.7555384126984125
991
+ },
992
+ "ArchitecturalEnv": {
993
+ "qwen2.5-3b-instruct": 0.8535333333333333,
994
+ "qwen2.5-7b-instruct": 0.9366666666666668,
995
+ "qwen2.5-14b-instruct": 0.6431333333333333,
996
+ "qwen2.5-32b-instruct": 0.7234666666666666,
997
+ "qwen2.5-72b-instruct": 0.6861333333333335,
998
+ "llama-3.1-8b-instruct": 0.9704666666666666,
999
+ "llama-3.1-70b-instruct": 0.769,
1000
+ "llama-3.2-3b-instruct": 1.0212666666666665,
1001
+ "llama-3.3-70b-instruct": 0.8717333333333332,
1002
+ "mistral-large-instruct-2411": 0.6912,
1003
+ "gemma-2-27b-it": 0.8425333333333332,
1004
+ "gemma-2-9b-it": 1.0264,
1005
+ "deepseek-v3": 0.6093333333333333,
1006
+ "deepseek-r1": 0.6285333333333333,
1007
+ "qwq-32b": 0.6964,
1008
+ "Average": 0.7979866666666668
1009
+ },
1010
+ "ComputationEnv": {
1011
+ "qwen2.5-3b-instruct": 0.9820666666666668,
1012
+ "qwen2.5-7b-instruct": 0.9801333333333334,
1013
+ "qwen2.5-14b-instruct": 0.7334666666666666,
1014
+ "qwen2.5-32b-instruct": 0.6850666666666666,
1015
+ "qwen2.5-72b-instruct": 0.6938000000000001,
1016
+ "llama-3.1-8b-instruct": 1.0237333333333332,
1017
+ "llama-3.1-70b-instruct": 0.6686,
1018
+ "llama-3.2-3b-instruct": 1.0106666666666666,
1019
+ "llama-3.3-70b-instruct": 0.6860666666666666,
1020
+ "mistral-large-instruct-2411": 0.7517333333333334,
1021
+ "gemma-2-27b-it": 0.8299999999999998,
1022
+ "gemma-2-9b-it": 1.0842666666666667,
1023
+ "deepseek-v3": 0.7249999999999999,
1024
+ "deepseek-r1": 0.7374666666666667,
1025
+ "qwq-32b": 0.7521999999999999,
1026
+ "Average": 0.8229511111111111
1027
+ },
1028
+ "MachinePartEnv": {
1029
+ "qwen2.5-3b-instruct": 0.9062666666666666,
1030
+ "qwen2.5-7b-instruct": 0.8395333333333334,
1031
+ "qwen2.5-14b-instruct": 0.6237999999999999,
1032
+ "qwen2.5-32b-instruct": 0.6204666666666667,
1033
+ "qwen2.5-72b-instruct": 0.6410666666666667,
1034
+ "llama-3.1-8b-instruct": 0.8998666666666665,
1035
+ "llama-3.1-70b-instruct": 0.5422666666666667,
1036
+ "llama-3.2-3b-instruct": 0.9272,
1037
+ "llama-3.3-70b-instruct": 0.5463333333333333,
1038
+ "mistral-large-instruct-2411": 0.5808,
1039
+ "gemma-2-27b-it": 0.6800666666666666,
1040
+ "gemma-2-9b-it": 0.9401999999999997,
1041
+ "deepseek-v3": 0.6576,
1042
+ "deepseek-r1": 0.6320666666666666,
1043
+ "qwq-32b": 0.5902,
1044
+ "Average": 0.7085155555555555
1045
+ },
1046
+ "LiteraryEnv": {
1047
+ "qwen2.5-3b-instruct": 0.7915619047619047,
1048
+ "qwen2.5-7b-instruct": 0.684247619047619,
1049
+ "qwen2.5-14b-instruct": 0.4320857142857143,
1050
+ "qwen2.5-32b-instruct": 0.47454285714285704,
1051
+ "qwen2.5-72b-instruct": 0.45714285714285713,
1052
+ "llama-3.1-8b-instruct": 0.7595047619047619,
1053
+ "llama-3.1-70b-instruct": 0.5143047619047618,
1054
+ "llama-3.2-3b-instruct": 0.8008476190476189,
1055
+ "llama-3.3-70b-instruct": 0.5506952380952381,
1056
+ "mistral-large-instruct-2411": 0.5536571428571427,
1057
+ "gemma-2-27b-it": 0.6854666666666664,
1058
+ "gemma-2-9b-it": 0.8457333333333332,
1059
+ "deepseek-v3": 0.5392285714285714,
1060
+ "deepseek-r1": 0.5025047619047619,
1061
+ "qwq-32b": 0.4939333333333332,
1062
+ "Average": 0.6056971428571427
1063
+ },
1064
+ "MarineEnv": {
1065
+ "qwen2.5-3b-instruct": 1.0838666666666668,
1066
+ "qwen2.5-7b-instruct": 1.0142666666666666,
1067
+ "qwen2.5-14b-instruct": 0.7625333333333334,
1068
+ "qwen2.5-32b-instruct": 0.7773333333333333,
1069
+ "qwen2.5-72b-instruct": 0.8016666666666667,
1070
+ "llama-3.1-8b-instruct": 1.1154666666666668,
1071
+ "llama-3.1-70b-instruct": 0.8399333333333333,
1072
+ "llama-3.2-3b-instruct": 1.1426000000000003,
1073
+ "llama-3.3-70b-instruct": 0.8375999999999999,
1074
+ "mistral-large-instruct-2411": 0.8630666666666666,
1075
+ "gemma-2-27b-it": 0.9411333333333334,
1076
+ "gemma-2-9b-it": 1.1442,
1077
+ "deepseek-v3": 0.8894666666666666,
1078
+ "deepseek-r1": 0.8458,
1079
+ "qwq-32b": 0.8745333333333333,
1080
+ "Average": 0.9288977777777778
1081
+ },
1082
+ "PhilosophyEnv": {
1083
+ "qwen2.5-3b-instruct": 1.05,
1084
+ "qwen2.5-7b-instruct": 1.3602666666666665,
1085
+ "qwen2.5-14b-instruct": 0.6244,
1086
+ "qwen2.5-32b-instruct": 0.6048,
1087
+ "qwen2.5-72b-instruct": 0.6090666666666666,
1088
+ "llama-3.1-8b-instruct": 1.0042666666666666,
1089
+ "llama-3.1-70b-instruct": 0.6868,
1090
+ "llama-3.2-3b-instruct": 1.2236666666666667,
1091
+ "llama-3.3-70b-instruct": 0.6848666666666666,
1092
+ "mistral-large-instruct-2411": 0.5620666666666667,
1093
+ "gemma-2-27b-it": 0.9582,
1094
+ "gemma-2-9b-it": 0.9566666666666667,
1095
+ "deepseek-v3": 0.5928666666666667,
1096
+ "deepseek-r1": 0.5505333333333333,
1097
+ "qwq-32b": 0.5388666666666666,
1098
+ "Average": 0.800488888888889
1099
+ },
1100
+ "ArchaeologicalEnv": {
1101
+ "qwen2.5-3b-instruct": 1.0586,
1102
+ "qwen2.5-7b-instruct": 0.8644000000000001,
1103
+ "qwen2.5-14b-instruct": 0.7687333333333333,
1104
+ "qwen2.5-32b-instruct": 0.724,
1105
+ "qwen2.5-72b-instruct": 0.7683333333333333,
1106
+ "llama-3.1-8b-instruct": 1.0757999999999999,
1107
+ "llama-3.1-70b-instruct": 0.7328666666666667,
1108
+ "llama-3.2-3b-instruct": 1.0688666666666666,
1109
+ "llama-3.3-70b-instruct": 0.7887333333333334,
1110
+ "mistral-large-instruct-2411": 0.7232,
1111
+ "gemma-2-27b-it": 0.8009999999999999,
1112
+ "gemma-2-9b-it": 1.0729333333333333,
1113
+ "deepseek-v3": 0.6970000000000001,
1114
+ "deepseek-r1": 0.5904666666666667,
1115
+ "qwq-32b": 0.6000666666666665,
1116
+ "Average": 0.8223333333333332
1117
+ },
1118
+ "GemstoneEnv": {
1119
+ "qwen2.5-3b-instruct": 0.7110285714285715,
1120
+ "qwen2.5-7b-instruct": 0.7523809523809524,
1121
+ "qwen2.5-14b-instruct": 0.4729333333333333,
1122
+ "qwen2.5-32b-instruct": 0.5357714285714286,
1123
+ "qwen2.5-72b-instruct": 0.6073238095238096,
1124
+ "llama-3.1-8b-instruct": 0.8096666666666665,
1125
+ "llama-3.1-70b-instruct": 0.6153904761904763,
1126
+ "llama-3.2-3b-instruct": 0.7933904761904762,
1127
+ "llama-3.3-70b-instruct": 0.5937238095238094,
1128
+ "mistral-large-instruct-2411": 0.4208952380952381,
1129
+ "gemma-2-27b-it": 0.5776761904761905,
1130
+ "gemma-2-9b-it": 0.7067619047619047,
1131
+ "deepseek-v3": 0.41441904761904763,
1132
+ "deepseek-r1": 0.38596190476190473,
1133
+ "qwq-32b": 0.34542857142857136,
1134
+ "Average": 0.5828501587301588
1135
+ },
1136
+ "MicrobiologyEnv": {
1137
+ "qwen2.5-3b-instruct": 0.9303809523809525,
1138
+ "qwen2.5-7b-instruct": 0.9467904761904762,
1139
+ "qwen2.5-14b-instruct": 0.5958380952380952,
1140
+ "qwen2.5-32b-instruct": 0.6932190476190476,
1141
+ "qwen2.5-72b-instruct": 0.5158190476190476,
1142
+ "llama-3.1-8b-instruct": 0.9718666666666668,
1143
+ "llama-3.1-70b-instruct": 0.7010476190476191,
1144
+ "llama-3.2-3b-instruct": 1.0090190476190477,
1145
+ "llama-3.3-70b-instruct": 0.7338666666666666,
1146
+ "mistral-large-instruct-2411": 0.6196380952380951,
1147
+ "gemma-2-27b-it": 0.7761333333333335,
1148
+ "gemma-2-9b-it": 1.0257809523809525,
1149
+ "deepseek-v3": 0.5738761904761904,
1150
+ "deepseek-r1": 0.5433809523809524,
1151
+ "qwq-32b": 0.5820380952380952,
1152
+ "Average": 0.7479130158730157
1153
+ },
1154
+ "SciFiEnv": {
1155
+ "qwen2.5-3b-instruct": 0.9241999999999999,
1156
+ "qwen2.5-7b-instruct": 1.222,
1157
+ "qwen2.5-14b-instruct": 0.6346,
1158
+ "qwen2.5-32b-instruct": 0.6923333333333332,
1159
+ "qwen2.5-72b-instruct": 0.7514666666666667,
1160
+ "llama-3.1-8b-instruct": 1.1545999999999998,
1161
+ "llama-3.1-70b-instruct": 0.6702,
1162
+ "llama-3.2-3b-instruct": 1.0696,
1163
+ "llama-3.3-70b-instruct": 0.7630666666666667,
1164
+ "mistral-large-instruct-2411": 0.6732666666666667,
1165
+ "gemma-2-27b-it": 0.8202,
1166
+ "gemma-2-9b-it": 1.0992666666666666,
1167
+ "deepseek-v3": 0.6295999999999999,
1168
+ "deepseek-r1": 0.5723333333333334,
1169
+ "qwq-32b": 0.6310666666666667,
1170
+ "Average": 0.82052
1171
+ },
1172
+ "HormoneEnv": {
1173
+ "qwen2.5-3b-instruct": 0.6477999999999999,
1174
+ "qwen2.5-7b-instruct": 0.5792666666666666,
1175
+ "qwen2.5-14b-instruct": 0.42300000000000004,
1176
+ "qwen2.5-32b-instruct": 0.4808,
1177
+ "qwen2.5-72b-instruct": 0.47140000000000004,
1178
+ "llama-3.1-8b-instruct": 0.6914,
1179
+ "llama-3.1-70b-instruct": 0.3943333333333333,
1180
+ "llama-3.2-3b-instruct": 0.6731999999999999,
1181
+ "llama-3.3-70b-instruct": 0.4046,
1182
+ "mistral-large-instruct-2411": 0.47793333333333327,
1183
+ "gemma-2-27b-it": 0.49526666666666663,
1184
+ "gemma-2-9b-it": 0.6825333333333333,
1185
+ "deepseek-v3": 0.4746666666666667,
1186
+ "deepseek-r1": 0.4145333333333333,
1187
+ "qwq-32b": 0.4527333333333333,
1188
+ "Average": 0.5175644444444445
1189
+ },
1190
+ "SculptorEnv": {
1191
+ "qwen2.5-3b-instruct": 1.1031333333333335,
1192
+ "qwen2.5-7b-instruct": 0.9707333333333334,
1193
+ "qwen2.5-14b-instruct": 0.8746666666666666,
1194
+ "qwen2.5-32b-instruct": 0.9588666666666666,
1195
+ "qwen2.5-72b-instruct": 0.9808,
1196
+ "llama-3.1-8b-instruct": 1.215,
1197
+ "llama-3.1-70b-instruct": 1.0109333333333332,
1198
+ "llama-3.2-3b-instruct": 1.2163333333333335,
1199
+ "llama-3.3-70b-instruct": 0.9847999999999999,
1200
+ "mistral-large-instruct-2411": 0.9626666666666667,
1201
+ "gemma-2-27b-it": 1.1294666666666668,
1202
+ "gemma-2-9b-it": 1.2386666666666666,
1203
+ "deepseek-v3": 0.9729333333333333,
1204
+ "deepseek-r1": 0.8991999999999999,
1205
+ "qwq-32b": 0.9352666666666666,
1206
+ "Average": 1.0302311111111113
1207
+ },
1208
+ "NeuroEnv": {
1209
+ "qwen2.5-3b-instruct": 1.0565999999999998,
1210
+ "qwen2.5-7b-instruct": 0.9228666666666665,
1211
+ "qwen2.5-14b-instruct": 0.6512666666666667,
1212
+ "qwen2.5-32b-instruct": 0.7456666666666667,
1213
+ "qwen2.5-72b-instruct": 0.628,
1214
+ "llama-3.1-8b-instruct": 1.0157333333333336,
1215
+ "llama-3.1-70b-instruct": 0.6142,
1216
+ "llama-3.2-3b-instruct": 1.0081333333333333,
1217
+ "llama-3.3-70b-instruct": 0.5914,
1218
+ "mistral-large-instruct-2411": 0.7068,
1219
+ "gemma-2-27b-it": 0.6496666666666667,
1220
+ "gemma-2-9b-it": 1.0555333333333334,
1221
+ "deepseek-v3": 0.7531333333333333,
1222
+ "deepseek-r1": 0.6508666666666667,
1223
+ "qwq-32b": 0.8050666666666666,
1224
+ "Average": 0.7903288888888889
1225
+ },
1226
+ "OceanEnv": {
1227
+ "qwen2.5-3b-instruct": 0.9513333333333334,
1228
+ "qwen2.5-7b-instruct": 0.766,
1229
+ "qwen2.5-14b-instruct": 0.6742,
1230
+ "qwen2.5-32b-instruct": 0.7434,
1231
+ "qwen2.5-72b-instruct": 0.6952666666666667,
1232
+ "llama-3.1-8b-instruct": 0.9298666666666667,
1233
+ "llama-3.1-70b-instruct": 0.6888,
1234
+ "llama-3.2-3b-instruct": 1.0092666666666668,
1235
+ "llama-3.3-70b-instruct": 0.7631333333333334,
1236
+ "mistral-large-instruct-2411": 0.6806000000000001,
1237
+ "gemma-2-27b-it": 0.8006666666666667,
1238
+ "gemma-2-9b-it": 0.9381333333333334,
1239
+ "deepseek-v3": 0.5634666666666666,
1240
+ "deepseek-r1": 0.5218666666666666,
1241
+ "qwq-32b": 0.5317333333333332,
1242
+ "Average": 0.7505155555555555
1243
+ },
1244
+ "MineralEnv": {
1245
+ "qwen2.5-3b-instruct": 0.4416333333333333,
1246
+ "qwen2.5-7b-instruct": 0.40716190476190484,
1247
+ "qwen2.5-14b-instruct": 0.13431428571428572,
1248
+ "qwen2.5-32b-instruct": 0.18465714285714285,
1249
+ "qwen2.5-72b-instruct": 0.17393809523809525,
1250
+ "llama-3.1-8b-instruct": 0.4482333333333333,
1251
+ "llama-3.1-70b-instruct": 0.2729857142857143,
1252
+ "llama-3.2-3b-instruct": 0.46588571428571424,
1253
+ "llama-3.3-70b-instruct": 0.2577333333333333,
1254
+ "mistral-large-instruct-2411": 0.2553095238095238,
1255
+ "gemma-2-27b-it": 0.3398333333333333,
1256
+ "gemma-2-9b-it": 0.4829333333333333,
1257
+ "deepseek-v3": 0.17311428571428572,
1258
+ "deepseek-r1": 0.19182857142857146,
1259
+ "qwq-32b": 0.2801666666666667,
1260
+ "Average": 0.30064857142857143
1261
+ },
1262
+ "FishEnv": {
1263
+ "qwen2.5-3b-instruct": 1.2468666666666668,
1264
+ "qwen2.5-7b-instruct": 1.3481999999999998,
1265
+ "qwen2.5-14b-instruct": 0.9705999999999999,
1266
+ "qwen2.5-32b-instruct": 1.0598666666666667,
1267
+ "qwen2.5-72b-instruct": 0.9867999999999999,
1268
+ "llama-3.1-8b-instruct": 1.3636666666666666,
1269
+ "llama-3.1-70b-instruct": 1.0207333333333335,
1270
+ "llama-3.2-3b-instruct": 1.4142666666666668,
1271
+ "llama-3.3-70b-instruct": 1.0358,
1272
+ "mistral-large-instruct-2411": 0.9927999999999999,
1273
+ "gemma-2-27b-it": 1.1742,
1274
+ "gemma-2-9b-it": 1.3645999999999998,
1275
+ "deepseek-v3": 1.0188,
1276
+ "deepseek-r1": 0.9213333333333333,
1277
+ "qwq-32b": 0.9852666666666667,
1278
+ "Average": 1.12692
1279
+ },
1280
+ "MartialArtsEnv": {
1281
+ "qwen2.5-3b-instruct": 1.0723333333333334,
1282
+ "qwen2.5-7b-instruct": 1.0198,
1283
+ "qwen2.5-14b-instruct": 0.7902666666666667,
1284
+ "qwen2.5-32b-instruct": 0.8842000000000001,
1285
+ "qwen2.5-72b-instruct": 0.9042666666666666,
1286
+ "llama-3.1-8b-instruct": 1.117,
1287
+ "llama-3.1-70b-instruct": 0.8526666666666667,
1288
+ "llama-3.2-3b-instruct": 1.1304,
1289
+ "llama-3.3-70b-instruct": 0.9208000000000001,
1290
+ "mistral-large-instruct-2411": 0.8798666666666666,
1291
+ "gemma-2-27b-it": 1.0068666666666666,
1292
+ "gemma-2-9b-it": 1.1265333333333334,
1293
+ "deepseek-v3": 0.7828666666666667,
1294
+ "deepseek-r1": 0.7744,
1295
+ "qwq-32b": 0.8414000000000001,
1296
+ "Average": 0.9402444444444444
1297
+ },
1298
+ "RocketFuelEnv": {
1299
+ "qwen2.5-3b-instruct": 0.8296666666666667,
1300
+ "qwen2.5-7b-instruct": 0.8119333333333334,
1301
+ "qwen2.5-14b-instruct": 0.3957333333333334,
1302
+ "qwen2.5-32b-instruct": 0.5435333333333333,
1303
+ "qwen2.5-72b-instruct": 0.4489333333333333,
1304
+ "llama-3.1-8b-instruct": 0.8727333333333332,
1305
+ "llama-3.1-70b-instruct": 0.5453999999999999,
1306
+ "llama-3.2-3b-instruct": 0.8820666666666666,
1307
+ "llama-3.3-70b-instruct": 0.5007333333333334,
1308
+ "mistral-large-instruct-2411": 0.4926666666666666,
1309
+ "gemma-2-27b-it": 0.5702666666666667,
1310
+ "gemma-2-9b-it": 0.9189333333333332,
1311
+ "deepseek-v3": 0.45919999999999994,
1312
+ "deepseek-r1": 0.44160000000000005,
1313
+ "qwq-32b": 0.4107333333333333,
1314
+ "Average": 0.6082755555555556
1315
+ },
1316
+ "MLEnv": {
1317
+ "qwen2.5-3b-instruct": 1.0853809523809523,
1318
+ "qwen2.5-7b-instruct": 0.9570571428571428,
1319
+ "qwen2.5-14b-instruct": 0.7381333333333332,
1320
+ "qwen2.5-32b-instruct": 0.7021238095238095,
1321
+ "qwen2.5-72b-instruct": 0.6462666666666667,
1322
+ "llama-3.1-8b-instruct": 1.0434,
1323
+ "llama-3.1-70b-instruct": 0.6853333333333332,
1324
+ "llama-3.2-3b-instruct": 1.0912,
1325
+ "llama-3.3-70b-instruct": 0.8062666666666667,
1326
+ "mistral-large-instruct-2411": 0.742590476190476,
1327
+ "gemma-2-27b-it": 0.9825333333333333,
1328
+ "gemma-2-9b-it": 1.1573333333333333,
1329
+ "deepseek-v3": 0.7192761904761905,
1330
+ "deepseek-r1": 0.6918,
1331
+ "qwq-32b": 0.6837333333333333,
1332
+ "Average": 0.8488285714285714
1333
+ },
1334
+ "PoliticalManifestoEnv": {
1335
+ "qwen2.5-3b-instruct": 1.094,
1336
+ "qwen2.5-7b-instruct": 1.0524666666666664,
1337
+ "qwen2.5-14b-instruct": 0.8964666666666666,
1338
+ "qwen2.5-32b-instruct": 0.9097333333333332,
1339
+ "qwen2.5-72b-instruct": 0.9074666666666668,
1340
+ "llama-3.1-8b-instruct": 1.1484666666666665,
1341
+ "llama-3.1-70b-instruct": 0.9022666666666666,
1342
+ "llama-3.2-3b-instruct": 1.2590666666666668,
1343
+ "llama-3.3-70b-instruct": 0.9182666666666666,
1344
+ "mistral-large-instruct-2411": 0.9239333333333335,
1345
+ "gemma-2-27b-it": 1.1248666666666667,
1346
+ "gemma-2-9b-it": 1.2280666666666664,
1347
+ "deepseek-v3": 0.9872,
1348
+ "deepseek-r1": 0.9032,
1349
+ "qwq-32b": 0.9002666666666667,
1350
+ "Average": 1.0103822222222223
1351
+ },
1352
+ "CoffeeEnv": {
1353
+ "qwen2.5-3b-instruct": 0.5574857142857143,
1354
+ "qwen2.5-7b-instruct": 0.5191999999999999,
1355
+ "qwen2.5-14b-instruct": 0.287847619047619,
1356
+ "qwen2.5-32b-instruct": 0.3643714285714286,
1357
+ "qwen2.5-72b-instruct": 0.32374285714285717,
1358
+ "llama-3.1-8b-instruct": 0.6735904761904761,
1359
+ "llama-3.1-70b-instruct": 0.4418666666666667,
1360
+ "llama-3.2-3b-instruct": 0.6586857142857143,
1361
+ "llama-3.3-70b-instruct": 0.3596190476190476,
1362
+ "mistral-large-instruct-2411": 0.32551428571428564,
1363
+ "gemma-2-27b-it": 0.4073619047619047,
1364
+ "gemma-2-9b-it": 0.5588,
1365
+ "deepseek-v3": 0.3131333333333334,
1366
+ "deepseek-r1": 0.2641047619047619,
1367
+ "qwq-32b": 0.2930095238095238,
1368
+ "Average": 0.4232222222222221
1369
+ },
1370
+ "MotifAnalysisEnv": {
1371
+ "qwen2.5-3b-instruct": 1.5359333333333334,
1372
+ "qwen2.5-7b-instruct": 1.3934000000000002,
1373
+ "qwen2.5-14b-instruct": 1.2638,
1374
+ "qwen2.5-32b-instruct": 1.3157999999999999,
1375
+ "qwen2.5-72b-instruct": 1.2424,
1376
+ "llama-3.1-8b-instruct": 1.5532666666666666,
1377
+ "llama-3.1-70b-instruct": 1.3790666666666664,
1378
+ "llama-3.2-3b-instruct": 1.6122,
1379
+ "llama-3.3-70b-instruct": 1.5049333333333332,
1380
+ "mistral-large-instruct-2411": 1.2954666666666665,
1381
+ "gemma-2-27b-it": 1.5349999999999997,
1382
+ "gemma-2-9b-it": 1.5813333333333335,
1383
+ "deepseek-v3": 1.1815333333333333,
1384
+ "deepseek-r1": 0.9527999999999999,
1385
+ "qwq-32b": 1.0904666666666667,
1386
+ "Average": 1.3624933333333333
1387
+ },
1388
+ "NutritionEnv": {
1389
+ "qwen2.5-3b-instruct": 1.1223333333333332,
1390
+ "qwen2.5-7b-instruct": 1.1436666666666666,
1391
+ "qwen2.5-14b-instruct": 0.8542666666666667,
1392
+ "qwen2.5-32b-instruct": 0.8586666666666666,
1393
+ "qwen2.5-72b-instruct": 0.8068666666666667,
1394
+ "llama-3.1-8b-instruct": 1.1504666666666665,
1395
+ "llama-3.1-70b-instruct": 0.7640666666666667,
1396
+ "llama-3.2-3b-instruct": 1.1206666666666667,
1397
+ "llama-3.3-70b-instruct": 0.7728,
1398
+ "mistral-large-instruct-2411": 0.8578666666666667,
1399
+ "gemma-2-27b-it": 0.9822,
1400
+ "gemma-2-9b-it": 1.1840000000000002,
1401
+ "deepseek-v3": 0.8118666666666666,
1402
+ "deepseek-r1": 0.8373333333333333,
1403
+ "qwq-32b": 0.8091999999999999,
1404
+ "Average": 0.9384177777777779
1405
+ },
1406
+ "MalwareEnv": {
1407
+ "qwen2.5-3b-instruct": 1.0660666666666665,
1408
+ "qwen2.5-7b-instruct": 0.9987333333333333,
1409
+ "qwen2.5-14b-instruct": 0.8055999999999999,
1410
+ "qwen2.5-32b-instruct": 0.9301333333333333,
1411
+ "qwen2.5-72b-instruct": 0.8351333333333333,
1412
+ "llama-3.1-8b-instruct": 1.0980666666666667,
1413
+ "llama-3.1-70b-instruct": 0.8638666666666666,
1414
+ "llama-3.2-3b-instruct": 1.1550666666666667,
1415
+ "llama-3.3-70b-instruct": 0.9002666666666667,
1416
+ "mistral-large-instruct-2411": 0.8474,
1417
+ "gemma-2-27b-it": 1.0784666666666667,
1418
+ "gemma-2-9b-it": 1.1739333333333335,
1419
+ "deepseek-v3": 0.8863333333333333,
1420
+ "deepseek-r1": 0.7370666666666666,
1421
+ "qwq-32b": 0.8452666666666666,
1422
+ "Average": 0.9480933333333336
1423
+ },
1424
+ "GeologicalEnv": {
1425
+ "qwen2.5-3b-instruct": 0.7544666666666666,
1426
+ "qwen2.5-7b-instruct": 0.6998666666666666,
1427
+ "qwen2.5-14b-instruct": 0.5382,
1428
+ "qwen2.5-32b-instruct": 0.6078666666666667,
1429
+ "qwen2.5-72b-instruct": 0.5856,
1430
+ "llama-3.1-8b-instruct": 0.8460666666666666,
1431
+ "llama-3.1-70b-instruct": 0.6026,
1432
+ "llama-3.2-3b-instruct": 0.7952666666666668,
1433
+ "llama-3.3-70b-instruct": 0.6526,
1434
+ "mistral-large-instruct-2411": 0.5856666666666667,
1435
+ "gemma-2-27b-it": 0.7190666666666667,
1436
+ "gemma-2-9b-it": 0.8308666666666668,
1437
+ "deepseek-v3": 0.5351333333333332,
1438
+ "deepseek-r1": 0.5818,
1439
+ "qwq-32b": 0.5489999999999999,
1440
+ "Average": 0.6589377777777776
1441
+ },
1442
+ "TheatricalEnv": {
1443
+ "qwen2.5-3b-instruct": 1.0014666666666667,
1444
+ "qwen2.5-7b-instruct": 0.9438000000000001,
1445
+ "qwen2.5-14b-instruct": 0.7684666666666666,
1446
+ "qwen2.5-32b-instruct": 0.7975333333333333,
1447
+ "qwen2.5-72b-instruct": 0.7806666666666666,
1448
+ "llama-3.1-8b-instruct": 0.9822666666666666,
1449
+ "llama-3.1-70b-instruct": 0.6897333333333333,
1450
+ "llama-3.2-3b-instruct": 1.2334666666666667,
1451
+ "llama-3.3-70b-instruct": 0.7385333333333334,
1452
+ "mistral-large-instruct-2411": 0.7150666666666667,
1453
+ "gemma-2-27b-it": 0.9279999999999999,
1454
+ "gemma-2-9b-it": 1.0476666666666667,
1455
+ "deepseek-v3": 0.7080666666666667,
1456
+ "deepseek-r1": 0.7016000000000001,
1457
+ "qwq-32b": 0.7614000000000001,
1458
+ "Average": 0.8531822222222224
1459
+ },
1460
+ "PrintingTechniqueEnv": {
1461
+ "qwen2.5-3b-instruct": 0.5242857142857142,
1462
+ "qwen2.5-7b-instruct": 0.4669809523809524,
1463
+ "qwen2.5-14b-instruct": 0.2997142857142857,
1464
+ "qwen2.5-32b-instruct": 0.3549714285714286,
1465
+ "qwen2.5-72b-instruct": 0.27769523809523805,
1466
+ "llama-3.1-8b-instruct": 0.5112857142857143,
1467
+ "llama-3.1-70b-instruct": 0.3614476190476191,
1468
+ "llama-3.2-3b-instruct": 0.5472571428571429,
1469
+ "llama-3.3-70b-instruct": 0.3488571428571428,
1470
+ "mistral-large-instruct-2411": 0.34679999999999994,
1471
+ "gemma-2-27b-it": 0.4303714285714285,
1472
+ "gemma-2-9b-it": 0.5662952380952382,
1473
+ "deepseek-v3": 0.29273333333333335,
1474
+ "deepseek-r1": 0.31156190476190476,
1475
+ "qwq-32b": 0.3049809523809524,
1476
+ "Average": 0.3963492063492063
1477
+ },
1478
+ "StellarEnv": {
1479
+ "qwen2.5-3b-instruct": 0.9369333333333332,
1480
+ "qwen2.5-7b-instruct": 0.7797999999999999,
1481
+ "qwen2.5-14b-instruct": 0.6935333333333333,
1482
+ "qwen2.5-32b-instruct": 0.7302666666666666,
1483
+ "qwen2.5-72b-instruct": 0.6992,
1484
+ "llama-3.1-8b-instruct": 0.9464666666666665,
1485
+ "llama-3.1-70b-instruct": 0.7085999999999999,
1486
+ "llama-3.2-3b-instruct": 0.9724666666666666,
1487
+ "llama-3.3-70b-instruct": 0.7429999999999999,
1488
+ "mistral-large-instruct-2411": 0.6746,
1489
+ "gemma-2-27b-it": 0.8274666666666667,
1490
+ "gemma-2-9b-it": 0.9978,
1491
+ "deepseek-v3": 0.6367333333333333,
1492
+ "deepseek-r1": 0.6487333333333332,
1493
+ "qwq-32b": 0.7190666666666667,
1494
+ "Average": 0.7809777777777777
1495
+ },
1496
+ "SoilEnv": {
1497
+ "qwen2.5-3b-instruct": 1.2182666666666666,
1498
+ "qwen2.5-7b-instruct": 1.0028666666666668,
1499
+ "qwen2.5-14b-instruct": 0.8012666666666665,
1500
+ "qwen2.5-32b-instruct": 0.8625999999999999,
1501
+ "qwen2.5-72b-instruct": 0.7968666666666666,
1502
+ "llama-3.1-8b-instruct": 1.1898,
1503
+ "llama-3.1-70b-instruct": 0.8916000000000001,
1504
+ "llama-3.2-3b-instruct": 1.1725333333333334,
1505
+ "llama-3.3-70b-instruct": 0.9216,
1506
+ "mistral-large-instruct-2411": 0.8744,
1507
+ "gemma-2-27b-it": 0.9574666666666667,
1508
+ "gemma-2-9b-it": 1.1284,
1509
+ "deepseek-v3": 0.9315333333333333,
1510
+ "deepseek-r1": 0.8695333333333334,
1511
+ "qwq-32b": 0.8491333333333333,
1512
+ "Average": 0.9645244444444445
1513
+ },
1514
+ "SoftwareEnv": {
1515
+ "qwen2.5-3b-instruct": 0.7055999999999999,
1516
+ "qwen2.5-7b-instruct": 0.6421333333333334,
1517
+ "qwen2.5-14b-instruct": 0.4560666666666666,
1518
+ "qwen2.5-32b-instruct": 0.484,
1519
+ "qwen2.5-72b-instruct": 0.4653999999999999,
1520
+ "llama-3.1-8b-instruct": 0.6588666666666667,
1521
+ "llama-3.1-70b-instruct": 0.4653333333333333,
1522
+ "llama-3.2-3b-instruct": 0.7011999999999998,
1523
+ "llama-3.3-70b-instruct": 0.49446666666666667,
1524
+ "mistral-large-instruct-2411": 0.46806666666666663,
1525
+ "gemma-2-27b-it": 0.6290000000000001,
1526
+ "gemma-2-9b-it": 0.7563333333333333,
1527
+ "deepseek-v3": 0.46806666666666674,
1528
+ "deepseek-r1": 0.4063333333333333,
1529
+ "qwq-32b": 0.4793333333333333,
1530
+ "Average": 0.5520133333333334
1531
+ },
1532
+ "CarIdentificationEnv": {
1533
+ "qwen2.5-3b-instruct": 0.6415809523809524,
1534
+ "qwen2.5-7b-instruct": 0.7830761904761905,
1535
+ "qwen2.5-14b-instruct": 0.11047619047619046,
1536
+ "qwen2.5-32b-instruct": 0.1649142857142857,
1537
+ "qwen2.5-72b-instruct": 0.11052380952380951,
1538
+ "llama-3.1-8b-instruct": 0.6149333333333332,
1539
+ "llama-3.1-70b-instruct": 0.30315238095238095,
1540
+ "llama-3.2-3b-instruct": 0.8439333333333332,
1541
+ "llama-3.3-70b-instruct": 0.23700952380952378,
1542
+ "mistral-large-instruct-2411": 0.2485809523809524,
1543
+ "gemma-2-27b-it": 0.3316761904761905,
1544
+ "gemma-2-9b-it": 0.6974666666666666,
1545
+ "deepseek-v3": 0.18107619047619045,
1546
+ "deepseek-r1": 0.21692380952380952,
1547
+ "qwq-32b": 0.25435238095238094,
1548
+ "Average": 0.38264507936507924
1549
+ },
1550
+ "PharmaceuticalEnv": {
1551
+ "qwen2.5-3b-instruct": 0.6446285714285713,
1552
+ "qwen2.5-7b-instruct": 0.6438761904761904,
1553
+ "qwen2.5-14b-instruct": 0.46221904761904764,
1554
+ "qwen2.5-32b-instruct": 0.5071333333333333,
1555
+ "qwen2.5-72b-instruct": 0.47396190476190475,
1556
+ "llama-3.1-8b-instruct": 0.7244190476190476,
1557
+ "llama-3.1-70b-instruct": 0.5232666666666667,
1558
+ "llama-3.2-3b-instruct": 0.7074666666666667,
1559
+ "llama-3.3-70b-instruct": 0.5709238095238094,
1560
+ "mistral-large-instruct-2411": 0.5165238095238095,
1561
+ "gemma-2-27b-it": 0.639904761904762,
1562
+ "gemma-2-9b-it": 0.7634666666666667,
1563
+ "deepseek-v3": 0.5126,
1564
+ "deepseek-r1": 0.5030857142857144,
1565
+ "qwq-32b": 0.5336571428571428,
1566
+ "Average": 0.5818088888888888
1567
+ },
1568
+ "NetworkEnv": {
1569
+ "qwen2.5-3b-instruct": 0.9304666666666666,
1570
+ "qwen2.5-7b-instruct": 0.9181333333333332,
1571
+ "qwen2.5-14b-instruct": 0.7156,
1572
+ "qwen2.5-32b-instruct": 0.7281333333333334,
1573
+ "qwen2.5-72b-instruct": 0.7013333333333333,
1574
+ "llama-3.1-8b-instruct": 0.9331333333333334,
1575
+ "llama-3.1-70b-instruct": 0.6774666666666667,
1576
+ "llama-3.2-3b-instruct": 1.0052666666666665,
1577
+ "llama-3.3-70b-instruct": 0.6723999999999999,
1578
+ "mistral-large-instruct-2411": 0.6792666666666667,
1579
+ "gemma-2-27b-it": 0.8334666666666667,
1580
+ "gemma-2-9b-it": 0.9756,
1581
+ "deepseek-v3": 0.6752666666666667,
1582
+ "deepseek-r1": 0.6507333333333334,
1583
+ "qwq-32b": 0.7018000000000001,
1584
+ "Average": 0.7865377777777778
1585
+ },
1586
+ "BirdNestEnv": {
1587
+ "qwen2.5-3b-instruct": 0.9391333333333332,
1588
+ "qwen2.5-7b-instruct": 0.9543333333333333,
1589
+ "qwen2.5-14b-instruct": 0.8100666666666665,
1590
+ "qwen2.5-32b-instruct": 0.9158,
1591
+ "qwen2.5-72b-instruct": 0.8606666666666666,
1592
+ "llama-3.1-8b-instruct": 0.9903999999999998,
1593
+ "llama-3.1-70b-instruct": 0.9222666666666666,
1594
+ "llama-3.2-3b-instruct": 1.026333333333333,
1595
+ "llama-3.3-70b-instruct": 0.9565999999999999,
1596
+ "mistral-large-instruct-2411": 0.8474666666666666,
1597
+ "gemma-2-27b-it": 0.9609333333333334,
1598
+ "gemma-2-9b-it": 1.0085333333333333,
1599
+ "deepseek-v3": 0.7617999999999998,
1600
+ "deepseek-r1": 0.6876,
1601
+ "qwq-32b": 0.7255999999999999,
1602
+ "Average": 0.8911688888888886
1603
+ },
1604
+ "EnergyEnv": {
1605
+ "qwen2.5-3b-instruct": 1.0884,
1606
+ "qwen2.5-7b-instruct": 1.0123904761904763,
1607
+ "qwen2.5-14b-instruct": 0.7312285714285716,
1608
+ "qwen2.5-32b-instruct": 0.8728190476190475,
1609
+ "qwen2.5-72b-instruct": 0.8609333333333332,
1610
+ "llama-3.1-8b-instruct": 1.1586666666666665,
1611
+ "llama-3.1-70b-instruct": 0.9046,
1612
+ "llama-3.2-3b-instruct": 1.1040666666666665,
1613
+ "llama-3.3-70b-instruct": 1.0045904761904763,
1614
+ "mistral-large-instruct-2411": 0.7872380952380952,
1615
+ "gemma-2-27b-it": 1.0375904761904762,
1616
+ "gemma-2-9b-it": 1.1858666666666666,
1617
+ "deepseek-v3": 0.8143523809523809,
1618
+ "deepseek-r1": 0.6982476190476191,
1619
+ "qwq-32b": 0.7441428571428571,
1620
+ "Average": 0.9336755555555555
1621
+ },
1622
+ "LanguageEnv": {
1623
+ "qwen2.5-3b-instruct": 1.1255333333333333,
1624
+ "qwen2.5-7b-instruct": 1.1768,
1625
+ "qwen2.5-14b-instruct": 1.0519333333333332,
1626
+ "qwen2.5-32b-instruct": 0.9997999999999999,
1627
+ "qwen2.5-72b-instruct": 0.9798000000000002,
1628
+ "llama-3.1-8b-instruct": 1.1883333333333335,
1629
+ "llama-3.1-70b-instruct": 1.0505333333333333,
1630
+ "llama-3.2-3b-instruct": 1.204333333333333,
1631
+ "llama-3.3-70b-instruct": 1.1250666666666667,
1632
+ "mistral-large-instruct-2411": 0.9846666666666666,
1633
+ "gemma-2-27b-it": 1.1916666666666669,
1634
+ "gemma-2-9b-it": 1.2723333333333335,
1635
+ "deepseek-v3": 0.9475333333333333,
1636
+ "deepseek-r1": 0.7798666666666666,
1637
+ "qwq-32b": 0.8547333333333332,
1638
+ "Average": 1.0621955555555558
1639
+ },
1640
+ "AlgorithmEnv": {
1641
+ "qwen2.5-3b-instruct": 0.9703333333333333,
1642
+ "qwen2.5-7b-instruct": 0.8614666666666666,
1643
+ "qwen2.5-14b-instruct": 0.6675333333333334,
1644
+ "qwen2.5-32b-instruct": 0.7172,
1645
+ "qwen2.5-72b-instruct": 0.7046666666666666,
1646
+ "llama-3.1-8b-instruct": 0.8993333333333332,
1647
+ "llama-3.1-70b-instruct": 0.6912666666666667,
1648
+ "llama-3.2-3b-instruct": 0.8341333333333333,
1649
+ "llama-3.3-70b-instruct": 0.7737333333333333,
1650
+ "mistral-large-instruct-2411": 0.6648000000000001,
1651
+ "gemma-2-27b-it": 0.9326000000000001,
1652
+ "gemma-2-9b-it": 1.0570666666666668,
1653
+ "deepseek-v3": 0.7781333333333332,
1654
+ "deepseek-r1": 0.6048,
1655
+ "qwq-32b": 0.656,
1656
+ "Average": 0.787537777777778
1657
+ },
1658
+ "MathematicalEnv": {
1659
+ "qwen2.5-3b-instruct": 0.7803333333333333,
1660
+ "qwen2.5-7b-instruct": 0.5459333333333333,
1661
+ "qwen2.5-14b-instruct": 0.4785333333333333,
1662
+ "qwen2.5-32b-instruct": 0.5025333333333333,
1663
+ "qwen2.5-72b-instruct": 0.47973333333333334,
1664
+ "llama-3.1-8b-instruct": 0.7568666666666667,
1665
+ "llama-3.1-70b-instruct": 0.4238666666666667,
1666
+ "llama-3.2-3b-instruct": 0.7148666666666668,
1667
+ "llama-3.3-70b-instruct": 0.4582,
1668
+ "mistral-large-instruct-2411": 0.4603999999999999,
1669
+ "gemma-2-27b-it": 0.6693333333333333,
1670
+ "gemma-2-9b-it": 0.8029999999999999,
1671
+ "deepseek-v3": 0.44066666666666665,
1672
+ "deepseek-r1": 0.3256,
1673
+ "qwq-32b": 0.41286666666666666,
1674
+ "Average": 0.5501822222222222
1675
+ },
1676
+ "MusicalEnv": {
1677
+ "qwen2.5-3b-instruct": 0.7868666666666665,
1678
+ "qwen2.5-7b-instruct": 0.6212,
1679
+ "qwen2.5-14b-instruct": 0.4023333333333333,
1680
+ "qwen2.5-32b-instruct": 0.44286666666666663,
1681
+ "qwen2.5-72b-instruct": 0.40346666666666664,
1682
+ "llama-3.1-8b-instruct": 0.6890666666666666,
1683
+ "llama-3.1-70b-instruct": 0.43373333333333336,
1684
+ "llama-3.2-3b-instruct": 0.7595333333333332,
1685
+ "llama-3.3-70b-instruct": 0.3923333333333333,
1686
+ "mistral-large-instruct-2411": 0.46593333333333337,
1687
+ "gemma-2-27b-it": 0.6487999999999999,
1688
+ "gemma-2-9b-it": 0.7524,
1689
+ "deepseek-v3": 0.47313333333333335,
1690
+ "deepseek-r1": 0.4392666666666667,
1691
+ "qwq-32b": 0.4437333333333334,
1692
+ "Average": 0.5436444444444444
1693
+ },
1694
+ "InventorEnv": {
1695
+ "qwen2.5-3b-instruct": 1.1748,
1696
+ "qwen2.5-7b-instruct": 1.0563999999999998,
1697
+ "qwen2.5-14b-instruct": 0.7145333333333334,
1698
+ "qwen2.5-32b-instruct": 0.7015999999999999,
1699
+ "qwen2.5-72b-instruct": 0.7992666666666667,
1700
+ "llama-3.1-8b-instruct": 1.1523999999999996,
1701
+ "llama-3.1-70b-instruct": 0.7999333333333334,
1702
+ "llama-3.2-3b-instruct": 1.2088,
1703
+ "llama-3.3-70b-instruct": 0.9067333333333334,
1704
+ "mistral-large-instruct-2411": 0.6736000000000001,
1705
+ "gemma-2-27b-it": 0.8846,
1706
+ "gemma-2-9b-it": 1.2016666666666667,
1707
+ "deepseek-v3": 0.5913333333333333,
1708
+ "deepseek-r1": 0.5467333333333333,
1709
+ "qwq-32b": 0.5757333333333332,
1710
+ "Average": 0.8658755555555557
1711
+ },
1712
+ "MedicalEnv": {
1713
+ "qwen2.5-3b-instruct": 0.6536,
1714
+ "qwen2.5-7b-instruct": 0.5706761904761904,
1715
+ "qwen2.5-14b-instruct": 0.49113333333333326,
1716
+ "qwen2.5-32b-instruct": 0.5525238095238094,
1717
+ "qwen2.5-72b-instruct": 0.4351047619047619,
1718
+ "llama-3.1-8b-instruct": 0.675742857142857,
1719
+ "llama-3.1-70b-instruct": 0.4085142857142857,
1720
+ "llama-3.2-3b-instruct": 0.6928952380952381,
1721
+ "llama-3.3-70b-instruct": 0.44421904761904757,
1722
+ "mistral-large-instruct-2411": 0.3882190476190476,
1723
+ "gemma-2-27b-it": 0.4850761904761905,
1724
+ "gemma-2-9b-it": 0.6444000000000001,
1725
+ "deepseek-v3": 0.42532380952380955,
1726
+ "deepseek-r1": 0.4207714285714285,
1727
+ "qwq-32b": 0.5122666666666668,
1728
+ "Average": 0.5200311111111111
1729
+ },
1730
+ "MusicEnv": {
1731
+ "qwen2.5-3b-instruct": 0.8416666666666666,
1732
+ "qwen2.5-7b-instruct": 1.0005333333333333,
1733
+ "qwen2.5-14b-instruct": 0.6241333333333333,
1734
+ "qwen2.5-32b-instruct": 0.6859999999999999,
1735
+ "qwen2.5-72b-instruct": 0.5986190476190476,
1736
+ "llama-3.1-8b-instruct": 0.8823619047619047,
1737
+ "llama-3.1-70b-instruct": 0.620095238095238,
1738
+ "llama-3.2-3b-instruct": 1.0788857142857142,
1739
+ "llama-3.3-70b-instruct": 0.6707809523809523,
1740
+ "mistral-large-instruct-2411": 0.6624857142857142,
1741
+ "gemma-2-27b-it": 0.7730666666666666,
1742
+ "gemma-2-9b-it": 0.9248000000000001,
1743
+ "deepseek-v3": 0.6254857142857142,
1744
+ "deepseek-r1": 0.6092095238095239,
1745
+ "qwq-32b": 0.6053809523809524,
1746
+ "Average": 0.7469003174603175
1747
+ },
1748
+ "FantasyEnv": {
1749
+ "qwen2.5-3b-instruct": 0.6738,
1750
+ "qwen2.5-7b-instruct": 0.6780571428571429,
1751
+ "qwen2.5-14b-instruct": 0.20936190476190478,
1752
+ "qwen2.5-32b-instruct": 0.23541904761904764,
1753
+ "qwen2.5-72b-instruct": 0.3943904761904762,
1754
+ "llama-3.1-8b-instruct": 0.6281142857142856,
1755
+ "llama-3.1-70b-instruct": 0.3229714285714286,
1756
+ "llama-3.2-3b-instruct": 0.7130571428571428,
1757
+ "llama-3.3-70b-instruct": 0.2710190476190476,
1758
+ "mistral-large-instruct-2411": 0.1259142857142857,
1759
+ "gemma-2-27b-it": 0.35649523809523803,
1760
+ "gemma-2-9b-it": 0.7076952380952382,
1761
+ "deepseek-v3": 0.056790476190476204,
1762
+ "deepseek-r1": -0.025180952380952377,
1763
+ "qwq-32b": 0.03801904761904761,
1764
+ "Average": 0.35906158730158727
1765
+ },
1766
+ "EducationEnv": {
1767
+ "qwen2.5-3b-instruct": 0.9363999999999999,
1768
+ "qwen2.5-7b-instruct": 0.9230666666666666,
1769
+ "qwen2.5-14b-instruct": 0.7015333333333333,
1770
+ "qwen2.5-32b-instruct": 0.6477333333333334,
1771
+ "qwen2.5-72b-instruct": 0.6232,
1772
+ "llama-3.1-8b-instruct": 0.9055333333333333,
1773
+ "llama-3.1-70b-instruct": 0.5771333333333334,
1774
+ "llama-3.2-3b-instruct": 0.8691333333333334,
1775
+ "llama-3.3-70b-instruct": 0.6331333333333333,
1776
+ "mistral-large-instruct-2411": 0.6769333333333332,
1777
+ "gemma-2-27b-it": 0.7234,
1778
+ "gemma-2-9b-it": 0.8838666666666667,
1779
+ "deepseek-v3": 0.7129333333333334,
1780
+ "deepseek-r1": 0.7282666666666666,
1781
+ "qwq-32b": 0.7233333333333334,
1782
+ "Average": 0.7510399999999999
1783
+ },
1784
+ "ChemicalEnv": {
1785
+ "qwen2.5-3b-instruct": 1.0222666666666664,
1786
+ "qwen2.5-7b-instruct": 1.1687333333333334,
1787
+ "qwen2.5-14b-instruct": 0.8354666666666667,
1788
+ "qwen2.5-32b-instruct": 0.8561333333333334,
1789
+ "qwen2.5-72b-instruct": 0.8254666666666667,
1790
+ "llama-3.1-8b-instruct": 1.1288,
1791
+ "llama-3.1-70b-instruct": 0.8285333333333333,
1792
+ "llama-3.2-3b-instruct": 1.1526666666666665,
1793
+ "llama-3.3-70b-instruct": 0.9390666666666666,
1794
+ "mistral-large-instruct-2411": 0.8290666666666666,
1795
+ "gemma-2-27b-it": 0.953533333333333,
1796
+ "gemma-2-9b-it": 1.1003333333333334,
1797
+ "deepseek-v3": 0.8336666666666668,
1798
+ "deepseek-r1": 0.8385333333333334,
1799
+ "qwq-32b": 0.7902666666666667,
1800
+ "Average": 0.9401688888888889
1801
+ },
1802
+ "Average": {
1803
+ "qwen2.5-3b-instruct": 0.9038483262611976,
1804
+ "qwen2.5-7b-instruct": 0.867841584158416,
1805
+ "qwen2.5-14b-instruct": 0.635130598774163,
1806
+ "qwen2.5-32b-instruct": 0.6698686468646864,
1807
+ "qwen2.5-72b-instruct": 0.6424673738802452,
1808
+ "llama-3.1-8b-instruct": 0.9346749174917492,
1809
+ "llama-3.1-70b-instruct": 0.660992975011787,
1810
+ "llama-3.2-3b-instruct": 0.9649007072135783,
1811
+ "llama-3.3-70b-instruct": 0.6869286185761432,
1812
+ "mistral-large-instruct-2411": 0.6406110796793965,
1813
+ "gemma-2-27b-it": 0.7896256954266856,
1814
+ "gemma-2-9b-it": 0.9722245167373881,
1815
+ "deepseek-v3": 0.6361851013672796,
1816
+ "deepseek-r1": 0.5821313531353135,
1817
+ "qwq-32b": 0.6111832626119755
1818
+ }
1819
+ }
data/easy_0402.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {
3
+ "llama-3.2-3b-instruct": {
4
+ "success_rate": 0.18239999999999998,
5
+ "relative_action_count": 0.8956800000000001
6
+ },
7
+ "qwen2.5-3b-instruct": {
8
+ "success_rate": 0.184,
9
+ "relative_action_count": 0.8255466666666665
10
+ },
11
+ "gpt-3.5-turbo": {
12
+ "success_rate": 0.272,
13
+ "relative_action_count": 0.8743314285714285
14
+ },
15
+ "qwen2.5-7b-instruct": {
16
+ "success_rate": 0.36639999999999995,
17
+ "relative_action_count": 0.8682133333333335
18
+ },
19
+ "gemma-2-9b-it": {
20
+ "success_rate": 0.392,
21
+ "relative_action_count": 0.8522190476190475
22
+ },
23
+ "llama-3.1-8b-instruct": {
24
+ "success_rate": 0.4424,
25
+ "relative_action_count": 0.8441104761904763
26
+ },
27
+ "gemma-2-27b-it": {
28
+ "success_rate": 0.548,
29
+ "relative_action_count": 0.6583142857142856
30
+ },
31
+ "yi-lightning": {
32
+ "success_rate": 0.6728,
33
+ "relative_action_count": 0.5962819047619048
34
+ },
35
+ "llama-3.1-70b-instruct": {
36
+ "success_rate": 0.696,
37
+ "relative_action_count": 0.5514495238095238
38
+ },
39
+ "llama-3.3-70b-instruct": {
40
+ "success_rate": 0.712,
41
+ "relative_action_count": 0.5916438095238095
42
+ },
43
+ "gpt-4o-mini": {
44
+ "success_rate": 0.7239999999999999,
45
+ "relative_action_count": 0.5270952380952381
46
+ },
47
+ "gemini-1.5-pro": {
48
+ "success_rate": 0.7256,
49
+ "relative_action_count": 0.5686514285714285
50
+ },
51
+ "claude-3.5-haiku": {
52
+ "success_rate": 0.7343999999999999,
53
+ "relative_action_count": 0.757095238095238
54
+ },
55
+ "qwen2.5-14b-instruct": {
56
+ "success_rate": 0.756,
57
+ "relative_action_count": 0.5723257142857143
58
+ },
59
+ "qwen2.5-72b-instruct": {
60
+ "success_rate": 0.7584,
61
+ "relative_action_count": 0.5753561904761904
62
+ },
63
+ "gpt-4o": {
64
+ "success_rate": 0.7856000000000002,
65
+ "relative_action_count": 0.506207619047619
66
+ },
67
+ "qwen2.5-32b-instruct": {
68
+ "success_rate": 0.7879999999999999,
69
+ "relative_action_count": 0.5955619047619047
70
+ },
71
+ "mistral-large-instruct-2411": {
72
+ "success_rate": 0.7879999999999999,
73
+ "relative_action_count": 0.5365238095238094
74
+ },
75
+ "claude-3.5-sonnet": {
76
+ "success_rate": 0.8263999999999999,
77
+ "relative_action_count": 0.46185714285714285
78
+ },
79
+ "deepseek-r1": {
80
+ "success_rate": 0.8712,
81
+ "relative_action_count": 0.51432
82
+ },
83
+ "o1-mini": {
84
+ "success_rate": 0.8784000000000001,
85
+ "relative_action_count": 0.46449523809523807
86
+ },
87
+ "deepseek-v3": {
88
+ "success_rate": 0.8928,
89
+ "relative_action_count": 0.5308400000000001
90
+ },
91
+ "qwq-32b": {
92
+ "success_rate": 0.9032,
93
+ "relative_action_count": 0.5338533333333333
94
+ }
95
+ }
96
+
97
+
data/hard_0402.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {
3
+ "qwen2.5-3b-instruct": {
4
+ "success_rate": 0.0624,
5
+ "relative_action_count": 2.4255102697302697
6
+ },
7
+ "llama-3.2-3b-instruct": {
8
+ "success_rate": 0.064,
9
+ "relative_action_count": 2.4438042524142523
10
+ },
11
+ "llama-3.1-8b-instruct": {
12
+ "success_rate": 0.11599999999999999,
13
+ "relative_action_count": 2.3321907026307027
14
+ },
15
+ "gpt-3.5-turbo": {
16
+ "success_rate": 0.12079999999999999,
17
+ "relative_action_count": 2.34957508047508
18
+ },
19
+ "gemma-2-9b-it": {
20
+ "success_rate": 0.132,
21
+ "relative_action_count": 2.3394684981684977
22
+ },
23
+ "qwen2.5-7b-instruct": {
24
+ "success_rate": 0.1664,
25
+ "relative_action_count": 2.3259762459762454
26
+ },
27
+ "gemma-2-27b-it": {
28
+ "success_rate": 0.1696,
29
+ "relative_action_count": 2.28467764013764
30
+ },
31
+ "llama-3.1-70b-instruct": {
32
+ "success_rate": 0.256,
33
+ "relative_action_count": 1.9653564912864916
34
+ },
35
+ "yi-lightning": {
36
+ "success_rate": 0.30720000000000003,
37
+ "relative_action_count": 2.031278719058719
38
+ },
39
+ "gpt-4o-mini": {
40
+ "success_rate": 0.31040000000000006,
41
+ "relative_action_count": 1.9804984304584305
42
+ },
43
+ "llama-3.3-70b-instruct": {
44
+ "success_rate": 0.33840000000000003,
45
+ "relative_action_count": 1.90917626040626
46
+ },
47
+ "claude-3.5-haiku": {
48
+ "success_rate": 0.3592000000000001,
49
+ "relative_action_count": 2.0113219180819177
50
+ },
51
+ "gemini-1.5-pro": {
52
+ "success_rate": 0.36879999999999996,
53
+ "relative_action_count": 1.9371788544788544
54
+ },
55
+ "qwen2.5-14b-instruct": {
56
+ "success_rate": 0.3816,
57
+ "relative_action_count": 1.9383408547008547
58
+ },
59
+ "qwen2.5-72b-instruct": {
60
+ "success_rate": 0.4008,
61
+ "relative_action_count": 1.8648658674658674
62
+ },
63
+ "mistral-large-instruct-2411": {
64
+ "success_rate": 0.4144,
65
+ "relative_action_count": 1.795764299034299
66
+ },
67
+ "qwen2.5-32b-instruct": {
68
+ "success_rate": 0.43920000000000003,
69
+ "relative_action_count": 1.8831460717060717
70
+ },
71
+ "claude-3.5-sonnet": {
72
+ "success_rate": 0.44000000000000006,
73
+ "relative_action_count": 1.6636790032190032
74
+ },
75
+ "gpt-4o": {
76
+ "success_rate": 0.44960000000000006,
77
+ "relative_action_count": 1.7164597657897656
78
+ },
79
+ "deepseek-v3": {
80
+ "success_rate": 0.5496000000000001,
81
+ "relative_action_count": 1.705338828948829
82
+ },
83
+ "deepseek-r1": {
84
+ "success_rate": 0.6112,
85
+ "relative_action_count": 1.4205231568431569
86
+ },
87
+ "qwq-32b": {
88
+ "success_rate": 0.6112,
89
+ "relative_action_count": 1.5151790675990677
90
+ },
91
+ "o1-mini": {
92
+ "success_rate": 0.6296,
93
+ "relative_action_count": 1.4230264535464534
94
+ }
95
+ }
96
+
97
+
data/success_rate_0402.json ADDED
@@ -0,0 +1,1819 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "RelicEnv": {
3
+ "qwen2.5-3b-instruct": 0.18,
4
+ "qwen2.5-7b-instruct": 0.396,
5
+ "qwen2.5-14b-instruct": 0.8,
6
+ "qwen2.5-32b-instruct": 0.8560000000000001,
7
+ "qwen2.5-72b-instruct": 0.892,
8
+ "llama-3.1-8b-instruct": 0.21600000000000003,
9
+ "llama-3.1-70b-instruct": 0.6639999999999999,
10
+ "llama-3.2-3b-instruct": 0.164,
11
+ "llama-3.3-70b-instruct": 0.836,
12
+ "mistral-large-instruct-2411": 0.8560000000000001,
13
+ "gemma-2-27b-it": 0.544,
14
+ "gemma-2-9b-it": 0.36400000000000005,
15
+ "deepseek-v3": 0.9359999999999999,
16
+ "deepseek-r1": 0.916,
17
+ "qwq-32b": 0.9560000000000001,
18
+ "Average": 0.6384
19
+ },
20
+ "HerbEnv": {
21
+ "qwen2.5-3b-instruct": 0.184,
22
+ "qwen2.5-7b-instruct": 0.304,
23
+ "qwen2.5-14b-instruct": 0.784,
24
+ "qwen2.5-32b-instruct": 0.8400000000000001,
25
+ "qwen2.5-72b-instruct": 0.8039999999999999,
26
+ "llama-3.1-8b-instruct": 0.30000000000000004,
27
+ "llama-3.1-70b-instruct": 0.568,
28
+ "llama-3.2-3b-instruct": 0.128,
29
+ "llama-3.3-70b-instruct": 0.612,
30
+ "mistral-large-instruct-2411": 0.76,
31
+ "gemma-2-27b-it": 0.504,
32
+ "gemma-2-9b-it": 0.18000000000000002,
33
+ "deepseek-v3": 0.968,
34
+ "deepseek-r1": 0.9359999999999999,
35
+ "qwq-32b": 0.924,
36
+ "Average": 0.5863999999999999
37
+ },
38
+ "TransdimensionalEnv": {
39
+ "qwen2.5-3b-instruct": 0.156,
40
+ "qwen2.5-7b-instruct": 0.38400000000000006,
41
+ "qwen2.5-14b-instruct": 0.836,
42
+ "qwen2.5-32b-instruct": 0.876,
43
+ "qwen2.5-72b-instruct": 0.836,
44
+ "llama-3.1-8b-instruct": 0.44399999999999995,
45
+ "llama-3.1-70b-instruct": 0.828,
46
+ "llama-3.2-3b-instruct": 0.172,
47
+ "llama-3.3-70b-instruct": 0.86,
48
+ "mistral-large-instruct-2411": 0.86,
49
+ "gemma-2-27b-it": 0.5599999999999999,
50
+ "gemma-2-9b-it": 0.516,
51
+ "deepseek-v3": 0.968,
52
+ "deepseek-r1": 0.9359999999999999,
53
+ "qwq-32b": 0.968,
54
+ "Average": 0.6799999999999999
55
+ },
56
+ "SorcererEnv": {
57
+ "qwen2.5-3b-instruct": 0.16,
58
+ "qwen2.5-7b-instruct": 0.32400000000000007,
59
+ "qwen2.5-14b-instruct": 0.8039999999999999,
60
+ "qwen2.5-32b-instruct": 0.8240000000000001,
61
+ "qwen2.5-72b-instruct": 0.8320000000000001,
62
+ "llama-3.1-8b-instruct": 0.276,
63
+ "llama-3.1-70b-instruct": 0.6639999999999999,
64
+ "llama-3.2-3b-instruct": 0.196,
65
+ "llama-3.3-70b-instruct": 0.7360000000000001,
66
+ "mistral-large-instruct-2411": 0.8,
67
+ "gemma-2-27b-it": 0.5640000000000001,
68
+ "gemma-2-9b-it": 0.28800000000000003,
69
+ "deepseek-v3": 0.8640000000000001,
70
+ "deepseek-r1": 0.8240000000000001,
71
+ "qwq-32b": 0.8400000000000001,
72
+ "Average": 0.5997333333333333
73
+ },
74
+ "QuantumEnv": {
75
+ "qwen2.5-3b-instruct": 0.196,
76
+ "qwen2.5-7b-instruct": 0.532,
77
+ "qwen2.5-14b-instruct": 0.8720000000000001,
78
+ "qwen2.5-32b-instruct": 0.9039999999999999,
79
+ "qwen2.5-72b-instruct": 0.916,
80
+ "llama-3.1-8b-instruct": 0.45600000000000007,
81
+ "llama-3.1-70b-instruct": 0.7999999999999999,
82
+ "llama-3.2-3b-instruct": 0.168,
83
+ "llama-3.3-70b-instruct": 0.8480000000000001,
84
+ "mistral-large-instruct-2411": 0.8720000000000001,
85
+ "gemma-2-27b-it": 0.744,
86
+ "gemma-2-9b-it": 0.544,
87
+ "deepseek-v3": 0.884,
88
+ "deepseek-r1": 0.8640000000000001,
89
+ "qwq-32b": 0.868,
90
+ "Average": 0.6978666666666666
91
+ },
92
+ "AstronomyEnv": {
93
+ "qwen2.5-3b-instruct": 0.172,
94
+ "qwen2.5-7b-instruct": 0.42800000000000005,
95
+ "qwen2.5-14b-instruct": 0.716,
96
+ "qwen2.5-32b-instruct": 0.676,
97
+ "qwen2.5-72b-instruct": 0.748,
98
+ "llama-3.1-8b-instruct": 0.336,
99
+ "llama-3.1-70b-instruct": 0.692,
100
+ "llama-3.2-3b-instruct": 0.176,
101
+ "llama-3.3-70b-instruct": 0.6519999999999999,
102
+ "mistral-large-instruct-2411": 0.7999999999999999,
103
+ "gemma-2-27b-it": 0.508,
104
+ "gemma-2-9b-it": 0.372,
105
+ "deepseek-v3": 0.748,
106
+ "deepseek-r1": 0.8200000000000001,
107
+ "qwq-32b": 0.852,
108
+ "Average": 0.5797333333333333
109
+ },
110
+ "MusicGenresEnv": {
111
+ "qwen2.5-3b-instruct": 0.22000000000000003,
112
+ "qwen2.5-7b-instruct": 0.42000000000000004,
113
+ "qwen2.5-14b-instruct": 0.72,
114
+ "qwen2.5-32b-instruct": 0.716,
115
+ "qwen2.5-72b-instruct": 0.696,
116
+ "llama-3.1-8b-instruct": 0.35200000000000004,
117
+ "llama-3.1-70b-instruct": 0.6280000000000001,
118
+ "llama-3.2-3b-instruct": 0.136,
119
+ "llama-3.3-70b-instruct": 0.592,
120
+ "mistral-large-instruct-2411": 0.732,
121
+ "gemma-2-27b-it": 0.44800000000000006,
122
+ "gemma-2-9b-it": 0.332,
123
+ "deepseek-v3": 0.748,
124
+ "deepseek-r1": 0.792,
125
+ "qwq-32b": 0.876,
126
+ "Average": 0.5605333333333334
127
+ },
128
+ "CloudEnv": {
129
+ "qwen2.5-3b-instruct": 0.21199999999999997,
130
+ "qwen2.5-7b-instruct": 0.42000000000000004,
131
+ "qwen2.5-14b-instruct": 0.76,
132
+ "qwen2.5-32b-instruct": 0.656,
133
+ "qwen2.5-72b-instruct": 0.712,
134
+ "llama-3.1-8b-instruct": 0.42000000000000004,
135
+ "llama-3.1-70b-instruct": 0.664,
136
+ "llama-3.2-3b-instruct": 0.22800000000000004,
137
+ "llama-3.3-70b-instruct": 0.696,
138
+ "mistral-large-instruct-2411": 0.8360000000000001,
139
+ "gemma-2-27b-it": 0.6,
140
+ "gemma-2-9b-it": 0.4,
141
+ "deepseek-v3": 0.8200000000000001,
142
+ "deepseek-r1": 0.908,
143
+ "qwq-32b": 0.9120000000000001,
144
+ "Average": 0.6162666666666667
145
+ },
146
+ "CuisineEnv": {
147
+ "qwen2.5-3b-instruct": 0.21600000000000003,
148
+ "qwen2.5-7b-instruct": 0.316,
149
+ "qwen2.5-14b-instruct": 0.6960000000000001,
150
+ "qwen2.5-32b-instruct": 0.664,
151
+ "qwen2.5-72b-instruct": 0.656,
152
+ "llama-3.1-8b-instruct": 0.22799999999999998,
153
+ "llama-3.1-70b-instruct": 0.476,
154
+ "llama-3.2-3b-instruct": 0.152,
155
+ "llama-3.3-70b-instruct": 0.44400000000000006,
156
+ "mistral-large-instruct-2411": 0.644,
157
+ "gemma-2-27b-it": 0.27599999999999997,
158
+ "gemma-2-9b-it": 0.156,
159
+ "deepseek-v3": 0.8400000000000001,
160
+ "deepseek-r1": 0.7959999999999999,
161
+ "qwq-32b": 0.8800000000000001,
162
+ "Average": 0.49599999999999994
163
+ },
164
+ "PlantEnv": {
165
+ "qwen2.5-3b-instruct": 0.168,
166
+ "qwen2.5-7b-instruct": 0.236,
167
+ "qwen2.5-14b-instruct": 0.34,
168
+ "qwen2.5-32b-instruct": 0.22000000000000003,
169
+ "qwen2.5-72b-instruct": 0.22799999999999998,
170
+ "llama-3.1-8b-instruct": 0.148,
171
+ "llama-3.1-70b-instruct": 0.16,
172
+ "llama-3.2-3b-instruct": 0.084,
173
+ "llama-3.3-70b-instruct": 0.07599999999999998,
174
+ "mistral-large-instruct-2411": 0.264,
175
+ "gemma-2-27b-it": 0.14400000000000002,
176
+ "gemma-2-9b-it": 0.092,
177
+ "deepseek-v3": 0.512,
178
+ "deepseek-r1": 0.5,
179
+ "qwq-32b": 0.548,
180
+ "Average": 0.24800000000000003
181
+ },
182
+ "HistoricalEnv": {
183
+ "qwen2.5-3b-instruct": 0.24,
184
+ "qwen2.5-7b-instruct": 0.368,
185
+ "qwen2.5-14b-instruct": 0.5800000000000001,
186
+ "qwen2.5-32b-instruct": 0.476,
187
+ "qwen2.5-72b-instruct": 0.512,
188
+ "llama-3.1-8b-instruct": 0.332,
189
+ "llama-3.1-70b-instruct": 0.616,
190
+ "llama-3.2-3b-instruct": 0.2,
191
+ "llama-3.3-70b-instruct": 0.652,
192
+ "mistral-large-instruct-2411": 0.6880000000000001,
193
+ "gemma-2-27b-it": 0.5,
194
+ "gemma-2-9b-it": 0.376,
195
+ "deepseek-v3": 0.748,
196
+ "deepseek-r1": 0.828,
197
+ "qwq-32b": 0.884,
198
+ "Average": 0.5333333333333334
199
+ },
200
+ "GadgetEnv": {
201
+ "qwen2.5-3b-instruct": 0.124,
202
+ "qwen2.5-7b-instruct": 0.312,
203
+ "qwen2.5-14b-instruct": 0.852,
204
+ "qwen2.5-32b-instruct": 0.8640000000000001,
205
+ "qwen2.5-72b-instruct": 0.892,
206
+ "llama-3.1-8b-instruct": 0.284,
207
+ "llama-3.1-70b-instruct": 0.692,
208
+ "llama-3.2-3b-instruct": 0.11200000000000002,
209
+ "llama-3.3-70b-instruct": 0.7360000000000001,
210
+ "mistral-large-instruct-2411": 0.884,
211
+ "gemma-2-27b-it": 0.32799999999999996,
212
+ "gemma-2-9b-it": 0.184,
213
+ "deepseek-v3": 0.9640000000000001,
214
+ "deepseek-r1": 0.932,
215
+ "qwq-32b": 0.932,
216
+ "Average": 0.6061333333333334
217
+ },
218
+ "TimeTravelEnv": {
219
+ "qwen2.5-3b-instruct": 0.128,
220
+ "qwen2.5-7b-instruct": 0.292,
221
+ "qwen2.5-14b-instruct": 0.808,
222
+ "qwen2.5-32b-instruct": 0.828,
223
+ "qwen2.5-72b-instruct": 0.8039999999999999,
224
+ "llama-3.1-8b-instruct": 0.376,
225
+ "llama-3.1-70b-instruct": 0.684,
226
+ "llama-3.2-3b-instruct": 0.124,
227
+ "llama-3.3-70b-instruct": 0.716,
228
+ "mistral-large-instruct-2411": 0.884,
229
+ "gemma-2-27b-it": 0.32799999999999996,
230
+ "gemma-2-9b-it": 0.21600000000000003,
231
+ "deepseek-v3": 0.9399999999999998,
232
+ "deepseek-r1": 0.932,
233
+ "qwq-32b": 0.924,
234
+ "Average": 0.5989333333333333
235
+ },
236
+ "PollutionEnv": {
237
+ "qwen2.5-3b-instruct": 0.136,
238
+ "qwen2.5-7b-instruct": 0.328,
239
+ "qwen2.5-14b-instruct": 0.792,
240
+ "qwen2.5-32b-instruct": 0.7120000000000001,
241
+ "qwen2.5-72b-instruct": 0.704,
242
+ "llama-3.1-8b-instruct": 0.316,
243
+ "llama-3.1-70b-instruct": 0.664,
244
+ "llama-3.2-3b-instruct": 0.124,
245
+ "llama-3.3-70b-instruct": 0.6960000000000001,
246
+ "mistral-large-instruct-2411": 0.784,
247
+ "gemma-2-27b-it": 0.336,
248
+ "gemma-2-9b-it": 0.252,
249
+ "deepseek-v3": 0.8640000000000001,
250
+ "deepseek-r1": 0.8560000000000001,
251
+ "qwq-32b": 0.852,
252
+ "Average": 0.5610666666666666
253
+ },
254
+ "DemographicEnv": {
255
+ "qwen2.5-3b-instruct": 0.072,
256
+ "qwen2.5-7b-instruct": 0.42800000000000005,
257
+ "qwen2.5-14b-instruct": 0.68,
258
+ "qwen2.5-32b-instruct": 0.7799999999999999,
259
+ "qwen2.5-72b-instruct": 0.7719999999999999,
260
+ "llama-3.1-8b-instruct": 0.272,
261
+ "llama-3.1-70b-instruct": 0.6239999999999999,
262
+ "llama-3.2-3b-instruct": 0.176,
263
+ "llama-3.3-70b-instruct": 0.748,
264
+ "mistral-large-instruct-2411": 0.8200000000000001,
265
+ "gemma-2-27b-it": 0.356,
266
+ "gemma-2-9b-it": 0.156,
267
+ "deepseek-v3": 0.8960000000000001,
268
+ "deepseek-r1": 0.876,
269
+ "qwq-32b": 0.8960000000000001,
270
+ "Average": 0.5701333333333333
271
+ },
272
+ "GeneticEnv": {
273
+ "qwen2.5-3b-instruct": 0.084,
274
+ "qwen2.5-7b-instruct": 0.392,
275
+ "qwen2.5-14b-instruct": 0.884,
276
+ "qwen2.5-32b-instruct": 0.9279999999999999,
277
+ "qwen2.5-72b-instruct": 0.9400000000000001,
278
+ "llama-3.1-8b-instruct": 0.45999999999999996,
279
+ "llama-3.1-70b-instruct": 0.9,
280
+ "llama-3.2-3b-instruct": 0.192,
281
+ "llama-3.3-70b-instruct": 0.916,
282
+ "mistral-large-instruct-2411": 0.9040000000000001,
283
+ "gemma-2-27b-it": 0.776,
284
+ "gemma-2-9b-it": 0.548,
285
+ "deepseek-v3": 0.984,
286
+ "deepseek-r1": 0.952,
287
+ "qwq-32b": 0.932,
288
+ "Average": 0.7194666666666667
289
+ },
290
+ "CraftsmanEnv": {
291
+ "qwen2.5-3b-instruct": 0.14400000000000002,
292
+ "qwen2.5-7b-instruct": 0.256,
293
+ "qwen2.5-14b-instruct": 0.624,
294
+ "qwen2.5-32b-instruct": 0.736,
295
+ "qwen2.5-72b-instruct": 0.664,
296
+ "llama-3.1-8b-instruct": 0.22000000000000003,
297
+ "llama-3.1-70b-instruct": 0.524,
298
+ "llama-3.2-3b-instruct": 0.10800000000000001,
299
+ "llama-3.3-70b-instruct": 0.41600000000000004,
300
+ "mistral-large-instruct-2411": 0.7080000000000001,
301
+ "gemma-2-27b-it": 0.324,
302
+ "gemma-2-9b-it": 0.096,
303
+ "deepseek-v3": 0.9,
304
+ "deepseek-r1": 0.7879999999999999,
305
+ "qwq-32b": 0.8160000000000001,
306
+ "Average": 0.4882666666666667
307
+ },
308
+ "StarConstellationEnv": {
309
+ "qwen2.5-3b-instruct": 0.1,
310
+ "qwen2.5-7b-instruct": 0.332,
311
+ "qwen2.5-14b-instruct": 0.5960000000000001,
312
+ "qwen2.5-32b-instruct": 0.572,
313
+ "qwen2.5-72b-instruct": 0.5840000000000001,
314
+ "llama-3.1-8b-instruct": 0.376,
315
+ "llama-3.1-70b-instruct": 0.4640000000000001,
316
+ "llama-3.2-3b-instruct": 0.136,
317
+ "llama-3.3-70b-instruct": 0.41200000000000003,
318
+ "mistral-large-instruct-2411": 0.6120000000000001,
319
+ "gemma-2-27b-it": 0.472,
320
+ "gemma-2-9b-it": 0.22799999999999998,
321
+ "deepseek-v3": 0.744,
322
+ "deepseek-r1": 0.748,
323
+ "qwq-32b": 0.736,
324
+ "Average": 0.47413333333333335
325
+ },
326
+ "MythicalCreatureEnv": {
327
+ "qwen2.5-3b-instruct": 0.2,
328
+ "qwen2.5-7b-instruct": 0.324,
329
+ "qwen2.5-14b-instruct": 0.632,
330
+ "qwen2.5-32b-instruct": 0.712,
331
+ "qwen2.5-72b-instruct": 0.668,
332
+ "llama-3.1-8b-instruct": 0.31200000000000006,
333
+ "llama-3.1-70b-instruct": 0.62,
334
+ "llama-3.2-3b-instruct": 0.11200000000000002,
335
+ "llama-3.3-70b-instruct": 0.648,
336
+ "mistral-large-instruct-2411": 0.7480000000000001,
337
+ "gemma-2-27b-it": 0.42799999999999994,
338
+ "gemma-2-9b-it": 0.268,
339
+ "deepseek-v3": 0.8400000000000001,
340
+ "deepseek-r1": 0.8400000000000001,
341
+ "qwq-32b": 0.852,
342
+ "Average": 0.5469333333333333
343
+ },
344
+ "ArtStyleEnv": {
345
+ "qwen2.5-3b-instruct": 0.136,
346
+ "qwen2.5-7b-instruct": 0.332,
347
+ "qwen2.5-14b-instruct": 0.78,
348
+ "qwen2.5-32b-instruct": 0.8320000000000001,
349
+ "qwen2.5-72b-instruct": 0.748,
350
+ "llama-3.1-8b-instruct": 0.356,
351
+ "llama-3.1-70b-instruct": 0.616,
352
+ "llama-3.2-3b-instruct": 0.17200000000000001,
353
+ "llama-3.3-70b-instruct": 0.6199999999999999,
354
+ "mistral-large-instruct-2411": 0.828,
355
+ "gemma-2-27b-it": 0.43200000000000005,
356
+ "gemma-2-9b-it": 0.256,
357
+ "deepseek-v3": 0.876,
358
+ "deepseek-r1": 0.8200000000000001,
359
+ "qwq-32b": 0.868,
360
+ "Average": 0.5781333333333335
361
+ },
362
+ "CookingEnv": {
363
+ "qwen2.5-3b-instruct": 0.13999999999999999,
364
+ "qwen2.5-7b-instruct": 0.44799999999999995,
365
+ "qwen2.5-14b-instruct": 0.76,
366
+ "qwen2.5-32b-instruct": 0.7440000000000001,
367
+ "qwen2.5-72b-instruct": 0.7,
368
+ "llama-3.1-8b-instruct": 0.364,
369
+ "llama-3.1-70b-instruct": 0.6839999999999999,
370
+ "llama-3.2-3b-instruct": 0.156,
371
+ "llama-3.3-70b-instruct": 0.656,
372
+ "mistral-large-instruct-2411": 0.74,
373
+ "gemma-2-27b-it": 0.48,
374
+ "gemma-2-9b-it": 0.364,
375
+ "deepseek-v3": 0.8640000000000001,
376
+ "deepseek-r1": 0.812,
377
+ "qwq-32b": 0.9,
378
+ "Average": 0.5874666666666666
379
+ },
380
+ "HistoricalBattleEnv": {
381
+ "qwen2.5-3b-instruct": 0.256,
382
+ "qwen2.5-7b-instruct": 0.292,
383
+ "qwen2.5-14b-instruct": 0.45999999999999996,
384
+ "qwen2.5-32b-instruct": 0.476,
385
+ "qwen2.5-72b-instruct": 0.42400000000000004,
386
+ "llama-3.1-8b-instruct": 0.28400000000000003,
387
+ "llama-3.1-70b-instruct": 0.492,
388
+ "llama-3.2-3b-instruct": 0.148,
389
+ "llama-3.3-70b-instruct": 0.62,
390
+ "mistral-large-instruct-2411": 0.608,
391
+ "gemma-2-27b-it": 0.388,
392
+ "gemma-2-9b-it": 0.34,
393
+ "deepseek-v3": 0.724,
394
+ "deepseek-r1": 0.788,
395
+ "qwq-32b": 0.8560000000000001,
396
+ "Average": 0.47706666666666664
397
+ },
398
+ "FungalEnv": {
399
+ "qwen2.5-3b-instruct": 0.15999999999999998,
400
+ "qwen2.5-7b-instruct": 0.46399999999999997,
401
+ "qwen2.5-14b-instruct": 0.664,
402
+ "qwen2.5-32b-instruct": 0.728,
403
+ "qwen2.5-72b-instruct": 0.6839999999999999,
404
+ "llama-3.1-8b-instruct": 0.41600000000000004,
405
+ "llama-3.1-70b-instruct": 0.5840000000000001,
406
+ "llama-3.2-3b-instruct": 0.14,
407
+ "llama-3.3-70b-instruct": 0.644,
408
+ "mistral-large-instruct-2411": 0.7440000000000001,
409
+ "gemma-2-27b-it": 0.536,
410
+ "gemma-2-9b-it": 0.184,
411
+ "deepseek-v3": 0.844,
412
+ "deepseek-r1": 0.764,
413
+ "qwq-32b": 0.7879999999999999,
414
+ "Average": 0.5562666666666666
415
+ },
416
+ "CryptographyEnv": {
417
+ "qwen2.5-3b-instruct": 0.24000000000000005,
418
+ "qwen2.5-7b-instruct": 0.23199999999999998,
419
+ "qwen2.5-14b-instruct": 0.508,
420
+ "qwen2.5-32b-instruct": 0.5760000000000001,
421
+ "qwen2.5-72b-instruct": 0.528,
422
+ "llama-3.1-8b-instruct": 0.29600000000000004,
423
+ "llama-3.1-70b-instruct": 0.524,
424
+ "llama-3.2-3b-instruct": 0.11600000000000002,
425
+ "llama-3.3-70b-instruct": 0.512,
426
+ "mistral-large-instruct-2411": 0.6799999999999999,
427
+ "gemma-2-27b-it": 0.328,
428
+ "gemma-2-9b-it": 0.192,
429
+ "deepseek-v3": 0.784,
430
+ "deepseek-r1": 0.74,
431
+ "qwq-32b": 0.8480000000000001,
432
+ "Average": 0.4736
433
+ },
434
+ "StorageEnv": {
435
+ "qwen2.5-3b-instruct": 0.22800000000000004,
436
+ "qwen2.5-7b-instruct": 0.44000000000000006,
437
+ "qwen2.5-14b-instruct": 0.852,
438
+ "qwen2.5-32b-instruct": 0.884,
439
+ "qwen2.5-72b-instruct": 0.8119999999999999,
440
+ "llama-3.1-8b-instruct": 0.34800000000000003,
441
+ "llama-3.1-70b-instruct": 0.724,
442
+ "llama-3.2-3b-instruct": 0.21600000000000003,
443
+ "llama-3.3-70b-instruct": 0.796,
444
+ "mistral-large-instruct-2411": 0.8880000000000001,
445
+ "gemma-2-27b-it": 0.596,
446
+ "gemma-2-9b-it": 0.392,
447
+ "deepseek-v3": 0.9640000000000001,
448
+ "deepseek-r1": 0.9119999999999999,
449
+ "qwq-32b": 0.944,
450
+ "Average": 0.6663999999999999
451
+ },
452
+ "RoverEnv": {
453
+ "qwen2.5-3b-instruct": 0.14400000000000002,
454
+ "qwen2.5-7b-instruct": 0.236,
455
+ "qwen2.5-14b-instruct": 0.8480000000000001,
456
+ "qwen2.5-32b-instruct": 0.8360000000000001,
457
+ "qwen2.5-72b-instruct": 0.796,
458
+ "llama-3.1-8b-instruct": 0.28400000000000003,
459
+ "llama-3.1-70b-instruct": 0.612,
460
+ "llama-3.2-3b-instruct": 0.148,
461
+ "llama-3.3-70b-instruct": 0.724,
462
+ "mistral-large-instruct-2411": 0.828,
463
+ "gemma-2-27b-it": 0.4600000000000001,
464
+ "gemma-2-9b-it": 0.072,
465
+ "deepseek-v3": 0.9200000000000002,
466
+ "deepseek-r1": 0.9,
467
+ "qwq-32b": 0.8720000000000001,
468
+ "Average": 0.5786666666666668
469
+ },
470
+ "FashionEnv": {
471
+ "qwen2.5-3b-instruct": 0.17200000000000001,
472
+ "qwen2.5-7b-instruct": 0.304,
473
+ "qwen2.5-14b-instruct": 0.8240000000000001,
474
+ "qwen2.5-32b-instruct": 0.808,
475
+ "qwen2.5-72b-instruct": 0.768,
476
+ "llama-3.1-8b-instruct": 0.32,
477
+ "llama-3.1-70b-instruct": 0.6,
478
+ "llama-3.2-3b-instruct": 0.16399999999999998,
479
+ "llama-3.3-70b-instruct": 0.6160000000000001,
480
+ "mistral-large-instruct-2411": 0.756,
481
+ "gemma-2-27b-it": 0.524,
482
+ "gemma-2-9b-it": 0.292,
483
+ "deepseek-v3": 0.86,
484
+ "deepseek-r1": 0.756,
485
+ "qwq-32b": 0.86,
486
+ "Average": 0.5749333333333334
487
+ },
488
+ "LicenseEnv": {
489
+ "qwen2.5-3b-instruct": 0.196,
490
+ "qwen2.5-7b-instruct": 0.29200000000000004,
491
+ "qwen2.5-14b-instruct": 0.556,
492
+ "qwen2.5-32b-instruct": 0.44000000000000006,
493
+ "qwen2.5-72b-instruct": 0.484,
494
+ "llama-3.1-8b-instruct": 0.26,
495
+ "llama-3.1-70b-instruct": 0.496,
496
+ "llama-3.2-3b-instruct": 0.072,
497
+ "llama-3.3-70b-instruct": 0.45999999999999996,
498
+ "mistral-large-instruct-2411": 0.504,
499
+ "gemma-2-27b-it": 0.37600000000000006,
500
+ "gemma-2-9b-it": 0.296,
501
+ "deepseek-v3": 0.556,
502
+ "deepseek-r1": 0.52,
503
+ "qwq-32b": 0.5800000000000001,
504
+ "Average": 0.4058666666666667
505
+ },
506
+ "VirusClassificationEnv": {
507
+ "qwen2.5-3b-instruct": 0.22000000000000003,
508
+ "qwen2.5-7b-instruct": 0.28,
509
+ "qwen2.5-14b-instruct": 0.384,
510
+ "qwen2.5-32b-instruct": 0.38,
511
+ "qwen2.5-72b-instruct": 0.42800000000000005,
512
+ "llama-3.1-8b-instruct": 0.256,
513
+ "llama-3.1-70b-instruct": 0.332,
514
+ "llama-3.2-3b-instruct": 0.156,
515
+ "llama-3.3-70b-instruct": 0.396,
516
+ "mistral-large-instruct-2411": 0.532,
517
+ "gemma-2-27b-it": 0.34,
518
+ "gemma-2-9b-it": 0.31200000000000006,
519
+ "deepseek-v3": 0.536,
520
+ "deepseek-r1": 0.64,
521
+ "qwq-32b": 0.684,
522
+ "Average": 0.3917333333333333
523
+ },
524
+ "TestingEnv": {
525
+ "qwen2.5-3b-instruct": 0.19200000000000003,
526
+ "qwen2.5-7b-instruct": 0.22000000000000003,
527
+ "qwen2.5-14b-instruct": 0.608,
528
+ "qwen2.5-32b-instruct": 0.648,
529
+ "qwen2.5-72b-instruct": 0.708,
530
+ "llama-3.1-8b-instruct": 0.332,
531
+ "llama-3.1-70b-instruct": 0.68,
532
+ "llama-3.2-3b-instruct": 0.17200000000000001,
533
+ "llama-3.3-70b-instruct": 0.7040000000000001,
534
+ "mistral-large-instruct-2411": 0.764,
535
+ "gemma-2-27b-it": 0.22799999999999998,
536
+ "gemma-2-9b-it": 0.26,
537
+ "deepseek-v3": 0.8880000000000001,
538
+ "deepseek-r1": 0.764,
539
+ "qwq-32b": 0.7999999999999999,
540
+ "Average": 0.5312
541
+ },
542
+ "NarrativeDetectEnv": {
543
+ "qwen2.5-3b-instruct": 0.148,
544
+ "qwen2.5-7b-instruct": 0.30000000000000004,
545
+ "qwen2.5-14b-instruct": 0.552,
546
+ "qwen2.5-32b-instruct": 0.8440000000000001,
547
+ "qwen2.5-72b-instruct": 0.76,
548
+ "llama-3.1-8b-instruct": 0.28800000000000003,
549
+ "llama-3.1-70b-instruct": 0.6279999999999999,
550
+ "llama-3.2-3b-instruct": 0.10400000000000001,
551
+ "llama-3.3-70b-instruct": 0.704,
552
+ "mistral-large-instruct-2411": 0.7919999999999999,
553
+ "gemma-2-27b-it": 0.328,
554
+ "gemma-2-9b-it": 0.192,
555
+ "deepseek-v3": 0.8560000000000001,
556
+ "deepseek-r1": 0.748,
557
+ "qwq-32b": 0.784,
558
+ "Average": 0.5352
559
+ },
560
+ "RenewableEnergyEnv": {
561
+ "qwen2.5-3b-instruct": 0.184,
562
+ "qwen2.5-7b-instruct": 0.44399999999999995,
563
+ "qwen2.5-14b-instruct": 0.648,
564
+ "qwen2.5-32b-instruct": 0.932,
565
+ "qwen2.5-72b-instruct": 0.8880000000000001,
566
+ "llama-3.1-8b-instruct": 0.396,
567
+ "llama-3.1-70b-instruct": 0.812,
568
+ "llama-3.2-3b-instruct": 0.2,
569
+ "llama-3.3-70b-instruct": 0.8240000000000001,
570
+ "mistral-large-instruct-2411": 0.8560000000000001,
571
+ "gemma-2-27b-it": 0.348,
572
+ "gemma-2-9b-it": 0.188,
573
+ "deepseek-v3": 0.96,
574
+ "deepseek-r1": 0.9800000000000001,
575
+ "qwq-32b": 0.9800000000000001,
576
+ "Average": 0.6426666666666667
577
+ },
578
+ "CelestialEnv": {
579
+ "qwen2.5-3b-instruct": 0.20400000000000001,
580
+ "qwen2.5-7b-instruct": 0.252,
581
+ "qwen2.5-14b-instruct": 0.728,
582
+ "qwen2.5-32b-instruct": 0.792,
583
+ "qwen2.5-72b-instruct": 0.7239999999999999,
584
+ "llama-3.1-8b-instruct": 0.256,
585
+ "llama-3.1-70b-instruct": 0.6920000000000001,
586
+ "llama-3.2-3b-instruct": 0.192,
587
+ "llama-3.3-70b-instruct": 0.744,
588
+ "mistral-large-instruct-2411": 0.82,
589
+ "gemma-2-27b-it": 0.528,
590
+ "gemma-2-9b-it": 0.344,
591
+ "deepseek-v3": 0.8480000000000001,
592
+ "deepseek-r1": 0.8360000000000001,
593
+ "qwq-32b": 0.8879999999999999,
594
+ "Average": 0.5898666666666668
595
+ },
596
+ "SpiceEnv": {
597
+ "qwen2.5-3b-instruct": 0.21199999999999997,
598
+ "qwen2.5-7b-instruct": 0.332,
599
+ "qwen2.5-14b-instruct": 0.672,
600
+ "qwen2.5-32b-instruct": 0.476,
601
+ "qwen2.5-72b-instruct": 0.5880000000000001,
602
+ "llama-3.1-8b-instruct": 0.32799999999999996,
603
+ "llama-3.1-70b-instruct": 0.40800000000000003,
604
+ "llama-3.2-3b-instruct": 0.22000000000000003,
605
+ "llama-3.3-70b-instruct": 0.336,
606
+ "mistral-large-instruct-2411": 0.5800000000000001,
607
+ "gemma-2-27b-it": 0.28400000000000003,
608
+ "gemma-2-9b-it": 0.172,
609
+ "deepseek-v3": 0.908,
610
+ "deepseek-r1": 0.7679999999999999,
611
+ "qwq-32b": 0.8720000000000001,
612
+ "Average": 0.47706666666666664
613
+ },
614
+ "WildlifeEnv": {
615
+ "qwen2.5-3b-instruct": 0.21600000000000003,
616
+ "qwen2.5-7b-instruct": 0.352,
617
+ "qwen2.5-14b-instruct": 0.644,
618
+ "qwen2.5-32b-instruct": 0.592,
619
+ "qwen2.5-72b-instruct": 0.616,
620
+ "llama-3.1-8b-instruct": 0.316,
621
+ "llama-3.1-70b-instruct": 0.544,
622
+ "llama-3.2-3b-instruct": 0.23199999999999998,
623
+ "llama-3.3-70b-instruct": 0.616,
624
+ "mistral-large-instruct-2411": 0.628,
625
+ "gemma-2-27b-it": 0.45199999999999996,
626
+ "gemma-2-9b-it": 0.344,
627
+ "deepseek-v3": 0.736,
628
+ "deepseek-r1": 0.6040000000000001,
629
+ "qwq-32b": 0.716,
630
+ "Average": 0.5072
631
+ },
632
+ "VehicleEnv": {
633
+ "qwen2.5-3b-instruct": 0.172,
634
+ "qwen2.5-7b-instruct": 0.308,
635
+ "qwen2.5-14b-instruct": 0.54,
636
+ "qwen2.5-32b-instruct": 0.776,
637
+ "qwen2.5-72b-instruct": 0.78,
638
+ "llama-3.1-8b-instruct": 0.248,
639
+ "llama-3.1-70b-instruct": 0.62,
640
+ "llama-3.2-3b-instruct": 0.152,
641
+ "llama-3.3-70b-instruct": 0.6960000000000001,
642
+ "mistral-large-instruct-2411": 0.8800000000000001,
643
+ "gemma-2-27b-it": 0.44799999999999995,
644
+ "gemma-2-9b-it": 0.248,
645
+ "deepseek-v3": 0.9199999999999999,
646
+ "deepseek-r1": 0.9199999999999999,
647
+ "qwq-32b": 0.916,
648
+ "Average": 0.5749333333333333
649
+ },
650
+ "BeverageEnv": {
651
+ "qwen2.5-3b-instruct": 0.128,
652
+ "qwen2.5-7b-instruct": 0.296,
653
+ "qwen2.5-14b-instruct": 0.792,
654
+ "qwen2.5-32b-instruct": 0.6880000000000001,
655
+ "qwen2.5-72b-instruct": 0.724,
656
+ "llama-3.1-8b-instruct": 0.41200000000000003,
657
+ "llama-3.1-70b-instruct": 0.6199999999999999,
658
+ "llama-3.2-3b-instruct": 0.16399999999999998,
659
+ "llama-3.3-70b-instruct": 0.5800000000000001,
660
+ "mistral-large-instruct-2411": 0.748,
661
+ "gemma-2-27b-it": 0.40800000000000003,
662
+ "gemma-2-9b-it": 0.296,
663
+ "deepseek-v3": 0.8800000000000001,
664
+ "deepseek-r1": 0.7520000000000001,
665
+ "qwq-32b": 0.844,
666
+ "Average": 0.5554666666666667
667
+ },
668
+ "ControlEnv": {
669
+ "qwen2.5-3b-instruct": 0.12800000000000003,
670
+ "qwen2.5-7b-instruct": 0.364,
671
+ "qwen2.5-14b-instruct": 0.68,
672
+ "qwen2.5-32b-instruct": 0.8320000000000001,
673
+ "qwen2.5-72b-instruct": 0.8400000000000001,
674
+ "llama-3.1-8b-instruct": 0.364,
675
+ "llama-3.1-70b-instruct": 0.656,
676
+ "llama-3.2-3b-instruct": 0.15599999999999997,
677
+ "llama-3.3-70b-instruct": 0.6320000000000001,
678
+ "mistral-large-instruct-2411": 0.784,
679
+ "gemma-2-27b-it": 0.4640000000000001,
680
+ "gemma-2-9b-it": 0.18,
681
+ "deepseek-v3": 0.9119999999999999,
682
+ "deepseek-r1": 0.9119999999999999,
683
+ "qwq-32b": 0.932,
684
+ "Average": 0.5890666666666665
685
+ },
686
+ "CurrencyEnv": {
687
+ "qwen2.5-3b-instruct": 0.252,
688
+ "qwen2.5-7b-instruct": 0.392,
689
+ "qwen2.5-14b-instruct": 0.8560000000000001,
690
+ "qwen2.5-32b-instruct": 0.884,
691
+ "qwen2.5-72b-instruct": 0.836,
692
+ "llama-3.1-8b-instruct": 0.476,
693
+ "llama-3.1-70b-instruct": 0.7520000000000001,
694
+ "llama-3.2-3b-instruct": 0.22400000000000003,
695
+ "llama-3.3-70b-instruct": 0.7000000000000001,
696
+ "mistral-large-instruct-2411": 0.8960000000000001,
697
+ "gemma-2-27b-it": 0.68,
698
+ "gemma-2-9b-it": 0.196,
699
+ "deepseek-v3": 0.9800000000000001,
700
+ "deepseek-r1": 0.932,
701
+ "qwq-32b": 0.9640000000000001,
702
+ "Average": 0.668
703
+ },
704
+ "MarketingEnv": {
705
+ "qwen2.5-3b-instruct": 0.12,
706
+ "qwen2.5-7b-instruct": 0.34400000000000003,
707
+ "qwen2.5-14b-instruct": 0.524,
708
+ "qwen2.5-32b-instruct": 0.7479999999999999,
709
+ "qwen2.5-72b-instruct": 0.732,
710
+ "llama-3.1-8b-instruct": 0.30800000000000005,
711
+ "llama-3.1-70b-instruct": 0.7040000000000001,
712
+ "llama-3.2-3b-instruct": 0.14400000000000002,
713
+ "llama-3.3-70b-instruct": 0.7639999999999999,
714
+ "mistral-large-instruct-2411": 0.7600000000000001,
715
+ "gemma-2-27b-it": 0.32400000000000007,
716
+ "gemma-2-9b-it": 0.184,
717
+ "deepseek-v3": 0.812,
718
+ "deepseek-r1": 0.7959999999999999,
719
+ "qwq-32b": 0.8320000000000001,
720
+ "Average": 0.5397333333333333
721
+ },
722
+ "BotanicalEnv": {
723
+ "qwen2.5-3b-instruct": 0.18800000000000003,
724
+ "qwen2.5-7b-instruct": 0.316,
725
+ "qwen2.5-14b-instruct": 0.9119999999999999,
726
+ "qwen2.5-32b-instruct": 0.884,
727
+ "qwen2.5-72b-instruct": 0.9039999999999999,
728
+ "llama-3.1-8b-instruct": 0.4119999999999999,
729
+ "llama-3.1-70b-instruct": 0.836,
730
+ "llama-3.2-3b-instruct": 0.23600000000000004,
731
+ "llama-3.3-70b-instruct": 0.8480000000000001,
732
+ "mistral-large-instruct-2411": 0.8640000000000001,
733
+ "gemma-2-27b-it": 0.604,
734
+ "gemma-2-9b-it": 0.264,
735
+ "deepseek-v3": 0.9040000000000001,
736
+ "deepseek-r1": 0.9399999999999998,
737
+ "qwq-32b": 0.968,
738
+ "Average": 0.672
739
+ },
740
+ "CircusActEnv": {
741
+ "qwen2.5-3b-instruct": 0.17200000000000001,
742
+ "qwen2.5-7b-instruct": 0.32399999999999995,
743
+ "qwen2.5-14b-instruct": 0.64,
744
+ "qwen2.5-32b-instruct": 0.712,
745
+ "qwen2.5-72b-instruct": 0.768,
746
+ "llama-3.1-8b-instruct": 0.276,
747
+ "llama-3.1-70b-instruct": 0.648,
748
+ "llama-3.2-3b-instruct": 0.176,
749
+ "llama-3.3-70b-instruct": 0.62,
750
+ "mistral-large-instruct-2411": 0.748,
751
+ "gemma-2-27b-it": 0.384,
752
+ "gemma-2-9b-it": 0.29600000000000004,
753
+ "deepseek-v3": 0.8640000000000001,
754
+ "deepseek-r1": 0.82,
755
+ "qwq-32b": 0.8720000000000001,
756
+ "Average": 0.5546666666666668
757
+ },
758
+ "AudioDialectEnv": {
759
+ "qwen2.5-3b-instruct": 0.128,
760
+ "qwen2.5-7b-instruct": 0.312,
761
+ "qwen2.5-14b-instruct": 0.5800000000000001,
762
+ "qwen2.5-32b-instruct": 0.6,
763
+ "qwen2.5-72b-instruct": 0.528,
764
+ "llama-3.1-8b-instruct": 0.21600000000000003,
765
+ "llama-3.1-70b-instruct": 0.4,
766
+ "llama-3.2-3b-instruct": 0.132,
767
+ "llama-3.3-70b-instruct": 0.32399999999999995,
768
+ "mistral-large-instruct-2411": 0.68,
769
+ "gemma-2-27b-it": 0.28,
770
+ "gemma-2-9b-it": 0.11600000000000002,
771
+ "deepseek-v3": 0.7520000000000001,
772
+ "deepseek-r1": 0.7919999999999999,
773
+ "qwq-32b": 0.8119999999999999,
774
+ "Average": 0.4434666666666666
775
+ },
776
+ "LeadershipEnv": {
777
+ "qwen2.5-3b-instruct": 0.164,
778
+ "qwen2.5-7b-instruct": 0.372,
779
+ "qwen2.5-14b-instruct": 0.7,
780
+ "qwen2.5-32b-instruct": 0.732,
781
+ "qwen2.5-72b-instruct": 0.7639999999999999,
782
+ "llama-3.1-8b-instruct": 0.364,
783
+ "llama-3.1-70b-instruct": 0.708,
784
+ "llama-3.2-3b-instruct": 0.128,
785
+ "llama-3.3-70b-instruct": 0.6920000000000001,
786
+ "mistral-large-instruct-2411": 0.728,
787
+ "gemma-2-27b-it": 0.46799999999999997,
788
+ "gemma-2-9b-it": 0.20400000000000001,
789
+ "deepseek-v3": 0.8200000000000001,
790
+ "deepseek-r1": 0.748,
791
+ "qwq-32b": 0.828,
792
+ "Average": 0.5613333333333334
793
+ },
794
+ "TransportEnv": {
795
+ "qwen2.5-3b-instruct": 0.196,
796
+ "qwen2.5-7b-instruct": 0.372,
797
+ "qwen2.5-14b-instruct": 0.716,
798
+ "qwen2.5-32b-instruct": 0.732,
799
+ "qwen2.5-72b-instruct": 0.8,
800
+ "llama-3.1-8b-instruct": 0.316,
801
+ "llama-3.1-70b-instruct": 0.648,
802
+ "llama-3.2-3b-instruct": 0.15200000000000002,
803
+ "llama-3.3-70b-instruct": 0.6000000000000001,
804
+ "mistral-large-instruct-2411": 0.7879999999999999,
805
+ "gemma-2-27b-it": 0.44399999999999995,
806
+ "gemma-2-9b-it": 0.364,
807
+ "deepseek-v3": 0.8640000000000001,
808
+ "deepseek-r1": 0.8240000000000001,
809
+ "qwq-32b": 0.9199999999999999,
810
+ "Average": 0.5824
811
+ },
812
+ "EcologicalEnv": {
813
+ "qwen2.5-3b-instruct": 0.152,
814
+ "qwen2.5-7b-instruct": 0.45600000000000007,
815
+ "qwen2.5-14b-instruct": 0.748,
816
+ "qwen2.5-32b-instruct": 0.82,
817
+ "qwen2.5-72b-instruct": 0.792,
818
+ "llama-3.1-8b-instruct": 0.42000000000000004,
819
+ "llama-3.1-70b-instruct": 0.692,
820
+ "llama-3.2-3b-instruct": 0.21600000000000003,
821
+ "llama-3.3-70b-instruct": 0.64,
822
+ "mistral-large-instruct-2411": 0.772,
823
+ "gemma-2-27b-it": 0.5680000000000001,
824
+ "gemma-2-9b-it": 0.46799999999999997,
825
+ "deepseek-v3": 0.868,
826
+ "deepseek-r1": 0.8720000000000001,
827
+ "qwq-32b": 0.8879999999999999,
828
+ "Average": 0.6248
829
+ },
830
+ "MythicEnv": {
831
+ "qwen2.5-3b-instruct": 0.132,
832
+ "qwen2.5-7b-instruct": 0.36,
833
+ "qwen2.5-14b-instruct": 0.744,
834
+ "qwen2.5-32b-instruct": 0.74,
835
+ "qwen2.5-72b-instruct": 0.672,
836
+ "llama-3.1-8b-instruct": 0.236,
837
+ "llama-3.1-70b-instruct": 0.596,
838
+ "llama-3.2-3b-instruct": 0.12,
839
+ "llama-3.3-70b-instruct": 0.576,
840
+ "mistral-large-instruct-2411": 0.6960000000000001,
841
+ "gemma-2-27b-it": 0.45599999999999996,
842
+ "gemma-2-9b-it": 0.136,
843
+ "deepseek-v3": 0.8960000000000001,
844
+ "deepseek-r1": 0.8720000000000001,
845
+ "qwq-32b": 0.8400000000000001,
846
+ "Average": 0.5381333333333332
847
+ },
848
+ "EnzymeEnv": {
849
+ "qwen2.5-3b-instruct": 0.252,
850
+ "qwen2.5-7b-instruct": 0.43200000000000005,
851
+ "qwen2.5-14b-instruct": 0.636,
852
+ "qwen2.5-32b-instruct": 0.676,
853
+ "qwen2.5-72b-instruct": 0.676,
854
+ "llama-3.1-8b-instruct": 0.316,
855
+ "llama-3.1-70b-instruct": 0.552,
856
+ "llama-3.2-3b-instruct": 0.192,
857
+ "llama-3.3-70b-instruct": 0.5640000000000001,
858
+ "mistral-large-instruct-2411": 0.732,
859
+ "gemma-2-27b-it": 0.43600000000000005,
860
+ "gemma-2-9b-it": 0.264,
861
+ "deepseek-v3": 0.8400000000000001,
862
+ "deepseek-r1": 0.76,
863
+ "qwq-32b": 0.804,
864
+ "Average": 0.5421333333333334
865
+ },
866
+ "OSKernelEnv": {
867
+ "qwen2.5-3b-instruct": 0.192,
868
+ "qwen2.5-7b-instruct": 0.28400000000000003,
869
+ "qwen2.5-14b-instruct": 0.8119999999999999,
870
+ "qwen2.5-32b-instruct": 0.784,
871
+ "qwen2.5-72b-instruct": 0.788,
872
+ "llama-3.1-8b-instruct": 0.316,
873
+ "llama-3.1-70b-instruct": 0.6920000000000001,
874
+ "llama-3.2-3b-instruct": 0.128,
875
+ "llama-3.3-70b-instruct": 0.74,
876
+ "mistral-large-instruct-2411": 0.8559999999999999,
877
+ "gemma-2-27b-it": 0.46399999999999997,
878
+ "gemma-2-9b-it": 0.2,
879
+ "deepseek-v3": 0.9480000000000001,
880
+ "deepseek-r1": 0.96,
881
+ "qwq-32b": 0.984,
882
+ "Average": 0.6098666666666668
883
+ },
884
+ "MineralClassificationEnv": {
885
+ "qwen2.5-3b-instruct": 0.11600000000000002,
886
+ "qwen2.5-7b-instruct": 0.248,
887
+ "qwen2.5-14b-instruct": 0.8320000000000001,
888
+ "qwen2.5-32b-instruct": 0.9040000000000001,
889
+ "qwen2.5-72b-instruct": 0.884,
890
+ "llama-3.1-8b-instruct": 0.384,
891
+ "llama-3.1-70b-instruct": 0.8240000000000001,
892
+ "llama-3.2-3b-instruct": 0.14800000000000002,
893
+ "llama-3.3-70b-instruct": 0.8960000000000001,
894
+ "mistral-large-instruct-2411": 0.908,
895
+ "gemma-2-27b-it": 0.508,
896
+ "gemma-2-9b-it": 0.268,
897
+ "deepseek-v3": 0.984,
898
+ "deepseek-r1": 0.9199999999999999,
899
+ "qwq-32b": 0.9640000000000001,
900
+ "Average": 0.6525333333333333
901
+ },
902
+ "EconomicEnv": {
903
+ "qwen2.5-3b-instruct": 0.136,
904
+ "qwen2.5-7b-instruct": 0.24,
905
+ "qwen2.5-14b-instruct": 0.8560000000000001,
906
+ "qwen2.5-32b-instruct": 0.9199999999999999,
907
+ "qwen2.5-72b-instruct": 0.8960000000000001,
908
+ "llama-3.1-8b-instruct": 0.43600000000000005,
909
+ "llama-3.1-70b-instruct": 0.808,
910
+ "llama-3.2-3b-instruct": 0.152,
911
+ "llama-3.3-70b-instruct": 0.8240000000000001,
912
+ "mistral-large-instruct-2411": 0.924,
913
+ "gemma-2-27b-it": 0.45199999999999996,
914
+ "gemma-2-9b-it": 0.36,
915
+ "deepseek-v3": 0.9559999999999998,
916
+ "deepseek-r1": 0.9359999999999999,
917
+ "qwq-32b": 0.9719999999999999,
918
+ "Average": 0.6578666666666667
919
+ },
920
+ "DetectiveEnv": {
921
+ "qwen2.5-3b-instruct": 0.168,
922
+ "qwen2.5-7b-instruct": 0.38,
923
+ "qwen2.5-14b-instruct": 0.836,
924
+ "qwen2.5-32b-instruct": 0.884,
925
+ "qwen2.5-72b-instruct": 0.8480000000000001,
926
+ "llama-3.1-8b-instruct": 0.34800000000000003,
927
+ "llama-3.1-70b-instruct": 0.74,
928
+ "llama-3.2-3b-instruct": 0.248,
929
+ "llama-3.3-70b-instruct": 0.792,
930
+ "mistral-large-instruct-2411": 0.8960000000000001,
931
+ "gemma-2-27b-it": 0.512,
932
+ "gemma-2-9b-it": 0.33199999999999996,
933
+ "deepseek-v3": 0.976,
934
+ "deepseek-r1": 0.9640000000000001,
935
+ "qwq-32b": 0.984,
936
+ "Average": 0.6605333333333333
937
+ },
938
+ "ChessEnv": {
939
+ "qwen2.5-3b-instruct": 0.184,
940
+ "qwen2.5-7b-instruct": 0.27999999999999997,
941
+ "qwen2.5-14b-instruct": 0.592,
942
+ "qwen2.5-32b-instruct": 0.616,
943
+ "qwen2.5-72b-instruct": 0.5720000000000001,
944
+ "llama-3.1-8b-instruct": 0.188,
945
+ "llama-3.1-70b-instruct": 0.6639999999999999,
946
+ "llama-3.2-3b-instruct": 0.084,
947
+ "llama-3.3-70b-instruct": 0.6280000000000001,
948
+ "mistral-large-instruct-2411": 0.744,
949
+ "gemma-2-27b-it": 0.30000000000000004,
950
+ "gemma-2-9b-it": 0.096,
951
+ "deepseek-v3": 0.696,
952
+ "deepseek-r1": 0.6519999999999999,
953
+ "qwq-32b": 0.664,
954
+ "Average": 0.4639999999999999
955
+ },
956
+ "MythicalEnv": {
957
+ "qwen2.5-3b-instruct": 0.2,
958
+ "qwen2.5-7b-instruct": 0.336,
959
+ "qwen2.5-14b-instruct": 0.8039999999999999,
960
+ "qwen2.5-32b-instruct": 0.712,
961
+ "qwen2.5-72b-instruct": 0.632,
962
+ "llama-3.1-8b-instruct": 0.356,
963
+ "llama-3.1-70b-instruct": 0.54,
964
+ "llama-3.2-3b-instruct": 0.16,
965
+ "llama-3.3-70b-instruct": 0.556,
966
+ "mistral-large-instruct-2411": 0.728,
967
+ "gemma-2-27b-it": 0.54,
968
+ "gemma-2-9b-it": 0.404,
969
+ "deepseek-v3": 0.9279999999999999,
970
+ "deepseek-r1": 0.8959999999999999,
971
+ "qwq-32b": 0.876,
972
+ "Average": 0.5778666666666666
973
+ },
974
+ "ChemicalCompoundsEnv": {
975
+ "qwen2.5-3b-instruct": 0.18,
976
+ "qwen2.5-7b-instruct": 0.252,
977
+ "qwen2.5-14b-instruct": 0.40800000000000003,
978
+ "qwen2.5-32b-instruct": 0.30000000000000004,
979
+ "qwen2.5-72b-instruct": 0.28400000000000003,
980
+ "llama-3.1-8b-instruct": 0.148,
981
+ "llama-3.1-70b-instruct": 0.28,
982
+ "llama-3.2-3b-instruct": 0.14,
983
+ "llama-3.3-70b-instruct": 0.18000000000000002,
984
+ "mistral-large-instruct-2411": 0.43200000000000005,
985
+ "gemma-2-27b-it": 0.23200000000000004,
986
+ "gemma-2-9b-it": 0.13599999999999998,
987
+ "deepseek-v3": 0.46799999999999997,
988
+ "deepseek-r1": 0.624,
989
+ "qwq-32b": 0.752,
990
+ "Average": 0.32106666666666667
991
+ },
992
+ "ArchitecturalEnv": {
993
+ "qwen2.5-3b-instruct": 0.20400000000000001,
994
+ "qwen2.5-7b-instruct": 0.316,
995
+ "qwen2.5-14b-instruct": 0.72,
996
+ "qwen2.5-32b-instruct": 0.66,
997
+ "qwen2.5-72b-instruct": 0.7120000000000001,
998
+ "llama-3.1-8b-instruct": 0.256,
999
+ "llama-3.1-70b-instruct": 0.556,
1000
+ "llama-3.2-3b-instruct": 0.132,
1001
+ "llama-3.3-70b-instruct": 0.508,
1002
+ "mistral-large-instruct-2411": 0.724,
1003
+ "gemma-2-27b-it": 0.488,
1004
+ "gemma-2-9b-it": 0.236,
1005
+ "deepseek-v3": 0.82,
1006
+ "deepseek-r1": 0.744,
1007
+ "qwq-32b": 0.8240000000000001,
1008
+ "Average": 0.5266666666666666
1009
+ },
1010
+ "ComputationEnv": {
1011
+ "qwen2.5-3b-instruct": 0.152,
1012
+ "qwen2.5-7b-instruct": 0.248,
1013
+ "qwen2.5-14b-instruct": 0.76,
1014
+ "qwen2.5-32b-instruct": 0.884,
1015
+ "qwen2.5-72b-instruct": 0.8560000000000001,
1016
+ "llama-3.1-8b-instruct": 0.32799999999999996,
1017
+ "llama-3.1-70b-instruct": 0.788,
1018
+ "llama-3.2-3b-instruct": 0.13999999999999999,
1019
+ "llama-3.3-70b-instruct": 0.8560000000000001,
1020
+ "mistral-large-instruct-2411": 0.828,
1021
+ "gemma-2-27b-it": 0.45199999999999996,
1022
+ "gemma-2-9b-it": 0.252,
1023
+ "deepseek-v3": 0.96,
1024
+ "deepseek-r1": 0.9399999999999998,
1025
+ "qwq-32b": 0.908,
1026
+ "Average": 0.6234666666666667
1027
+ },
1028
+ "MachinePartEnv": {
1029
+ "qwen2.5-3b-instruct": 0.14,
1030
+ "qwen2.5-7b-instruct": 0.32,
1031
+ "qwen2.5-14b-instruct": 0.8240000000000001,
1032
+ "qwen2.5-32b-instruct": 0.8800000000000001,
1033
+ "qwen2.5-72b-instruct": 0.828,
1034
+ "llama-3.1-8b-instruct": 0.376,
1035
+ "llama-3.1-70b-instruct": 0.8200000000000001,
1036
+ "llama-3.2-3b-instruct": 0.168,
1037
+ "llama-3.3-70b-instruct": 0.8960000000000001,
1038
+ "mistral-large-instruct-2411": 0.876,
1039
+ "gemma-2-27b-it": 0.508,
1040
+ "gemma-2-9b-it": 0.268,
1041
+ "deepseek-v3": 0.9719999999999999,
1042
+ "deepseek-r1": 0.952,
1043
+ "qwq-32b": 0.916,
1044
+ "Average": 0.6496
1045
+ },
1046
+ "LiteraryEnv": {
1047
+ "qwen2.5-3b-instruct": 0.10400000000000001,
1048
+ "qwen2.5-7b-instruct": 0.328,
1049
+ "qwen2.5-14b-instruct": 0.8800000000000001,
1050
+ "qwen2.5-32b-instruct": 0.9279999999999999,
1051
+ "qwen2.5-72b-instruct": 0.9,
1052
+ "llama-3.1-8b-instruct": 0.336,
1053
+ "llama-3.1-70b-instruct": 0.664,
1054
+ "llama-3.2-3b-instruct": 0.13999999999999999,
1055
+ "llama-3.3-70b-instruct": 0.664,
1056
+ "mistral-large-instruct-2411": 0.884,
1057
+ "gemma-2-27b-it": 0.44399999999999995,
1058
+ "gemma-2-9b-it": 0.13999999999999999,
1059
+ "deepseek-v3": 0.984,
1060
+ "deepseek-r1": 0.9119999999999999,
1061
+ "qwq-32b": 0.968,
1062
+ "Average": 0.6184
1063
+ },
1064
+ "MarineEnv": {
1065
+ "qwen2.5-3b-instruct": 0.144,
1066
+ "qwen2.5-7b-instruct": 0.384,
1067
+ "qwen2.5-14b-instruct": 0.8720000000000001,
1068
+ "qwen2.5-32b-instruct": 0.844,
1069
+ "qwen2.5-72b-instruct": 0.8320000000000001,
1070
+ "llama-3.1-8b-instruct": 0.308,
1071
+ "llama-3.1-70b-instruct": 0.636,
1072
+ "llama-3.2-3b-instruct": 0.12000000000000002,
1073
+ "llama-3.3-70b-instruct": 0.704,
1074
+ "mistral-large-instruct-2411": 0.7879999999999999,
1075
+ "gemma-2-27b-it": 0.484,
1076
+ "gemma-2-9b-it": 0.23199999999999998,
1077
+ "deepseek-v3": 0.884,
1078
+ "deepseek-r1": 0.9,
1079
+ "qwq-32b": 0.8880000000000001,
1080
+ "Average": 0.6013333333333334
1081
+ },
1082
+ "PhilosophyEnv": {
1083
+ "qwen2.5-3b-instruct": 0.144,
1084
+ "qwen2.5-7b-instruct": 0.3,
1085
+ "qwen2.5-14b-instruct": 0.7280000000000001,
1086
+ "qwen2.5-32b-instruct": 0.82,
1087
+ "qwen2.5-72b-instruct": 0.8719999999999999,
1088
+ "llama-3.1-8b-instruct": 0.32799999999999996,
1089
+ "llama-3.1-70b-instruct": 0.764,
1090
+ "llama-3.2-3b-instruct": 0.036000000000000004,
1091
+ "llama-3.3-70b-instruct": 0.796,
1092
+ "mistral-large-instruct-2411": 0.7879999999999999,
1093
+ "gemma-2-27b-it": 0.372,
1094
+ "gemma-2-9b-it": 0.28,
1095
+ "deepseek-v3": 0.844,
1096
+ "deepseek-r1": 0.78,
1097
+ "qwq-32b": 0.8320000000000001,
1098
+ "Average": 0.5789333333333334
1099
+ },
1100
+ "ArchaeologicalEnv": {
1101
+ "qwen2.5-3b-instruct": 0.18,
1102
+ "qwen2.5-7b-instruct": 0.38,
1103
+ "qwen2.5-14b-instruct": 0.58,
1104
+ "qwen2.5-32b-instruct": 0.608,
1105
+ "qwen2.5-72b-instruct": 0.5640000000000001,
1106
+ "llama-3.1-8b-instruct": 0.26,
1107
+ "llama-3.1-70b-instruct": 0.608,
1108
+ "llama-3.2-3b-instruct": 0.192,
1109
+ "llama-3.3-70b-instruct": 0.548,
1110
+ "mistral-large-instruct-2411": 0.64,
1111
+ "gemma-2-27b-it": 0.476,
1112
+ "gemma-2-9b-it": 0.30000000000000004,
1113
+ "deepseek-v3": 0.916,
1114
+ "deepseek-r1": 0.7040000000000001,
1115
+ "qwq-32b": 0.7559999999999999,
1116
+ "Average": 0.5141333333333333
1117
+ },
1118
+ "GemstoneEnv": {
1119
+ "qwen2.5-3b-instruct": 0.192,
1120
+ "qwen2.5-7b-instruct": 0.264,
1121
+ "qwen2.5-14b-instruct": 0.492,
1122
+ "qwen2.5-32b-instruct": 0.45599999999999996,
1123
+ "qwen2.5-72b-instruct": 0.44000000000000006,
1124
+ "llama-3.1-8b-instruct": 0.192,
1125
+ "llama-3.1-70b-instruct": 0.40800000000000003,
1126
+ "llama-3.2-3b-instruct": 0.15200000000000002,
1127
+ "llama-3.3-70b-instruct": 0.45599999999999996,
1128
+ "mistral-large-instruct-2411": 0.528,
1129
+ "gemma-2-27b-it": 0.33999999999999997,
1130
+ "gemma-2-9b-it": 0.256,
1131
+ "deepseek-v3": 0.5680000000000001,
1132
+ "deepseek-r1": 0.5680000000000001,
1133
+ "qwq-32b": 0.636,
1134
+ "Average": 0.3965333333333333
1135
+ },
1136
+ "MicrobiologyEnv": {
1137
+ "qwen2.5-3b-instruct": 0.14400000000000002,
1138
+ "qwen2.5-7b-instruct": 0.38400000000000006,
1139
+ "qwen2.5-14b-instruct": 0.752,
1140
+ "qwen2.5-32b-instruct": 0.7,
1141
+ "qwen2.5-72b-instruct": 0.844,
1142
+ "llama-3.1-8b-instruct": 0.316,
1143
+ "llama-3.1-70b-instruct": 0.512,
1144
+ "llama-3.2-3b-instruct": 0.12000000000000002,
1145
+ "llama-3.3-70b-instruct": 0.496,
1146
+ "mistral-large-instruct-2411": 0.764,
1147
+ "gemma-2-27b-it": 0.504,
1148
+ "gemma-2-9b-it": 0.172,
1149
+ "deepseek-v3": 0.9279999999999999,
1150
+ "deepseek-r1": 0.952,
1151
+ "qwq-32b": 0.932,
1152
+ "Average": 0.568
1153
+ },
1154
+ "SciFiEnv": {
1155
+ "qwen2.5-3b-instruct": 0.192,
1156
+ "qwen2.5-7b-instruct": 0.384,
1157
+ "qwen2.5-14b-instruct": 0.7879999999999999,
1158
+ "qwen2.5-32b-instruct": 0.776,
1159
+ "qwen2.5-72b-instruct": 0.7879999999999999,
1160
+ "llama-3.1-8b-instruct": 0.35200000000000004,
1161
+ "llama-3.1-70b-instruct": 0.664,
1162
+ "llama-3.2-3b-instruct": 0.164,
1163
+ "llama-3.3-70b-instruct": 0.588,
1164
+ "mistral-large-instruct-2411": 0.736,
1165
+ "gemma-2-27b-it": 0.52,
1166
+ "gemma-2-9b-it": 0.33599999999999997,
1167
+ "deepseek-v3": 0.9279999999999999,
1168
+ "deepseek-r1": 0.9199999999999999,
1169
+ "qwq-32b": 0.9,
1170
+ "Average": 0.6023999999999999
1171
+ },
1172
+ "HormoneEnv": {
1173
+ "qwen2.5-3b-instruct": 0.152,
1174
+ "qwen2.5-7b-instruct": 0.40800000000000003,
1175
+ "qwen2.5-14b-instruct": 0.7999999999999999,
1176
+ "qwen2.5-32b-instruct": 0.784,
1177
+ "qwen2.5-72b-instruct": 0.764,
1178
+ "llama-3.1-8b-instruct": 0.336,
1179
+ "llama-3.1-70b-instruct": 0.76,
1180
+ "llama-3.2-3b-instruct": 0.184,
1181
+ "llama-3.3-70b-instruct": 0.8480000000000001,
1182
+ "mistral-large-instruct-2411": 0.8,
1183
+ "gemma-2-27b-it": 0.524,
1184
+ "gemma-2-9b-it": 0.312,
1185
+ "deepseek-v3": 0.9480000000000001,
1186
+ "deepseek-r1": 0.944,
1187
+ "qwq-32b": 0.852,
1188
+ "Average": 0.6277333333333334
1189
+ },
1190
+ "SculptorEnv": {
1191
+ "qwen2.5-3b-instruct": 0.23200000000000004,
1192
+ "qwen2.5-7b-instruct": 0.4159999999999999,
1193
+ "qwen2.5-14b-instruct": 0.7079999999999999,
1194
+ "qwen2.5-32b-instruct": 0.636,
1195
+ "qwen2.5-72b-instruct": 0.6,
1196
+ "llama-3.1-8b-instruct": 0.22799999999999998,
1197
+ "llama-3.1-70b-instruct": 0.484,
1198
+ "llama-3.2-3b-instruct": 0.188,
1199
+ "llama-3.3-70b-instruct": 0.532,
1200
+ "mistral-large-instruct-2411": 0.684,
1201
+ "gemma-2-27b-it": 0.30000000000000004,
1202
+ "gemma-2-9b-it": 0.156,
1203
+ "deepseek-v3": 0.788,
1204
+ "deepseek-r1": 0.7479999999999999,
1205
+ "qwq-32b": 0.8119999999999999,
1206
+ "Average": 0.5008
1207
+ },
1208
+ "NeuroEnv": {
1209
+ "qwen2.5-3b-instruct": 0.10800000000000001,
1210
+ "qwen2.5-7b-instruct": 0.24400000000000005,
1211
+ "qwen2.5-14b-instruct": 0.8960000000000001,
1212
+ "qwen2.5-32b-instruct": 0.892,
1213
+ "qwen2.5-72b-instruct": 0.8879999999999999,
1214
+ "llama-3.1-8b-instruct": 0.512,
1215
+ "llama-3.1-70b-instruct": 0.8880000000000001,
1216
+ "llama-3.2-3b-instruct": 0.20400000000000001,
1217
+ "llama-3.3-70b-instruct": 0.9279999999999999,
1218
+ "mistral-large-instruct-2411": 0.8880000000000001,
1219
+ "gemma-2-27b-it": 0.72,
1220
+ "gemma-2-9b-it": 0.42800000000000005,
1221
+ "deepseek-v3": 0.952,
1222
+ "deepseek-r1": 0.932,
1223
+ "qwq-32b": 0.852,
1224
+ "Average": 0.6888000000000001
1225
+ },
1226
+ "OceanEnv": {
1227
+ "qwen2.5-3b-instruct": 0.2,
1228
+ "qwen2.5-7b-instruct": 0.45999999999999996,
1229
+ "qwen2.5-14b-instruct": 0.6160000000000001,
1230
+ "qwen2.5-32b-instruct": 0.6000000000000001,
1231
+ "qwen2.5-72b-instruct": 0.62,
1232
+ "llama-3.1-8b-instruct": 0.36400000000000005,
1233
+ "llama-3.1-70b-instruct": 0.5680000000000001,
1234
+ "llama-3.2-3b-instruct": 0.156,
1235
+ "llama-3.3-70b-instruct": 0.476,
1236
+ "mistral-large-instruct-2411": 0.656,
1237
+ "gemma-2-27b-it": 0.43200000000000005,
1238
+ "gemma-2-9b-it": 0.248,
1239
+ "deepseek-v3": 0.852,
1240
+ "deepseek-r1": 0.836,
1241
+ "qwq-32b": 0.8240000000000001,
1242
+ "Average": 0.5272000000000001
1243
+ },
1244
+ "MineralEnv": {
1245
+ "qwen2.5-3b-instruct": 0.14400000000000002,
1246
+ "qwen2.5-7b-instruct": 0.38,
1247
+ "qwen2.5-14b-instruct": 0.768,
1248
+ "qwen2.5-32b-instruct": 0.6960000000000001,
1249
+ "qwen2.5-72b-instruct": 0.684,
1250
+ "llama-3.1-8b-instruct": 0.29600000000000004,
1251
+ "llama-3.1-70b-instruct": 0.556,
1252
+ "llama-3.2-3b-instruct": 0.16,
1253
+ "llama-3.3-70b-instruct": 0.56,
1254
+ "mistral-large-instruct-2411": 0.66,
1255
+ "gemma-2-27b-it": 0.384,
1256
+ "gemma-2-9b-it": 0.17200000000000001,
1257
+ "deepseek-v3": 0.8480000000000001,
1258
+ "deepseek-r1": 0.82,
1259
+ "qwq-32b": 0.8720000000000001,
1260
+ "Average": 0.5333333333333333
1261
+ },
1262
+ "FishEnv": {
1263
+ "qwen2.5-3b-instruct": 0.188,
1264
+ "qwen2.5-7b-instruct": 0.38,
1265
+ "qwen2.5-14b-instruct": 0.732,
1266
+ "qwen2.5-32b-instruct": 0.668,
1267
+ "qwen2.5-72b-instruct": 0.7200000000000001,
1268
+ "llama-3.1-8b-instruct": 0.392,
1269
+ "llama-3.1-70b-instruct": 0.624,
1270
+ "llama-3.2-3b-instruct": 0.13599999999999998,
1271
+ "llama-3.3-70b-instruct": 0.616,
1272
+ "mistral-large-instruct-2411": 0.736,
1273
+ "gemma-2-27b-it": 0.508,
1274
+ "gemma-2-9b-it": 0.268,
1275
+ "deepseek-v3": 0.86,
1276
+ "deepseek-r1": 0.868,
1277
+ "qwq-32b": 0.924,
1278
+ "Average": 0.5746666666666667
1279
+ },
1280
+ "MartialArtsEnv": {
1281
+ "qwen2.5-3b-instruct": 0.184,
1282
+ "qwen2.5-7b-instruct": 0.43200000000000005,
1283
+ "qwen2.5-14b-instruct": 0.672,
1284
+ "qwen2.5-32b-instruct": 0.5640000000000001,
1285
+ "qwen2.5-72b-instruct": 0.56,
1286
+ "llama-3.1-8b-instruct": 0.276,
1287
+ "llama-3.1-70b-instruct": 0.54,
1288
+ "llama-3.2-3b-instruct": 0.2,
1289
+ "llama-3.3-70b-instruct": 0.52,
1290
+ "mistral-large-instruct-2411": 0.568,
1291
+ "gemma-2-27b-it": 0.4,
1292
+ "gemma-2-9b-it": 0.22400000000000003,
1293
+ "deepseek-v3": 0.784,
1294
+ "deepseek-r1": 0.716,
1295
+ "qwq-32b": 0.752,
1296
+ "Average": 0.4928
1297
+ },
1298
+ "RocketFuelEnv": {
1299
+ "qwen2.5-3b-instruct": 0.22800000000000004,
1300
+ "qwen2.5-7b-instruct": 0.41600000000000004,
1301
+ "qwen2.5-14b-instruct": 0.852,
1302
+ "qwen2.5-32b-instruct": 0.7879999999999999,
1303
+ "qwen2.5-72b-instruct": 0.8160000000000001,
1304
+ "llama-3.1-8b-instruct": 0.36,
1305
+ "llama-3.1-70b-instruct": 0.6799999999999999,
1306
+ "llama-3.2-3b-instruct": 0.184,
1307
+ "llama-3.3-70b-instruct": 0.7239999999999999,
1308
+ "mistral-large-instruct-2411": 0.828,
1309
+ "gemma-2-27b-it": 0.6279999999999999,
1310
+ "gemma-2-9b-it": 0.248,
1311
+ "deepseek-v3": 0.916,
1312
+ "deepseek-r1": 0.8960000000000001,
1313
+ "qwq-32b": 0.9040000000000001,
1314
+ "Average": 0.6312000000000001
1315
+ },
1316
+ "MLEnv": {
1317
+ "qwen2.5-3b-instruct": 0.088,
1318
+ "qwen2.5-7b-instruct": 0.392,
1319
+ "qwen2.5-14b-instruct": 0.6,
1320
+ "qwen2.5-32b-instruct": 0.748,
1321
+ "qwen2.5-72b-instruct": 0.792,
1322
+ "llama-3.1-8b-instruct": 0.304,
1323
+ "llama-3.1-70b-instruct": 0.672,
1324
+ "llama-3.2-3b-instruct": 0.10799999999999998,
1325
+ "llama-3.3-70b-instruct": 0.5960000000000001,
1326
+ "mistral-large-instruct-2411": 0.7639999999999999,
1327
+ "gemma-2-27b-it": 0.264,
1328
+ "gemma-2-9b-it": 0.156,
1329
+ "deepseek-v3": 0.808,
1330
+ "deepseek-r1": 0.652,
1331
+ "qwq-32b": 0.772,
1332
+ "Average": 0.5144
1333
+ },
1334
+ "PoliticalManifestoEnv": {
1335
+ "qwen2.5-3b-instruct": 0.184,
1336
+ "qwen2.5-7b-instruct": 0.312,
1337
+ "qwen2.5-14b-instruct": 0.76,
1338
+ "qwen2.5-32b-instruct": 0.852,
1339
+ "qwen2.5-72b-instruct": 0.7839999999999999,
1340
+ "llama-3.1-8b-instruct": 0.42400000000000004,
1341
+ "llama-3.1-70b-instruct": 0.62,
1342
+ "llama-3.2-3b-instruct": 0.128,
1343
+ "llama-3.3-70b-instruct": 0.692,
1344
+ "mistral-large-instruct-2411": 0.796,
1345
+ "gemma-2-27b-it": 0.45200000000000007,
1346
+ "gemma-2-9b-it": 0.152,
1347
+ "deepseek-v3": 0.86,
1348
+ "deepseek-r1": 0.792,
1349
+ "qwq-32b": 0.8800000000000001,
1350
+ "Average": 0.5792
1351
+ },
1352
+ "CoffeeEnv": {
1353
+ "qwen2.5-3b-instruct": 0.20400000000000001,
1354
+ "qwen2.5-7b-instruct": 0.38,
1355
+ "qwen2.5-14b-instruct": 0.7799999999999999,
1356
+ "qwen2.5-32b-instruct": 0.8039999999999999,
1357
+ "qwen2.5-72b-instruct": 0.764,
1358
+ "llama-3.1-8b-instruct": 0.31599999999999995,
1359
+ "llama-3.1-70b-instruct": 0.552,
1360
+ "llama-3.2-3b-instruct": 0.17200000000000001,
1361
+ "llama-3.3-70b-instruct": 0.6599999999999999,
1362
+ "mistral-large-instruct-2411": 0.828,
1363
+ "gemma-2-27b-it": 0.592,
1364
+ "gemma-2-9b-it": 0.364,
1365
+ "deepseek-v3": 0.9120000000000001,
1366
+ "deepseek-r1": 0.9279999999999999,
1367
+ "qwq-32b": 0.9359999999999999,
1368
+ "Average": 0.6128
1369
+ },
1370
+ "MotifAnalysisEnv": {
1371
+ "qwen2.5-3b-instruct": 0.096,
1372
+ "qwen2.5-7b-instruct": 0.332,
1373
+ "qwen2.5-14b-instruct": 0.5680000000000001,
1374
+ "qwen2.5-32b-instruct": 0.496,
1375
+ "qwen2.5-72b-instruct": 0.5920000000000001,
1376
+ "llama-3.1-8b-instruct": 0.244,
1377
+ "llama-3.1-70b-instruct": 0.36000000000000004,
1378
+ "llama-3.2-3b-instruct": 0.13999999999999999,
1379
+ "llama-3.3-70b-instruct": 0.22400000000000003,
1380
+ "mistral-large-instruct-2411": 0.46399999999999997,
1381
+ "gemma-2-27b-it": 0.18,
1382
+ "gemma-2-9b-it": 0.128,
1383
+ "deepseek-v3": 0.752,
1384
+ "deepseek-r1": 0.8240000000000001,
1385
+ "qwq-32b": 0.8640000000000001,
1386
+ "Average": 0.4176
1387
+ },
1388
+ "NutritionEnv": {
1389
+ "qwen2.5-3b-instruct": 0.132,
1390
+ "qwen2.5-7b-instruct": 0.22000000000000003,
1391
+ "qwen2.5-14b-instruct": 0.7920000000000001,
1392
+ "qwen2.5-32b-instruct": 0.8400000000000001,
1393
+ "qwen2.5-72b-instruct": 0.876,
1394
+ "llama-3.1-8b-instruct": 0.264,
1395
+ "llama-3.1-70b-instruct": 0.64,
1396
+ "llama-3.2-3b-instruct": 0.128,
1397
+ "llama-3.3-70b-instruct": 0.7040000000000001,
1398
+ "mistral-large-instruct-2411": 0.8320000000000001,
1399
+ "gemma-2-27b-it": 0.38,
1400
+ "gemma-2-9b-it": 0.20800000000000002,
1401
+ "deepseek-v3": 0.944,
1402
+ "deepseek-r1": 0.944,
1403
+ "qwq-32b": 0.9120000000000001,
1404
+ "Average": 0.5877333333333333
1405
+ },
1406
+ "MalwareEnv": {
1407
+ "qwen2.5-3b-instruct": 0.16,
1408
+ "qwen2.5-7b-instruct": 0.316,
1409
+ "qwen2.5-14b-instruct": 0.728,
1410
+ "qwen2.5-32b-instruct": 0.756,
1411
+ "qwen2.5-72b-instruct": 0.7200000000000001,
1412
+ "llama-3.1-8b-instruct": 0.268,
1413
+ "llama-3.1-70b-instruct": 0.5840000000000001,
1414
+ "llama-3.2-3b-instruct": 0.10800000000000001,
1415
+ "llama-3.3-70b-instruct": 0.548,
1416
+ "mistral-large-instruct-2411": 0.752,
1417
+ "gemma-2-27b-it": 0.252,
1418
+ "gemma-2-9b-it": 0.12,
1419
+ "deepseek-v3": 0.916,
1420
+ "deepseek-r1": 0.9,
1421
+ "qwq-32b": 0.916,
1422
+ "Average": 0.5362666666666667
1423
+ },
1424
+ "GeologicalEnv": {
1425
+ "qwen2.5-3b-instruct": 0.132,
1426
+ "qwen2.5-7b-instruct": 0.336,
1427
+ "qwen2.5-14b-instruct": 0.7639999999999999,
1428
+ "qwen2.5-32b-instruct": 0.748,
1429
+ "qwen2.5-72b-instruct": 0.676,
1430
+ "llama-3.1-8b-instruct": 0.28800000000000003,
1431
+ "llama-3.1-70b-instruct": 0.552,
1432
+ "llama-3.2-3b-instruct": 0.13999999999999999,
1433
+ "llama-3.3-70b-instruct": 0.508,
1434
+ "mistral-large-instruct-2411": 0.812,
1435
+ "gemma-2-27b-it": 0.41600000000000004,
1436
+ "gemma-2-9b-it": 0.164,
1437
+ "deepseek-v3": 0.9119999999999999,
1438
+ "deepseek-r1": 0.8480000000000001,
1439
+ "qwq-32b": 0.8880000000000001,
1440
+ "Average": 0.5456000000000001
1441
+ },
1442
+ "TheatricalEnv": {
1443
+ "qwen2.5-3b-instruct": 0.14400000000000002,
1444
+ "qwen2.5-7b-instruct": 0.42400000000000004,
1445
+ "qwen2.5-14b-instruct": 0.676,
1446
+ "qwen2.5-32b-instruct": 0.78,
1447
+ "qwen2.5-72b-instruct": 0.808,
1448
+ "llama-3.1-8b-instruct": 0.41200000000000003,
1449
+ "llama-3.1-70b-instruct": 0.7959999999999999,
1450
+ "llama-3.2-3b-instruct": 0.1,
1451
+ "llama-3.3-70b-instruct": 0.768,
1452
+ "mistral-large-instruct-2411": 0.844,
1453
+ "gemma-2-27b-it": 0.528,
1454
+ "gemma-2-9b-it": 0.28,
1455
+ "deepseek-v3": 0.884,
1456
+ "deepseek-r1": 0.8240000000000001,
1457
+ "qwq-32b": 0.908,
1458
+ "Average": 0.6117333333333335
1459
+ },
1460
+ "PrintingTechniqueEnv": {
1461
+ "qwen2.5-3b-instruct": 0.144,
1462
+ "qwen2.5-7b-instruct": 0.252,
1463
+ "qwen2.5-14b-instruct": 0.736,
1464
+ "qwen2.5-32b-instruct": 0.7200000000000001,
1465
+ "qwen2.5-72b-instruct": 0.776,
1466
+ "llama-3.1-8b-instruct": 0.4,
1467
+ "llama-3.1-70b-instruct": 0.54,
1468
+ "llama-3.2-3b-instruct": 0.16,
1469
+ "llama-3.3-70b-instruct": 0.548,
1470
+ "mistral-large-instruct-2411": 0.7040000000000001,
1471
+ "gemma-2-27b-it": 0.44000000000000006,
1472
+ "gemma-2-9b-it": 0.192,
1473
+ "deepseek-v3": 0.916,
1474
+ "deepseek-r1": 0.852,
1475
+ "qwq-32b": 0.9279999999999999,
1476
+ "Average": 0.5538666666666666
1477
+ },
1478
+ "StellarEnv": {
1479
+ "qwen2.5-3b-instruct": 0.132,
1480
+ "qwen2.5-7b-instruct": 0.388,
1481
+ "qwen2.5-14b-instruct": 0.6759999999999999,
1482
+ "qwen2.5-32b-instruct": 0.724,
1483
+ "qwen2.5-72b-instruct": 0.6960000000000001,
1484
+ "llama-3.1-8b-instruct": 0.30000000000000004,
1485
+ "llama-3.1-70b-instruct": 0.6040000000000001,
1486
+ "llama-3.2-3b-instruct": 0.16,
1487
+ "llama-3.3-70b-instruct": 0.6240000000000001,
1488
+ "mistral-large-instruct-2411": 0.732,
1489
+ "gemma-2-27b-it": 0.364,
1490
+ "gemma-2-9b-it": 0.23199999999999998,
1491
+ "deepseek-v3": 0.82,
1492
+ "deepseek-r1": 0.648,
1493
+ "qwq-32b": 0.776,
1494
+ "Average": 0.5250666666666667
1495
+ },
1496
+ "SoilEnv": {
1497
+ "qwen2.5-3b-instruct": 0.172,
1498
+ "qwen2.5-7b-instruct": 0.48,
1499
+ "qwen2.5-14b-instruct": 0.8320000000000001,
1500
+ "qwen2.5-32b-instruct": 0.788,
1501
+ "qwen2.5-72b-instruct": 0.8240000000000001,
1502
+ "llama-3.1-8b-instruct": 0.42400000000000004,
1503
+ "llama-3.1-70b-instruct": 0.64,
1504
+ "llama-3.2-3b-instruct": 0.22799999999999998,
1505
+ "llama-3.3-70b-instruct": 0.664,
1506
+ "mistral-large-instruct-2411": 0.76,
1507
+ "gemma-2-27b-it": 0.628,
1508
+ "gemma-2-9b-it": 0.44000000000000006,
1509
+ "deepseek-v3": 0.884,
1510
+ "deepseek-r1": 0.8039999999999999,
1511
+ "qwq-32b": 0.8480000000000001,
1512
+ "Average": 0.6277333333333334
1513
+ },
1514
+ "SoftwareEnv": {
1515
+ "qwen2.5-3b-instruct": 0.14800000000000002,
1516
+ "qwen2.5-7b-instruct": 0.40800000000000003,
1517
+ "qwen2.5-14b-instruct": 0.744,
1518
+ "qwen2.5-32b-instruct": 0.86,
1519
+ "qwen2.5-72b-instruct": 0.8400000000000001,
1520
+ "llama-3.1-8b-instruct": 0.4159999999999999,
1521
+ "llama-3.1-70b-instruct": 0.72,
1522
+ "llama-3.2-3b-instruct": 0.16799999999999998,
1523
+ "llama-3.3-70b-instruct": 0.784,
1524
+ "mistral-large-instruct-2411": 0.804,
1525
+ "gemma-2-27b-it": 0.528,
1526
+ "gemma-2-9b-it": 0.308,
1527
+ "deepseek-v3": 0.836,
1528
+ "deepseek-r1": 0.8360000000000001,
1529
+ "qwq-32b": 0.8800000000000001,
1530
+ "Average": 0.6186666666666667
1531
+ },
1532
+ "CarIdentificationEnv": {
1533
+ "qwen2.5-3b-instruct": 0.272,
1534
+ "qwen2.5-7b-instruct": 0.4,
1535
+ "qwen2.5-14b-instruct": 0.9120000000000001,
1536
+ "qwen2.5-32b-instruct": 0.916,
1537
+ "qwen2.5-72b-instruct": 0.9359999999999999,
1538
+ "llama-3.1-8b-instruct": 0.544,
1539
+ "llama-3.1-70b-instruct": 0.8400000000000001,
1540
+ "llama-3.2-3b-instruct": 0.124,
1541
+ "llama-3.3-70b-instruct": 0.852,
1542
+ "mistral-large-instruct-2411": 0.9119999999999999,
1543
+ "gemma-2-27b-it": 0.672,
1544
+ "gemma-2-9b-it": 0.376,
1545
+ "deepseek-v3": 0.992,
1546
+ "deepseek-r1": 0.952,
1547
+ "qwq-32b": 0.9879999999999999,
1548
+ "Average": 0.7125333333333334
1549
+ },
1550
+ "PharmaceuticalEnv": {
1551
+ "qwen2.5-3b-instruct": 0.156,
1552
+ "qwen2.5-7b-instruct": 0.32,
1553
+ "qwen2.5-14b-instruct": 0.7600000000000001,
1554
+ "qwen2.5-32b-instruct": 0.752,
1555
+ "qwen2.5-72b-instruct": 0.7559999999999999,
1556
+ "llama-3.1-8b-instruct": 0.28400000000000003,
1557
+ "llama-3.1-70b-instruct": 0.508,
1558
+ "llama-3.2-3b-instruct": 0.148,
1559
+ "llama-3.3-70b-instruct": 0.472,
1560
+ "mistral-large-instruct-2411": 0.756,
1561
+ "gemma-2-27b-it": 0.336,
1562
+ "gemma-2-9b-it": 0.128,
1563
+ "deepseek-v3": 0.8800000000000001,
1564
+ "deepseek-r1": 0.8640000000000001,
1565
+ "qwq-32b": 0.8,
1566
+ "Average": 0.528
1567
+ },
1568
+ "NetworkEnv": {
1569
+ "qwen2.5-3b-instruct": 0.184,
1570
+ "qwen2.5-7b-instruct": 0.36,
1571
+ "qwen2.5-14b-instruct": 0.66,
1572
+ "qwen2.5-32b-instruct": 0.716,
1573
+ "qwen2.5-72b-instruct": 0.716,
1574
+ "llama-3.1-8b-instruct": 0.43199999999999994,
1575
+ "llama-3.1-70b-instruct": 0.68,
1576
+ "llama-3.2-3b-instruct": 0.14400000000000002,
1577
+ "llama-3.3-70b-instruct": 0.7040000000000001,
1578
+ "mistral-large-instruct-2411": 0.78,
1579
+ "gemma-2-27b-it": 0.492,
1580
+ "gemma-2-9b-it": 0.392,
1581
+ "deepseek-v3": 0.8400000000000001,
1582
+ "deepseek-r1": 0.736,
1583
+ "qwq-32b": 0.828,
1584
+ "Average": 0.5776
1585
+ },
1586
+ "BirdNestEnv": {
1587
+ "qwen2.5-3b-instruct": 0.148,
1588
+ "qwen2.5-7b-instruct": 0.21200000000000002,
1589
+ "qwen2.5-14b-instruct": 0.48,
1590
+ "qwen2.5-32b-instruct": 0.33999999999999997,
1591
+ "qwen2.5-72b-instruct": 0.42400000000000004,
1592
+ "llama-3.1-8b-instruct": 0.16799999999999998,
1593
+ "llama-3.1-70b-instruct": 0.22400000000000003,
1594
+ "llama-3.2-3b-instruct": 0.084,
1595
+ "llama-3.3-70b-instruct": 0.20800000000000002,
1596
+ "mistral-large-instruct-2411": 0.492,
1597
+ "gemma-2-27b-it": 0.176,
1598
+ "gemma-2-9b-it": 0.128,
1599
+ "deepseek-v3": 0.764,
1600
+ "deepseek-r1": 0.756,
1601
+ "qwq-32b": 0.8119999999999999,
1602
+ "Average": 0.36106666666666676
1603
+ },
1604
+ "EnergyEnv": {
1605
+ "qwen2.5-3b-instruct": 0.15999999999999998,
1606
+ "qwen2.5-7b-instruct": 0.42000000000000004,
1607
+ "qwen2.5-14b-instruct": 0.7999999999999999,
1608
+ "qwen2.5-32b-instruct": 0.7,
1609
+ "qwen2.5-72b-instruct": 0.5880000000000001,
1610
+ "llama-3.1-8b-instruct": 0.29600000000000004,
1611
+ "llama-3.1-70b-instruct": 0.46799999999999997,
1612
+ "llama-3.2-3b-instruct": 0.18,
1613
+ "llama-3.3-70b-instruct": 0.396,
1614
+ "mistral-large-instruct-2411": 0.78,
1615
+ "gemma-2-27b-it": 0.35200000000000004,
1616
+ "gemma-2-9b-it": 0.196,
1617
+ "deepseek-v3": 0.916,
1618
+ "deepseek-r1": 0.8720000000000001,
1619
+ "qwq-32b": 0.8880000000000001,
1620
+ "Average": 0.5341333333333333
1621
+ },
1622
+ "LanguageEnv": {
1623
+ "qwen2.5-3b-instruct": 0.196,
1624
+ "qwen2.5-7b-instruct": 0.304,
1625
+ "qwen2.5-14b-instruct": 0.388,
1626
+ "qwen2.5-32b-instruct": 0.512,
1627
+ "qwen2.5-72b-instruct": 0.5599999999999999,
1628
+ "llama-3.1-8b-instruct": 0.23200000000000004,
1629
+ "llama-3.1-70b-instruct": 0.40800000000000003,
1630
+ "llama-3.2-3b-instruct": 0.144,
1631
+ "llama-3.3-70b-instruct": 0.336,
1632
+ "mistral-large-instruct-2411": 0.536,
1633
+ "gemma-2-27b-it": 0.20800000000000002,
1634
+ "gemma-2-9b-it": 0.172,
1635
+ "deepseek-v3": 0.724,
1636
+ "deepseek-r1": 0.716,
1637
+ "qwq-32b": 0.8119999999999999,
1638
+ "Average": 0.41653333333333337
1639
+ },
1640
+ "AlgorithmEnv": {
1641
+ "qwen2.5-3b-instruct": 0.1,
1642
+ "qwen2.5-7b-instruct": 0.28400000000000003,
1643
+ "qwen2.5-14b-instruct": 0.688,
1644
+ "qwen2.5-32b-instruct": 0.6960000000000001,
1645
+ "qwen2.5-72b-instruct": 0.66,
1646
+ "llama-3.1-8b-instruct": 0.35200000000000004,
1647
+ "llama-3.1-70b-instruct": 0.512,
1648
+ "llama-3.2-3b-instruct": 0.22399999999999998,
1649
+ "llama-3.3-70b-instruct": 0.484,
1650
+ "mistral-large-instruct-2411": 0.788,
1651
+ "gemma-2-27b-it": 0.268,
1652
+ "gemma-2-9b-it": 0.164,
1653
+ "deepseek-v3": 0.792,
1654
+ "deepseek-r1": 0.724,
1655
+ "qwq-32b": 0.812,
1656
+ "Average": 0.5032
1657
+ },
1658
+ "MathematicalEnv": {
1659
+ "qwen2.5-3b-instruct": 0.048,
1660
+ "qwen2.5-7b-instruct": 0.42800000000000005,
1661
+ "qwen2.5-14b-instruct": 0.7000000000000001,
1662
+ "qwen2.5-32b-instruct": 0.8119999999999999,
1663
+ "qwen2.5-72b-instruct": 0.792,
1664
+ "llama-3.1-8b-instruct": 0.316,
1665
+ "llama-3.1-70b-instruct": 0.8,
1666
+ "llama-3.2-3b-instruct": 0.12800000000000003,
1667
+ "llama-3.3-70b-instruct": 0.8400000000000001,
1668
+ "mistral-large-instruct-2411": 0.884,
1669
+ "gemma-2-27b-it": 0.268,
1670
+ "gemma-2-9b-it": 0.068,
1671
+ "deepseek-v3": 0.9119999999999999,
1672
+ "deepseek-r1": 0.876,
1673
+ "qwq-32b": 0.8160000000000001,
1674
+ "Average": 0.5792
1675
+ },
1676
+ "MusicalEnv": {
1677
+ "qwen2.5-3b-instruct": 0.04,
1678
+ "qwen2.5-7b-instruct": 0.336,
1679
+ "qwen2.5-14b-instruct": 0.8039999999999999,
1680
+ "qwen2.5-32b-instruct": 0.8560000000000001,
1681
+ "qwen2.5-72b-instruct": 0.8400000000000001,
1682
+ "llama-3.1-8b-instruct": 0.34400000000000003,
1683
+ "llama-3.1-70b-instruct": 0.68,
1684
+ "llama-3.2-3b-instruct": 0.088,
1685
+ "llama-3.3-70b-instruct": 0.8240000000000001,
1686
+ "mistral-large-instruct-2411": 0.884,
1687
+ "gemma-2-27b-it": 0.28,
1688
+ "gemma-2-9b-it": 0.11599999999999999,
1689
+ "deepseek-v3": 0.9480000000000001,
1690
+ "deepseek-r1": 0.892,
1691
+ "qwq-32b": 0.9039999999999999,
1692
+ "Average": 0.5890666666666668
1693
+ },
1694
+ "InventorEnv": {
1695
+ "qwen2.5-3b-instruct": 0.14800000000000002,
1696
+ "qwen2.5-7b-instruct": 0.43200000000000005,
1697
+ "qwen2.5-14b-instruct": 0.776,
1698
+ "qwen2.5-32b-instruct": 0.7999999999999999,
1699
+ "qwen2.5-72b-instruct": 0.772,
1700
+ "llama-3.1-8b-instruct": 0.4,
1701
+ "llama-3.1-70b-instruct": 0.7,
1702
+ "llama-3.2-3b-instruct": 0.188,
1703
+ "llama-3.3-70b-instruct": 0.616,
1704
+ "mistral-large-instruct-2411": 0.8039999999999999,
1705
+ "gemma-2-27b-it": 0.552,
1706
+ "gemma-2-9b-it": 0.364,
1707
+ "deepseek-v3": 0.9399999999999998,
1708
+ "deepseek-r1": 0.908,
1709
+ "qwq-32b": 0.9,
1710
+ "Average": 0.62
1711
+ },
1712
+ "MedicalEnv": {
1713
+ "qwen2.5-3b-instruct": 0.22000000000000003,
1714
+ "qwen2.5-7b-instruct": 0.544,
1715
+ "qwen2.5-14b-instruct": 0.8320000000000001,
1716
+ "qwen2.5-32b-instruct": 0.8800000000000001,
1717
+ "qwen2.5-72b-instruct": 0.8960000000000001,
1718
+ "llama-3.1-8b-instruct": 0.52,
1719
+ "llama-3.1-70b-instruct": 0.82,
1720
+ "llama-3.2-3b-instruct": 0.23200000000000004,
1721
+ "llama-3.3-70b-instruct": 0.8960000000000001,
1722
+ "mistral-large-instruct-2411": 0.8960000000000001,
1723
+ "gemma-2-27b-it": 0.692,
1724
+ "gemma-2-9b-it": 0.5760000000000001,
1725
+ "deepseek-v3": 0.9039999999999999,
1726
+ "deepseek-r1": 0.9359999999999999,
1727
+ "qwq-32b": 0.9199999999999999,
1728
+ "Average": 0.7175999999999999
1729
+ },
1730
+ "MusicEnv": {
1731
+ "qwen2.5-3b-instruct": 0.184,
1732
+ "qwen2.5-7b-instruct": 0.26,
1733
+ "qwen2.5-14b-instruct": 0.656,
1734
+ "qwen2.5-32b-instruct": 0.664,
1735
+ "qwen2.5-72b-instruct": 0.7559999999999999,
1736
+ "llama-3.1-8b-instruct": 0.356,
1737
+ "llama-3.1-70b-instruct": 0.596,
1738
+ "llama-3.2-3b-instruct": 0.10800000000000001,
1739
+ "llama-3.3-70b-instruct": 0.596,
1740
+ "mistral-large-instruct-2411": 0.6639999999999999,
1741
+ "gemma-2-27b-it": 0.45600000000000007,
1742
+ "gemma-2-9b-it": 0.28400000000000003,
1743
+ "deepseek-v3": 0.8119999999999999,
1744
+ "deepseek-r1": 0.868,
1745
+ "qwq-32b": 0.868,
1746
+ "Average": 0.5418666666666667
1747
+ },
1748
+ "FantasyEnv": {
1749
+ "qwen2.5-3b-instruct": 0.148,
1750
+ "qwen2.5-7b-instruct": 0.32,
1751
+ "qwen2.5-14b-instruct": 0.74,
1752
+ "qwen2.5-32b-instruct": 0.7879999999999999,
1753
+ "qwen2.5-72b-instruct": 0.5720000000000001,
1754
+ "llama-3.1-8b-instruct": 0.40800000000000003,
1755
+ "llama-3.1-70b-instruct": 0.676,
1756
+ "llama-3.2-3b-instruct": 0.152,
1757
+ "llama-3.3-70b-instruct": 0.704,
1758
+ "mistral-large-instruct-2411": 0.8240000000000001,
1759
+ "gemma-2-27b-it": 0.524,
1760
+ "gemma-2-9b-it": 0.324,
1761
+ "deepseek-v3": 0.9199999999999999,
1762
+ "deepseek-r1": 0.9719999999999999,
1763
+ "qwq-32b": 0.9719999999999999,
1764
+ "Average": 0.6029333333333332
1765
+ },
1766
+ "EducationEnv": {
1767
+ "qwen2.5-3b-instruct": 0.10400000000000001,
1768
+ "qwen2.5-7b-instruct": 0.268,
1769
+ "qwen2.5-14b-instruct": 0.828,
1770
+ "qwen2.5-32b-instruct": 0.9039999999999999,
1771
+ "qwen2.5-72b-instruct": 0.8480000000000001,
1772
+ "llama-3.1-8b-instruct": 0.5680000000000001,
1773
+ "llama-3.1-70b-instruct": 0.768,
1774
+ "llama-3.2-3b-instruct": 0.192,
1775
+ "llama-3.3-70b-instruct": 0.9039999999999999,
1776
+ "mistral-large-instruct-2411": 0.876,
1777
+ "gemma-2-27b-it": 0.624,
1778
+ "gemma-2-9b-it": 0.45999999999999996,
1779
+ "deepseek-v3": 0.9480000000000001,
1780
+ "deepseek-r1": 0.9,
1781
+ "qwq-32b": 0.9359999999999999,
1782
+ "Average": 0.6752
1783
+ },
1784
+ "ChemicalEnv": {
1785
+ "qwen2.5-3b-instruct": 0.264,
1786
+ "qwen2.5-7b-instruct": 0.44000000000000006,
1787
+ "qwen2.5-14b-instruct": 0.724,
1788
+ "qwen2.5-32b-instruct": 0.7040000000000001,
1789
+ "qwen2.5-72b-instruct": 0.72,
1790
+ "llama-3.1-8b-instruct": 0.36,
1791
+ "llama-3.1-70b-instruct": 0.62,
1792
+ "llama-3.2-3b-instruct": 0.16399999999999998,
1793
+ "llama-3.3-70b-instruct": 0.45999999999999996,
1794
+ "mistral-large-instruct-2411": 0.68,
1795
+ "gemma-2-27b-it": 0.44399999999999995,
1796
+ "gemma-2-9b-it": 0.316,
1797
+ "deepseek-v3": 0.8799999999999999,
1798
+ "deepseek-r1": 0.6799999999999999,
1799
+ "qwq-32b": 0.8200000000000001,
1800
+ "Average": 0.5517333333333333
1801
+ },
1802
+ "Average": {
1803
+ "qwen2.5-3b-instruct": 0.1655841584158416,
1804
+ "qwen2.5-7b-instruct": 0.34736633663366323,
1805
+ "qwen2.5-14b-instruct": 0.7148514851485149,
1806
+ "qwen2.5-32b-instruct": 0.7330693069306928,
1807
+ "qwen2.5-72b-instruct": 0.7272079207920793,
1808
+ "llama-3.1-8b-instruct": 0.3334653465346535,
1809
+ "llama-3.1-70b-instruct": 0.6271287128712871,
1810
+ "llama-3.2-3b-instruct": 0.15599999999999997,
1811
+ "llama-3.3-70b-instruct": 0.6372277227722771,
1812
+ "mistral-large-instruct-2411": 0.7573861386138615,
1813
+ "gemma-2-27b-it": 0.44522772277227735,
1814
+ "gemma-2-9b-it": 0.264,
1815
+ "deepseek-v3": 0.8605148514851484,
1816
+ "deepseek-r1": 0.8304554455445546,
1817
+ "qwq-32b": 0.8630891089108911
1818
+ }
1819
+ }
src/display/css_html_js.py → display.py RENAMED
@@ -1,34 +1,26 @@
1
  custom_css = """
2
-
3
  .markdown-text {
4
  font-size: 16px !important;
5
  }
6
-
7
  #models-to-add-text {
8
  font-size: 18px !important;
9
  }
10
-
11
  #citation-button span {
12
  font-size: 16px !important;
13
  }
14
-
15
  #citation-button textarea {
16
  font-size: 16px !important;
17
  }
18
-
19
  #citation-button > label > button {
20
  margin: 6px;
21
  transform: scale(1.3);
22
  }
23
-
24
  #leaderboard-table {
25
  margin-top: 15px
26
  }
27
-
28
  #leaderboard-table-lite {
29
  margin-top: 15px
30
  }
31
-
32
  #search-bar-table-box > div:first-child {
33
  background: none;
34
  border: none;
@@ -37,7 +29,6 @@ custom_css = """
37
  #search-bar {
38
  padding: 0px;
39
  }
40
-
41
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
  #leaderboard-table td:nth-child(2),
43
  #leaderboard-table th:nth-child(2) {
@@ -45,11 +36,9 @@ custom_css = """
45
  overflow: auto;
46
  white-space: nowrap;
47
  }
48
-
49
  .tab-buttons button {
50
  font-size: 20px;
51
  }
52
-
53
  #scale-logo {
54
  border-style: none !important;
55
  box-shadow: none;
@@ -58,7 +47,6 @@ custom_css = """
58
  margin-right: auto;
59
  max-width: 600px;
60
  }
61
-
62
  #scale-logo .download {
63
  display: none;
64
  }
@@ -102,4 +90,4 @@ get_window_url_params = """
102
  url_params = Object.fromEntries(params);
103
  return url_params;
104
  }
105
- """
 
1
  custom_css = """
 
2
  .markdown-text {
3
  font-size: 16px !important;
4
  }
 
5
  #models-to-add-text {
6
  font-size: 18px !important;
7
  }
 
8
  #citation-button span {
9
  font-size: 16px !important;
10
  }
 
11
  #citation-button textarea {
12
  font-size: 16px !important;
13
  }
 
14
  #citation-button > label > button {
15
  margin: 6px;
16
  transform: scale(1.3);
17
  }
 
18
  #leaderboard-table {
19
  margin-top: 15px
20
  }
 
21
  #leaderboard-table-lite {
22
  margin-top: 15px
23
  }
 
24
  #search-bar-table-box > div:first-child {
25
  background: none;
26
  border: none;
 
29
  #search-bar {
30
  padding: 0px;
31
  }
 
32
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
33
  #leaderboard-table td:nth-child(2),
34
  #leaderboard-table th:nth-child(2) {
 
36
  overflow: auto;
37
  white-space: nowrap;
38
  }
 
39
  .tab-buttons button {
40
  font-size: 20px;
41
  }
 
42
  #scale-logo {
43
  border-style: none !important;
44
  box-shadow: none;
 
47
  margin-right: auto;
48
  max-width: 600px;
49
  }
 
50
  #scale-logo .download {
51
  display: none;
52
  }
 
90
  url_params = Object.fromEntries(params);
91
  return url_params;
92
  }
93
+ """
process_data.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+
4
+ hard_data_path = 'data/hard_0402.json'
5
+ easy_data_path = 'data/easy_0402.json'
6
+
7
+ average_success_rate_file_path = 'data/success_rate_0402.json'
8
+ average_action_count_file_path = 'data/action_count_0402.json'
9
+
10
+ def load_average_data():
11
+ with open(average_success_rate_file_path, 'r') as f:
12
+ average_success_rate_data = json.load(f)
13
+ with open(average_action_count_file_path, 'r') as f:
14
+ average_action_count_data = json.load(f)
15
+ average_success_rate = average_success_rate_data['Average']
16
+ average_action_count = average_action_count_data['Average']
17
+ df = pd.DataFrame({
18
+ 'Success Rate ↑ (100 Average)': average_success_rate,
19
+ 'Action Count ↓ (100 Average)': average_action_count
20
+ })
21
+ return df
22
+
23
+ def load_hard_data():
24
+ with open(hard_data_path, 'r') as f:
25
+ hard_data = json.load(f)
26
+ df = pd.DataFrame(hard_data)
27
+ df = df.T # 转置为 model 是行,指标是列
28
+ df.rename(columns={'success_rate':'Success Rate ↑ (Hard)', 'relative_action_count':'Action Count ↓ (Hard)'}, inplace=True)
29
+ return df
30
+
31
+ def load_easy_data():
32
+ with open(easy_data_path, 'r') as f:
33
+ hard_data = json.load(f)
34
+ df = pd.DataFrame(hard_data)
35
+ df = df.T # 转置为 model 是行,指标是列
36
+ df.rename(columns={'success_rate':'Success Rate ↑ (Easy)', 'relative_action_count':'Action Count ↓ (Easy)'}, inplace=True)
37
+ return df
38
+
39
+
40
if __name__ == '__main__':
    # Manual smoke test: load the three tables, join them side by side on
    # their shared index and print the combined result.
    # The leftover `ipdb.set_trace()` breakpoint was removed: it stopped the
    # script at a debugger prompt and required a package that is not listed
    # in requirements.txt.
    average_df = load_average_data()
    hard_df = load_hard_data()
    easy_df = load_easy_data()
    df = pd.concat([average_df, hard_df, easy_df], axis=1)
    print(df)
47
+
pyproject.toml DELETED
@@ -1,13 +0,0 @@
1
- [tool.ruff]
2
- # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
- select = ["E", "F"]
4
- ignore = ["E501"] # line too long (black is taking care of this)
5
- line-length = 119
6
- fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
-
8
- [tool.isort]
9
- profile = "black"
10
- line_length = 119
11
-
12
- [tool.black]
13
- line-length = 119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,16 +1,3 @@
1
- APScheduler
2
- black
3
- datasets
4
- gradio
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.13
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
- numpy
11
  pandas
12
- python-dateutil
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
 
1
+ apscheduler
 
 
 
 
 
 
 
 
 
2
  pandas
3
+ datasets==2.21.0
 
 
 
 
src/about.py DELETED
@@ -1,72 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
-
11
- # Select your tasks here
12
- # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
- # ---------------------------------------------------
20
-
21
-
22
-
23
- # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
-
26
- # What does your leaderboard evaluate?
27
- INTRODUCTION_TEXT = """
28
- Intro text
29
- """
30
-
31
- # Which evaluations are you running? how can people reproduce what you have?
32
- LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
-
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
-
38
- """
39
-
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
-
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
-
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
-
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
-
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
- """
69
-
70
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
- CITATION_BUTTON_TEXT = r"""
72
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/formatting.py DELETED
@@ -1,27 +0,0 @@
1
- def model_hyperlink(link, model_name):
2
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
-
4
-
5
- def make_clickable_model(model_name):
6
- link = f"https://huggingface.co/{model_name}"
7
- return model_hyperlink(link, model_name)
8
-
9
-
10
- def styled_error(error):
11
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
-
13
-
14
- def styled_warning(warn):
15
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
-
17
-
18
- def styled_message(message):
19
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
-
21
-
22
- def has_no_nan_values(df, columns):
23
- return df[columns].notna().all(axis=1)
24
-
25
-
26
- def has_nan_values(df, columns):
27
- return df[columns].isna().any(axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/utils.py DELETED
@@ -1,110 +0,0 @@
1
- from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
-
4
- import pandas as pd
5
-
6
- from src.about import Tasks
7
-
8
- def fields(raw_class):
9
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
-
11
-
12
- # These classes are for user facing column names,
13
- # to avoid having to change them all around the code
14
- # when a modif is needed
15
- @dataclass
16
- class ColumnContent:
17
- name: str
18
- type: str
19
- displayed_by_default: bool
20
- hidden: bool = False
21
- never_hidden: bool = False
22
-
23
- ## Leaderboard columns
24
- auto_eval_column_dict = []
25
- # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
- for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
- # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
-
43
- # We use make dataclass to dynamically fill the scores from Tasks
44
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
-
46
- ## For the queue columns in the submission tab
47
- @dataclass(frozen=True)
48
- class EvalQueueColumn: # Queue column
49
- model = ColumnContent("model", "markdown", True)
50
- revision = ColumnContent("revision", "str", True)
51
- private = ColumnContent("private", "bool", True)
52
- precision = ColumnContent("precision", "str", True)
53
- weight_type = ColumnContent("weight_type", "str", "Original")
54
- status = ColumnContent("status", "str", True)
55
-
56
- ## All the model information that we might need
57
- @dataclass
58
- class ModelDetails:
59
- name: str
60
- display_name: str = ""
61
- symbol: str = "" # emoji
62
-
63
-
64
- class ModelType(Enum):
65
- PT = ModelDetails(name="pretrained", symbol="🟢")
66
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
- Unknown = ModelDetails(name="", symbol="?")
70
-
71
- def to_str(self, separator=" "):
72
- return f"{self.value.symbol}{separator}{self.value.name}"
73
-
74
- @staticmethod
75
- def from_str(type):
76
- if "fine-tuned" in type or "🔶" in type:
77
- return ModelType.FT
78
- if "pretrained" in type or "🟢" in type:
79
- return ModelType.PT
80
- if "RL-tuned" in type or "🟦" in type:
81
- return ModelType.RL
82
- if "instruction-tuned" in type or "⭕" in type:
83
- return ModelType.IFT
84
- return ModelType.Unknown
85
-
86
- class WeightType(Enum):
87
- Adapter = ModelDetails("Adapter")
88
- Original = ModelDetails("Original")
89
- Delta = ModelDetails("Delta")
90
-
91
- class Precision(Enum):
92
- float16 = ModelDetails("float16")
93
- bfloat16 = ModelDetails("bfloat16")
94
- Unknown = ModelDetails("?")
95
-
96
- def from_str(precision):
97
- if precision in ["torch.float16", "float16"]:
98
- return Precision.float16
99
- if precision in ["torch.bfloat16", "bfloat16"]:
100
- return Precision.bfloat16
101
- return Precision.Unknown
102
-
103
- # Column selection
104
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
105
-
106
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
-
109
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/envs.py DELETED
@@ -1,25 +0,0 @@
1
- import os
2
-
3
- from huggingface_hub import HfApi
4
-
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
-
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
-
12
- REPO_ID = f"{OWNER}/leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
-
19
- # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
25
- API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard/read_evals.py DELETED
@@ -1,196 +0,0 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
-
14
-
15
- @dataclass
16
- class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
- model: str
23
- revision: str # commit hash, "" if main
24
- results: dict
25
- precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
- license: str = "?"
30
- likes: int = 0
31
- num_params: int = 0
32
- date: str = "" # submission date of request file
33
- still_on_hub: bool = False
34
-
35
- @classmethod
36
- def init_from_json_file(self, json_filepath):
37
- """Inits the result from the specific model result file"""
38
- with open(json_filepath) as fp:
39
- data = json.load(fp)
40
-
41
- config = data.get("config")
42
-
43
- # Precision
44
- precision = Precision.from_str(config.get("model_dtype"))
45
-
46
- # Get model and org
47
- org_and_model = config.get("model_name", config.get("model_args", None))
48
- org_and_model = org_and_model.split("/", 1)
49
-
50
- if len(org_and_model) == 1:
51
- org = None
52
- model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
54
- else:
55
- org = org_and_model[0]
56
- model = org_and_model[1]
57
- result_key = f"{org}_{model}_{precision.value.name}"
58
- full_model = "/".join(org_and_model)
59
-
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
- architecture = "?"
64
- if model_config is not None:
65
- architectures = getattr(model_config, "architectures", None)
66
- if architectures:
67
- architecture = ";".join(architectures)
68
-
69
- # Extract results available in this file (some results are split in several files)
70
- results = {}
71
- for task in Tasks:
72
- task = task.value
73
-
74
- # We average all scores of a given metric (not all metrics are present in all files)
75
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
- if accs.size == 0 or any([acc is None for acc in accs]):
77
- continue
78
-
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
81
-
82
- return self(
83
- eval_name=result_key,
84
- full_model=full_model,
85
- org=org,
86
- model=model,
87
- results=results,
88
- precision=precision,
89
- revision= config.get("model_sha", ""),
90
- still_on_hub=still_on_hub,
91
- architecture=architecture
92
- )
93
-
94
- def update_with_request_file(self, requests_path):
95
- """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
-
98
- try:
99
- with open(request_file, "r") as f:
100
- request = json.load(f)
101
- self.model_type = ModelType.from_str(request.get("model_type", ""))
102
- self.weight_type = WeightType[request.get("weight_type", "Original")]
103
- self.license = request.get("license", "?")
104
- self.likes = request.get("likes", 0)
105
- self.num_params = request.get("params", 0)
106
- self.date = request.get("submitted_time", "")
107
- except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
-
110
- def to_dict(self):
111
- """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
- data_dict = {
114
- "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
- AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
- }
128
-
129
- for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
-
132
- return data_dict
133
-
134
-
135
- def get_request_file_for_model(requests_path, model_name, precision):
136
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
- request_files = os.path.join(
138
- requests_path,
139
- f"{model_name}_eval_request_*.json",
140
- )
141
- request_files = glob.glob(request_files)
142
-
143
- # Select correct request file (precision)
144
- request_file = ""
145
- request_files = sorted(request_files, reverse=True)
146
- for tmp_request_file in request_files:
147
- with open(tmp_request_file, "r") as f:
148
- req_content = json.load(f)
149
- if (
150
- req_content["status"] in ["FINISHED"]
151
- and req_content["precision"] == precision.split(".")[-1]
152
- ):
153
- request_file = tmp_request_file
154
- return request_file
155
-
156
-
157
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
- """From the path of the results folder root, extract all needed info for results"""
159
- model_result_filepaths = []
160
-
161
- for root, _, files in os.walk(results_path):
162
- # We should only have json files in model results
163
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
- continue
165
-
166
- # Sort the files by date
167
- try:
168
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
- except dateutil.parser._parser.ParserError:
170
- files = [files[-1]]
171
-
172
- for file in files:
173
- model_result_filepaths.append(os.path.join(root, file))
174
-
175
- eval_results = {}
176
- for model_result_filepath in model_result_filepaths:
177
- # Creation of result
178
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
- eval_result.update_with_request_file(requests_path)
180
-
181
- # Store results of same eval together
182
- eval_name = eval_result.eval_name
183
- if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
- else:
186
- eval_results[eval_name] = eval_result
187
-
188
- results = []
189
- for v in eval_results.values():
190
- try:
191
- v.to_dict() # we test if the dict version is complete
192
- results.append(v)
193
- except KeyError: # not all eval values present
194
- continue
195
-
196
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/populate.py DELETED
@@ -1,58 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
19
-
20
- # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
-
24
-
25
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
- all_evals = []
29
-
30
- for entry in entries:
31
- if ".json" in entry:
32
- file_path = os.path.join(save_path, entry)
33
- with open(file_path) as fp:
34
- data = json.load(fp)
35
-
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
-
39
- all_evals.append(data)
40
- elif ".md" not in entry:
41
- # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
- for sub_entry in sub_entries:
44
- file_path = os.path.join(save_path, entry, sub_entry)
45
- with open(file_path) as fp:
46
- data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
- all_evals.append(data)
51
-
52
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/check_validity.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
- try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
- except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
- depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
-
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
- if current_depth == depth:
86
- for file in files:
87
- if not file.endswith(".json"):
88
- continue
89
- with open(os.path.join(root, file), "r") as f:
90
- info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
-
93
- # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
- continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
-
99
- return set(file_names), users_to_submission_dates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/submit.py DELETED
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
texts.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ TITLE = """<h1 align="center" id="space-title">KUMO Benchmark</h1>"""
3
+
4
+ DESCRIPTION = f"""
5
+ ## Generative Evaluation of Complex Reasoning in Large Language Models
6
+
7
+ ✨ KUMO is a novel benchmark designed to systematically evaluate the complex reasoning capabilities of Large Language Models (LLMs) through procedurally generated reasoning games. Explore the limits of LLM reasoning and track model performance on our interactive leaderboard.
8
+
9
+
10
+ """