BounharAbdelaziz committed (verified)
Commit ad3a876 · 1 Parent(s): 1f1dbb3

added causal lm eval

Files changed (1)
  1. human_eval.py +109 -23
human_eval.py CHANGED
@@ -33,6 +33,22 @@ def create_html_media(media_path, is_gif=False):
     """
     return html_string
 
+MASKED_LM_MODELS = [
+    "BounharAbdelaziz/XLM-RoBERTa-Morocco",
+    "SI2M-Lab/DarijaBERT",
+    "BounharAbdelaziz/ModernBERT-Morocco",
+    "google-bert/bert-base-multilingual-cased",
+    "FacebookAI/xlm-roberta-large",
+    "aubmindlab/bert-base-arabertv02",
+]
+
+CAUSAL_LM_MODELS = [
+    "BounharAbdelaziz/Al-Atlas-LLM-0.5B",
+    "Qwen/Qwen2.5-0.5B",
+    "tiiuae/Falcon3-1B-Base",
+    "MBZUAI-Paris/Atlas-Chat-2B",
+]
+
 class LMBattleArena:
     def __init__(self, dataset_path):
         """Initialize battle arena with dataset"""
@@ -40,23 +56,29 @@ class LMBattleArena:
         print(self.df.head())
         self.current_index = 0
         self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
-        self.evaluation_results = []
+        self.evaluation_results_masked = []
+        self.evaluation_results_causal = []
         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
 
-    def get_next_battle_pair(self):
+    def get_next_battle_pair(self, is_causal):
         """Retrieve next pair of summaries for comparison"""
         if self.current_index >= len(self.df):
             return None
 
         row = self.df.iloc[self.current_index]
-        model_summary_cols = [
-            col
-            for col in row.index
-            if col.upper() != 'PROMPT'
-        ]
+        if is_causal:
+            model_summary_cols = [
+                col
+                for col in CAUSAL_LM_MODELS
+            ]
+        else:
+            model_summary_cols = [
+                col
+                for col in MASKED_LM_MODELS
+            ]
         selected_models = random.sample(model_summary_cols, 2)
         battle_data = {
-            'prompt': row['prompt'],
+            'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
             'model_1': row[selected_models[0]],
             'model_2': row[selected_models[1]],
             'model1_name': selected_models[0],
@@ -65,7 +87,7 @@ class LMBattleArena:
         self.current_index += 1
         return battle_data
 
-    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
+    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
         """Record user's model preference and update scores"""
         self.model_scores[model1_name]['total_comparisons'] += 1
         self.model_scores[model2_name]['total_comparisons'] += 1
@@ -87,14 +109,23 @@ class LMBattleArena:
             'model2_name': model2_name,
             'preferred_models': preferred_models
         }
-        self.evaluation_results.append(evaluation)
+        if is_causal:
+            self.evaluation_results_causal.append(evaluation)
+        else:
+            self.evaluation_results_masked.append(evaluation)
 
-        return self.get_model_scores_df()
+        return self.get_model_scores_df(is_causal)
 
-    def get_model_scores_df(self):
+    def get_model_scores_df(self, is_causal):
         """Convert model scores to DataFrame"""
         scores_data = []
         for model, stats in self.model_scores.items():
+            if is_causal:
+                if model not in CAUSAL_LM_MODELS:
+                    continue
+            else:
+                if model not in MASKED_LM_MODELS:
+                    continue
             win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
             scores_data.append({
                 'Model': model,
@@ -113,11 +144,11 @@ class LMBattleArena:
         return results_df
 
 
-def create_battle_arena(dataset_path, is_gif):
+def create_battle_arena(dataset_path, is_gif, is_causal):
     arena = LMBattleArena(dataset_path)
 
-    def battle_round():
-        battle_data = arena.get_next_battle_pair()
+    def battle_round(is_causal):
+        battle_data = arena.get_next_battle_pair(is_causal)
 
         if battle_data is None:
             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
@@ -131,11 +162,11 @@ def create_battle_arena(dataset_path, is_gif):
             gr.DataFrame(visible=True)
         )
 
-    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
+    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
         scores_df = arena.record_evaluation(
-            preferred_models, input_text, output_1, output_2, model1_name, model2_name
+            preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
        )
-        next_battle = battle_round()
+        next_battle = battle_round(is_causal)
         return (*next_battle[:-1], scores_df)
 
     with gr.Blocks(css="footer{display:none !important}") as demo:
@@ -145,9 +176,60 @@ def create_battle_arena(dataset_path, is_gif):
         gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
 
         with gr.Tabs():
-            with gr.Tab("Battle Arena"):
+            with gr.Tab("Masked LM Battle Arena"):
+                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
+
+                # Use gr.State to store the boolean value without displaying it
+                is_causal = gr.State(value=False)
+
+                input_text = gr.Textbox(
+                    label="Input prompt",
+                    interactive=False,
+                )
+
+                with gr.Row():
+                    output_1 = gr.Textbox(
+                        label="Model A",
+                        interactive=False
+                    )
+                    model1_name = gr.State() # Hidden state for model1 name
+
+                with gr.Row():
+                    output_2 = gr.Textbox(
+                        label="Model B",
+                        interactive=False
+                    )
+                    model2_name = gr.State() # Hidden state for model2 name
+
+                preferred_models = gr.Radio(
+                    label="Which model is better?",
+                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
+                )
+                submit_btn = gr.Button("Vote", variant="primary")
+
+                scores_table = gr.DataFrame(
+                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
+                    label="🏆 Leaderboard"
+                )
+
+                submit_btn.click(
+                    submit_preference,
+                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
+                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
+                )
+
+                demo.load(
+                    battle_round,
+                    inputs=[is_causal],
+                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
+                )
+
+            with gr.Tab("Causal LM Battle Arena"):
                 gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
 
+                # Use gr.State to store the boolean value without displaying it
+                is_causal = gr.State(value=True)
+
                 input_text = gr.Textbox(
                     label="Input prompt",
                     interactive=False,
@@ -180,18 +262,22 @@ def create_battle_arena(dataset_path, is_gif):
 
                 submit_btn.click(
                     submit_preference,
-                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
+                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
                     outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                 )
 
-        demo.load(battle_round, outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table])
+                demo.load(
+                    battle_round,
+                    inputs=[is_causal],
+                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
+                )
 
     return demo
 
 if __name__ == "__main__":
 
     # load the existing dataset that contains outputs of the LMs
-    human_eval_dataset = load_dataset("atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas", split='train').to_csv('human_eval_dataset.csv')
+    human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test').to_csv('human_eval_dataset.csv')
 
     # precision
     torch_dtype = torch.float16
@@ -200,5 +286,5 @@ if __name__ == "__main__":
     device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
     dataset_path = 'human_eval_dataset.csv'
     is_gif = True
-    demo = create_battle_arena(dataset_path, is_gif)
+    demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
     demo.launch(debug=True)
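
For reference, the updated `get_next_battle_pair()` samples the two contestants from the column names in `MASKED_LM_MODELS` or `CAUSAL_LM_MODELS` and reads the prompt from the `masked_sentence` or `causal_sentence` column, so the CSV exported from `atlasia/LM-Moroccan-Darija-Bench` is expected to carry those columns (alongside whatever else the benchmark includes). Below is a minimal sketch of a locally built CSV with that shape; every cell value is a hypothetical placeholder and only a subset of the model columns is listed.

```python
# Hypothetical smoke-test data for the arena: column names follow the diff
# above (masked_sentence, causal_sentence, one output column per model id),
# while the values are placeholders, not real model outputs.
import pandas as pd

MASKED_LM_MODELS = ["SI2M-Lab/DarijaBERT", "FacebookAI/xlm-roberta-large"]
CAUSAL_LM_MODELS = ["BounharAbdelaziz/Al-Atlas-LLM-0.5B", "Qwen/Qwen2.5-0.5B"]

row = {
    "masked_sentence": "<masked-LM prompt with a [MASK] token>",  # prompt shown in the Masked LM tab
    "causal_sentence": "<causal-LM prompt to be continued>",      # prompt shown in the Causal LM tab
}
# one column per model, keyed by the model id used in the lists above
row.update({model: "<model output placeholder>" for model in MASKED_LM_MODELS + CAUSAL_LM_MODELS})

pd.DataFrame([row]).to_csv("human_eval_dataset.csv", index=False)
```

In the committed code, each tab keeps its own `is_causal` flag in a `gr.State` and passes it to the shared `battle_round` and `submit_preference` callbacks, which is how a single `LMBattleArena` instance serves both evaluation modes.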