advanced

Running on CPU Upgrade

App Files Files Community

advanced / yourbench_space /leaderboard_space /utils.py

alozowski HF Staff

Update pyproject.toml and apply ruff

64a657c 27 days ago

raw

history blame contribute delete

11.2 kB

	import json
	from typing import Any

	from env import TASK, MODELS, ORG_NAME

	import gradio as gr
	from datasets import Dataset, load_dataset


	# Human-readable column labels for the metric names that get prettified in
	# the leaderboard table, keyed by the raw metric name.
	KNOWN_METRIC_LABELS: dict[str, str] = dict(
	    accuracy="Accuracy",
	    accuracy_stderr="Accuracy (stderr)",
	)


	def aggregate_results() -> list:
	    """Build one leaderboard row per model listed in ``MODELS``.

	    For every model, the "results" config of its private details dataset is
	    loaded (split "latest"). The JSON-encoded ``config_general`` and
	    ``results`` cells of the first row are decoded and turned into a row
	    dict holding the model name, the run duration in seconds and every
	    metric found under each entry of ``results`` (the entry keyed "all" is
	    skipped). Float metric values are rounded to 3 decimals.

	    A model whose data cannot be loaded or parsed is reported on stdout and
	    left out of the result (best effort).

	    Returns:
	        The row dicts sorted by their "Accuracy" value, highest first. Rows
	        without an "Accuracy" key are sorted as if their accuracy were 0.
	    """
	    all_results = []
	    for model_path in MODELS:
	        try:
	            path = f"{ORG_NAME}/details_{model_path.replace('/', '__')}_private"
	            dataset = load_dataset(path, "results", split="latest")
	            config = json.loads(dataset["config_general"][0])
	            results = json.loads(dataset["results"][0])

	            # Keep only the part after the last "/". Unpacking the result of
	            # split("/") raised ValueError for ids that do not contain exactly
	            # one "/", which silently dropped the model from the leaderboard.
	            model = model_path.rsplit("/", 1)[-1]
	            duration = round(config["end_time"] - config["start_time"], 2)

	            result = {
	                "Model": model,
	                "Duration (s)": duration,
	            }

	            for metric, metric_values in results.items():
	                if metric == "all":
	                    continue

	                for raw_metric_name, metric_value in metric_values.items():
	                    # Look the label up by the text before any "(" suffix;
	                    # unknown metrics keep their full raw name as the label.
	                    base_name = raw_metric_name.partition("(")[0].strip()
	                    pretty_label = KNOWN_METRIC_LABELS.get(base_name, raw_metric_name)

	                    if isinstance(metric_value, float):
	                        metric_value = round(metric_value, 3)

	                    result[pretty_label] = metric_value

	            all_results.append(result)

	        except Exception as e:
	            # Deliberate best effort: one broken model must not hide the others.
	            print(f"Error processing {model_path} {ORG_NAME}: {e}")

	    # Sort final result by Accuracy
	    all_results.sort(key=lambda r: r.get("Accuracy", 0), reverse=True)

	    return all_results


	def extract_dataviz() -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
	    """Collect per-sample prompts, golds, predictions and scores for every model.

	    Rows of each model's details dataset are matched by their position in
	    the dataset. The prompt and gold of a sample come from the first model
	    that provides that position; each model then adds its own
	    ``"<model>_score"`` and ``"<model>_prediction"`` entries.

	    A model whose dataset cannot be loaded or read is reported on stdout and
	    skipped.

	    Returns:
	        ``(easy_samples, hard_samples, all_samples)``. ``all_samples`` is
	        ordered by sample index. Easy samples are those whose scores sum to
	        the number of scores, hard samples those whose scores sum to 0
	        (presumably 0/1 scores, i.e. "all models right" / "all models wrong"
	        -- TODO confirm the score range).
	    """
	    by_index = {}

	    for model_path in MODELS:
	        try:
	            repo_id = f"{ORG_NAME}/details_{model_path.replace('/', '__')}_private"
	            config_name = f"custom_{TASK.replace('/', '_')}_0"
	            rows = load_dataset(repo_id, config_name, split="latest")

	            for idx, row in enumerate(rows):
	                # Read everything that may raise before touching the shared map.
	                prompt = row["full_prompt"]
	                gold = row.get("gold", "")
	                if isinstance(gold, list) and gold:
	                    gold = gold[0]
	                metrics = row["metrics"]
	                # The first metric value is used as the sample's score.
	                score = list(metrics.values())[0]
	                predictions = row.get("predictions", [])
	                prediction = "" if not predictions else predictions[0]

	                entry = by_index.setdefault(
	                    idx,
	                    {
	                        "ix": idx,
	                        "prompt": prompt,
	                        "gold": gold,
	                        "model_scores": [],
	                        "models": [],
	                    },
	                )

	                if model_path in entry["models"]:
	                    continue

	                entry[f"{model_path}_score"] = metrics
	                entry[f"{model_path}_prediction"] = prediction
	                entry["model_scores"].append(score)
	                entry["models"].append(model_path)

	        except Exception as e:
	            print(f"Error processing {model_path}: {e}")

	    # Map keys are the sample indices, so sorting the keys orders by "ix".
	    all_samples = [by_index[ix] for ix in sorted(by_index)]

	    easy_samples = []
	    hard_samples = []
	    for sample in all_samples:
	        scores = sample["model_scores"]
	        total = sum(scores)
	        if total == 0:
	            hard_samples.append(sample)
	        if total == len(scores):
	            easy_samples.append(sample)

	    return easy_samples, hard_samples, all_samples


	# Stylesheet shared by every rendered sample; the CSS variables give the boxes
	# separate light and dark colour palettes.
	_BOX_DISPLAY_CSS = """
	<style>
	:root {
	    --primary-bg: #f5f5f5;
	    --secondary-bg: #ffffff;
	    --gold-bg: #e6f3e6;
	    --text-color: #333333;
	    --border-color: #ddd;
	}

	@media (prefers-color-scheme: dark) {
	    :root {
	        --primary-bg: #2a2a2a;
	        --secondary-bg: #333333;
	        --gold-bg: #2a3a2a;
	        --text-color: #e0e0e0;
	        --border-color: #555;
	    }
	}

	.box-container {
	    max-width: 800px;
	    margin: 0 auto;
	    color: var(--text-color);
	}

	.gold-box {
	    background: var(--gold-bg);
	    padding: 20px;
	    border-radius: 10px;
	    margin-bottom: 20px;
	}

	.model-box {
	    background: var(--primary-bg);
	    padding: 20px;
	    margin-bottom: 20px;
	    border-radius: 10px;
	}

	.content-section {
	    background: var(--secondary-bg);
	    padding: 15px;
	    border-radius: 5px;
	    margin-top: 10px;
	}

	.metric-row {
	    padding: 5px;
	    border-bottom: 1px solid var(--border-color);
	}

	h2, h3 {
	    color: var(--text-color);
	}

	pre, code {
	    white-space: pre-wrap;
	    word-wrap: break-word;
	    margin: 0;
	    color: var(--text-color);
	}
	</style>
	"""


	def _escape(value: Any) -> str:
	    """Return *value* as text that is safe to place inside an HTML element."""
	    import html

	    return html.escape(str(value), quote=False)


	def _code_block(value: Any) -> str:
	    """Wrap the escaped text of *value* in a ``<pre><code>`` element."""
	    return f"<pre><code>{_escape(value)}</code></pre>\n"


	def _parse_metrics(metrics: Any) -> dict:
	    """Return *metrics* as a dict, decoding string-serialised metrics safely."""
	    import ast

	    if isinstance(metrics, dict):
	        return metrics
	    if isinstance(metrics, str):
	        # NOTE(review): this used to be eval(metrics). The string comes from a
	        # dataset, so it is now parsed as a Python literal (then as JSON)
	        # instead of being executed as code.
	        for parse in (ast.literal_eval, json.loads):
	            try:
	                parsed = parse(metrics)
	            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	                continue
	            if isinstance(parsed, dict):
	                return parsed
	    # Nothing usable could be decoded: show the raw value in a single row
	    # rather than failing the whole page.
	    return {"metrics": metrics}


	def _prompt_parts(prompt_text: Any) -> list[str]:
	    """Return the HTML fragments that display a prompt (chat list, dict or text)."""
	    if not isinstance(prompt_text, list):
	        shown = prompt_text
	        if isinstance(prompt_text, dict) and "content" in prompt_text:
	            shown = prompt_text["content"]
	        return ["<div style='overflow-x: auto;'>\n", _code_block(shown), "</div>\n"]

	    parts = []
	    for msg in prompt_text:
	        parts.append("<div style='margin-bottom: 10px;'>\n")
	        if isinstance(msg, dict) and "content" in msg:
	            role = str(msg.get("role", "message")).title()
	            parts.append(f"<strong>{_escape(role)}:</strong>\n")
	            body = msg["content"]
	        else:
	            # Anything that is not a chat message is shown as pretty JSON.
	            body = json.dumps(msg, indent=2)
	        parts.append("<div style='overflow-x: auto;'>\n")
	        parts.append(_code_block(body))
	        parts.append("</div>\n")
	        parts.append("</div>\n")
	    return parts


	def _model_box_parts(output: dict[str, Any]) -> list[str]:
	    """Return the HTML fragments for one model: metrics, prompt and prediction."""
	    parts = [
	        "<div class='model-box'>\n",
	        f"<h2 style='margin-top: 0;'>{_escape(output['Model'])}</h2>\n",
	        # Metrics table, expanded by default.
	        "<details open style='margin-bottom: 15px;'>\n",
	        "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n",
	    ]
	    metrics = _parse_metrics(output["Metrics"])
	    parts.append("<div style='overflow-x: auto;'>\n")
	    parts.append("<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n")
	    for key, value in metrics.items():
	        if isinstance(value, float):
	            value = f"{value:.3f}"
	        parts.append(
	            f"<tr class='metric-row'><td><strong>{_escape(key)}</strong></td><td>{_escape(value)}</td></tr>\n"
	        )
	    parts.append("</table>\n")
	    parts.append("</div>\n")
	    parts.append("</details>\n\n")

	    # Prompt, collapsed by default.
	    parts.append("<details style='margin-bottom: 15px;'>\n")
	    parts.append("<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n")
	    parts.append("<div class='content-section'>\n")
	    parts.extend(_prompt_parts(output["Prompt"]))
	    parts.append("</div>\n")
	    parts.append("</details>\n\n")

	    # Prediction, expanded by default, with a muted word count in the summary.
	    prediction = str(output["Prediction"])
	    word_count = len(prediction.split())
	    parts.append("<details open style='margin-bottom: 15px;'>\n")
	    parts.append("<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>")
	    parts.append(
	        f"<span style='color: inherit; opacity: 0.7; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
	    )
	    parts.append("</summary>\n")
	    parts.append("<div class='content-section'>\n")
	    parts.append("<div style='overflow-x: auto;'>\n")
	    parts.append(_code_block(prediction))
	    parts.append("</div>\n")
	    parts.append("</div>\n")
	    parts.append("</details>\n")
	    parts.append("</div>\n\n")
	    return parts


	def samples_to_box_display(samples: list[dict[str, Any]], example_index: int = 0) -> str:
	    """Render one sample as HTML: the gold answer followed by one box per model.

	    Adapted from Nathan's code https://huggingface.co/spaces/SaylorTwift/OpenEvalsModelDetails/
	    Supports both light and dark themes.

	    All text taken from the sample (gold, prompt, predictions, model names,
	    metrics) is HTML-escaped before being inserted into the page.

	    Args:
	        samples: Sample dicts. Each one is read for its "models", "prompt"
	            and "gold" keys plus a ``"<model>_prediction"`` and a
	            ``"<model>_score"`` key for every listed model.
	        example_index: Position in ``samples`` of the sample to render.

	    Returns:
	        The HTML for the selected sample, or a plain message when ``samples``
	        is empty, ``example_index`` is out of range, or no listed model has
	        both a prediction and a score.
	    """
	    if not samples:
	        return "No samples in this category!"

	    try:
	        sample = samples[example_index]
	    except IndexError:
	        # Callers use one index for several categories of different sizes, so
	        # an index that is valid for one list can overshoot another.
	        return f"No sample at index {example_index} in this category ({len(samples)} available)."

	    outputs = []
	    for model in sample["models"]:
	        try:
	            outputs.append({
	                "Model": model,
	                "Prediction": sample[f"{model}_prediction"],
	                "Prompt": sample["prompt"],
	                "Metrics": sample[f"{model}_score"],
	                "Gold": sample["gold"],
	            })
	        except KeyError:
	            # Skip models that lack a prediction or a score for this sample.
	            continue

	    if not outputs:
	        return "No results found for the selected combination."

	    # Fragments are collected in a list and joined once at the end.
	    parts = [f"{_BOX_DISPLAY_CSS}<div class='box-container'>\n\n"]

	    # Show gold answer at the top with distinct styling
	    parts.append("<div class='gold-box'>\n")
	    parts.append("<h3 style='margin-top: 0;'>Ground Truth</h3>\n")
	    parts.append("<div style='overflow-x: auto; max-width: 100%;'>\n")
	    parts.append(_code_block(outputs[0]["Gold"]))
	    parts.append("</div>\n")
	    parts.append("</div>\n")

	    for output in outputs:
	        parts.extend(_model_box_parts(output))

	    parts.append("</div>")
	    return "".join(parts)


	def run_pipeline(samples_ix: int = 0) -> tuple[Any, Any, Any, Any]:
	    """Compute the leaderboard and sample views and wrap them in Gradio components.

	    Args:
	        samples_ix: Index of the sample to render in each of the three
	            sample categories.

	    Returns:
	        A visible ``gr.Dataframe`` with the aggregated results, followed by
	        three visible ``gr.HTML`` components for the easiest, the hardest and
	        all samples, in that order.
	    """
	    leaderboard_rows = aggregate_results()
	    easy_samples, hard_samples, all_samples = extract_dataviz()

	    # The rows go through a datasets.Dataset to obtain a pandas DataFrame.
	    leaderboard = gr.Dataframe(Dataset.from_list(leaderboard_rows).to_pandas(), visible=True)

	    easy_view = gr.HTML(
	        samples_to_box_display(easy_samples, samples_ix),
	        label="Easiest samples (always found)",
	        visible=True,
	    )
	    hard_view = gr.HTML(
	        samples_to_box_display(hard_samples, samples_ix),
	        label="Hardest samples (always failed)",
	        visible=True,
	    )
	    all_view = gr.HTML(
	        samples_to_box_display(all_samples, samples_ix),
	        label="All samples",
	        visible=True,
	    )

	    return leaderboard, easy_view, hard_view, all_view


	def update_examples(samples_ix: int = 0) -> tuple[str, str, str]:
	    """Re-render the sample at ``samples_ix`` for each sample category.

	    Returns:
	        The HTML strings for the easy, hard and all-samples categories, in
	        that order.
	    """
	    categories = extract_dataviz()
	    easy_html, hard_html, all_html = (
	        samples_to_box_display(category, samples_ix) for category in categories
	    )
	    return easy_html, hard_html, all_html