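"""Gradio app for a RAG evaluation leaderboard.

Loads evaluation runs from results.json, flattens their retrieval and
generation metrics into a pandas DataFrame, and displays them as tables
and grouped bar charts.
"""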
import gradio as gr
import json
import pandas as pd
import plotly.graph_objects as go

def load_results():
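    """Read stored evaluation runs from results.json in the working directory."""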
    with open('results.json', 'r') as f:
        return json.load(f)

def create_metrics_df(results):
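    """Flatten each result into one row: model/config columns plus a
    '<category>_<metric>' column for every retrieval and generation score."""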
    rows = []
    for r in results:
        row = {
            'Model': r['model_name'],
            'Timestamp': r['timestamp'],
            'Embeddings': r['config']['embedding_model'],
            'Retriever': r['config']['retriever_type'],
            'Top-K': r['config']['retrieval_config'].get('top_k', 'N/A')
        }
        
        # Add metrics
        metrics = r['metrics']
        for category in ['retrieval', 'generation']:
            if category in metrics:
                for metric_name, value in metrics[category].items():
                    row[f"{category}_{metric_name}"] = round(value, 4)
        
        rows.append(row)
    
    return pd.DataFrame(rows)

def create_comparison_plot(df, metric_category):
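    """Build a grouped bar chart of every '<metric_category>_*' column, one group of bars per model."""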
    metrics = [col for col in df.columns if col.startswith(f"{metric_category}_")]
    if not metrics:
        return None
        
    fig = go.Figure()
    for metric in metrics:
        fig.add_trace(go.Bar(
            name=metric.removeprefix(f"{metric_category}_"),  # e.g. "retrieval_hit_rate" -> "hit_rate"
            x=df['Model'],
            y=df[metric],
            text=df[metric].round(3),
            textposition='auto',
        ))
    
    fig.update_layout(
        title=f"{metric_category.capitalize()} Metrics Comparison",
        xaxis_title="Model",
        yaxis_title="Score",
        barmode='group'
    )
    return fig

def create_interface():
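    """Assemble the Gradio Blocks UI: leaderboard table, metric plots, config details, and submission instructions."""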
    results = load_results()
    df = create_metrics_df(results)
    
    with gr.Blocks() as demo:
        gr.Markdown("# RAG Evaluation Leaderboard")
        
        with gr.Tabs():
            with gr.Tab("Leaderboard"):
                gr.Dataframe(
                    df,
                    headers=df.columns.tolist(),
                    interactive=False
                )
            
            with gr.Tab("Retrieval Metrics"):
                gr.Plot(create_comparison_plot(df, 'retrieval'))
            
            with gr.Tab("Generation Metrics"):
                gr.Plot(create_comparison_plot(df, 'generation'))
            
            with gr.Tab("Configuration Details"):
                config_df = df[['Model', 'Embeddings', 'Retriever', 'Top-K', 'Timestamp']]
                gr.Dataframe(config_df)
        
        gr.Markdown('''
        ## How to Submit
        
        To submit your results:
        ```python
        from rag_leaderboard import RAGLeaderboard
        
        # Initialize leaderboard
        leaderboard = RAGLeaderboard(
            repo_id="your-username/repo-name",
            token="your-hf-token"
        )
        
        # Submit results
        leaderboard.submit_results(
            model_name="Your Model Name",
            metrics={
                "retrieval": {"hit_rate": 0.8, "mrr": 0.6},
                "generation": {"rouge1": 0.7, "rouge2": 0.5, "rougeL": 0.6}
            },
            config={
                "embedding_model": "your-embedding-model",
                "retriever_type": "dense",
                "retrieval_config": {"top_k": 3}
            }
        )
        ```
        ''')
    
    return demo

# Build the interface at import time; launch the server only when run as a script.
demo = create_interface()

if __name__ == "__main__":
    demo.launch()