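"""Batch evaluation page.

Runs a selected summarization model over every sample in
test_samples/test_data.json, scores each response with ROUGE, and renders the
results as an HTML report inside a Gradio interface.
"""
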
import html
import json
import logging

import gradio as gr
import numpy as np
from dotenv import load_dotenv

from pages.summarization_playground import custom_css, generate_answer
from utils.metric import metric_rouge_score
from utils.model import Model

# Load environment variables (e.g. API keys) from a local .env file
load_dotenv()


def display_results(response_list):
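    """Render evaluation results as HTML: an overall average ROUGE score followed
    by a collapsible dialogue/summary/response panel for each sample."""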
    overall_score = np.mean([r["metric_score"]["rouge_score"] for r in response_list])

    html_output = f"<h2>Overall Score: {overall_score:.2f}</h2>"

    for i, item in enumerate(response_list, 1):
        rouge_score = item["metric_score"]["rouge_score"]

        # Escape HTML and preserve line breaks before embedding in the template.
        dialogue = html.escape(item["dialogue"]).replace("\n", "<br>")
        summary = html.escape(item["summary"]).replace("\n", "<br>")
        response = html.escape(item["response"]).replace("\n", "<br>")

        html_output += f"""
        <details>
        <summary>Response {i} (Rouge Score: {rouge_score:.2f})</summary>
        <div style="display: flex; justify-content: space-between;">
            <div style="width: 30%;">
                <h3>Dialogue</h3>
                <pre style="white-space: pre-wrap; word-wrap: break-word;">{dialogue}</pre>
            </div>
            <div style="width: 30%;">
                <h3>Summary</h3>
                <pre style="white-space: pre-wrap; word-wrap: break-word;">{summary}</pre>
            </div>
            <div style="width: 30%;">
                <h3>Response</h3>
                <pre style="white-space: pre-wrap; word-wrap: break-word;">{response}</pre>
            </div>
        </div>
        </details>
        """

    return html_output


def process(model_selection, prompt, num=10):
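    """Run the selected model over every datapoint in test_samples/test_data.json,
    score each response with ROUGE, and return an HTML report.

    `num` is accepted but not currently used.
    """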
    response_list = []
    with open("test_samples/test_data.json", "r") as file:
        json_data = file.read()
        dataset = json.loads(json_data)

    for i, data in enumerate(dataset):
        logging.info(f"Start testing datapoint {i+1}")
        dialogue = data["dialogue"]
        output_format = data["format"]  # avoid shadowing the built-in `format`
        summary = data["summary"]
        response = generate_answer(
            dialogue, model_selection, prompt + f" Output following {output_format} format."
        )

        # ROUGE score of the model response against the reference summary
        rouge_score = metric_rouge_score(response, summary)

        response_list.append(
            {
                "dialogue": dialogue,
                "summary": summary,
                "response": response,
                "metric_score": {"rouge_score": rouge_score},
            }
        )

        logging.info(f"Complete testing datapoint {i+1}")

    return display_results(response_list)


def create_batch_evaluation_interface():
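    """Build the batch evaluation Gradio Blocks app: a model dropdown, a prompt
    template textbox, and a submit button wired to `process`."""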
    with gr.Blocks(
        theme=gr.themes.Soft(spacing_size="sm", text_size="sm"), css=custom_css
    ) as demo:
        gr.Markdown(
            "## Here are evaluation setups. It will run though datapoints in test_data.josn to generate and evaluate. Show results once finished."
        )

        model_dropdown = gr.Dropdown(
            choices=Model.__model_list__,
            label="Choose a model",
            value=Model.__model_list__[0],
        )
        template_text = gr.Textbox(
            value="Summarize the following dialogue",
            label="Input Prompting Template",
            lines=8,
            placeholder="Input your prompts",
        )
        submit_button = gr.Button("✨ Submit ✨")
        output = gr.HTML(label="Results")

        submit_button.click(
            process, inputs=[model_dropdown, template_text], outputs=output
        )

    return demo


if __name__ == "__main__":
    demo = create_batch_evaluation_interface()
    demo.launch()