# pages/batch_evaluation.py
import html
import json
import logging

import gradio as gr
import numpy as np
from dotenv import load_dotenv

from pages.summarization_playground import custom_css, generate_answer
from utils.metric import metric_rouge_score
from utils.model import Model

load_dotenv()
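
# NOTE (assumption): the logging.info() calls below are silent under Python's
# default WARNING log level, so a minimal setup is sketched here to make them
# visible. Drop or adjust this if the surrounding app configures logging itself.
logging.basicConfig(level=logging.INFO)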


def display_results(response_list):
    """Render a list of evaluation results as collapsible HTML sections."""
    overall_score = np.mean(
        [r["metric_score"]["rouge_score"] for r in response_list]
    )
    html_output = f"<h2>Overall Score: {overall_score:.2f}</h2>"

    for i, item in enumerate(response_list, 1):
        rouge_score = item["metric_score"]["rouge_score"]
        # Escape HTML and preserve line breaks so the text renders safely.
        dialogue = html.escape(item["dialogue"]).replace("\n", "<br>")
        summary = html.escape(item["summary"]).replace("\n", "<br>")
        response = html.escape(item["response"]).replace("\n", "<br>")
        html_output += f"""
        <details>
            <summary>Response {i} (Rouge Score: {rouge_score:.2f})</summary>
            <div style="display: flex; justify-content: space-between;">
                <div style="width: 30%;">
                    <h3>Dialogue</h3>
                    <pre style="white-space: pre-wrap; word-wrap: break-word;">{dialogue}</pre>
                </div>
                <div style="width: 30%;">
                    <h3>Summary</h3>
                    <pre style="white-space: pre-wrap; word-wrap: break-word;">{summary}</pre>
                </div>
                <div style="width: 30%;">
                    <h3>Response</h3>
                    <pre style="white-space: pre-wrap; word-wrap: break-word;">{response}</pre>
                </div>
            </div>
        </details>
        """

    return html_output
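
# A hypothetical call, showing the entry shape display_results expects
# (keys as read above; values are made-up placeholders):
#
#     display_results([{
#         "dialogue": "A: hi\nB: hello",
#         "summary": "A short greeting.",
#         "response": "Two people exchange greetings.",
#         "metric_score": {"rouge_score": 0.42},
#     }])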


def process(model_selection, prompt):
    """Generate a summary for every test datapoint and score it with ROUGE."""
    response_list = []

    with open("test_samples/test_data.json", "r") as file:
        dataset = json.load(file)
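
    # Each datapoint is assumed to carry the three keys read below, e.g.:
    #   {"dialogue": "...", "format": "bullet points", "summary": "..."}
    # ("bullet points" is an illustrative value, not taken from the test set.)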
    for i, data in enumerate(dataset):
        logging.info(f"Start testing datapoint {i+1}")
        dialogue = data["dialogue"]
        output_format = data["format"]  # renamed to avoid shadowing the built-in format()
        summary = data["summary"]

        response = generate_answer(
            dialogue, model_selection, prompt + f" Output following {output_format} format."
        )
        rouge_score = metric_rouge_score(response, summary)

        response_list.append(
            {
                "dialogue": dialogue,
                "summary": summary,
                "response": response,
                "metric_score": {"rouge_score": rouge_score},
            }
        )
        logging.info(f"Completed testing datapoint {i+1}")

    # Return rendered HTML so the result can feed a gr.HTML component directly.
    return display_results(response_list)


def create_batch_evaluation_interface():
    """Build the Gradio page that runs the batch evaluation."""
    with gr.Blocks(
        theme=gr.themes.Soft(spacing_size="sm", text_size="sm"), css=custom_css
    ) as demo:
        gr.Markdown(
            "## Evaluation setup. This runs through every datapoint in "
            "test_data.json, generates a summary, evaluates it, and shows "
            "the results once finished."
        )
        model_dropdown = gr.Dropdown(
            choices=Model.__model_list__,
            label="Choose a model",
            value=Model.__model_list__[0],
        )
        template_text = gr.Textbox(
            value="Summarize the following dialogue",
            label="Input Prompting Template",
            lines=8,
            placeholder="Input your prompts",
        )
        submit_button = gr.Button("✨ Submit ✨")
        output = gr.HTML(label="Results")

        submit_button.click(
            process, inputs=[model_dropdown, template_text], outputs=output
        )

    return demo


if __name__ == "__main__":
    demo = create_batch_evaluation_interface()
    demo.launch()