kz209 committed
Commit 031841d · 1 Parent(s): 8e22bd4

update format

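The changes below are purely stylistic: single-quoted strings become double-quoted, long calls are wrapped with one argument per line and trailing commas, and blank lines around top-level definitions are normalized. This is consistent with running an autoformatter such as black over the repository; the commit message only says "update format" and does not name a tool, so the command below is just an assumed sketch of how the formatting could be reproduced:

    # Assumption: the commit does not state which formatter was used.
    pip install black
    black app.py pages/ utils/
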
app.py CHANGED
@@ -13,13 +13,10 @@ This application is for **display** and is designed to facilitate **fast prototy
 
 Select a demo from the sidebar below to begin experimentation."""
 
+
 with gr.Blocks() as demo:
     with gr.Column(scale=4):
-        content = content = gr.Blocks(
-            gr.Markdown(
-                welcome_message()
-            )
-        )
+        content = content = gr.Blocks(gr.Markdown(welcome_message()))
 
     with gr.Tabs() as tabs:
         with gr.TabItem("Summarization"):

pages/__init__.py CHANGED
@@ -1,4 +1,4 @@
 # This is the __init__.py file for the utils package
 # You can add any initialization code or import statements here
 
-__all__ = ['arena', 'batch_evaluation', 'leaderboard', 'summarization_playground']
+__all__ = ["arena", "batch_evaluation", "leaderboard", "summarization_playground"]

pages/arena.py CHANGED
@@ -10,9 +10,10 @@ from utils.multiple_stream import stream_data
 
 def random_data_selection():
     datapoint = random.choice(dataset)
-    datapoint = datapoint['section_text'] + '\n\nDialogue:\n' + datapoint['dialogue']
+    datapoint = datapoint["section_text"] + "\n\nDialogue:\n" + datapoint["dialogue"]
     return datapoint
 
+
 def create_arena():
     with open("prompt/prompt.json", "r") as file:
         json_data = file.read()
@@ -21,19 +22,24 @@ def create_arena():
     with gr.Blocks(css=custom_css) as demo:
         with gr.Group():
             datapoint = random_data_selection()
-            gr.Markdown("""This arena is designed to compare different prompts. Click the button to stream responses from randomly shuffled prompts. Each column represents a response generated from one randomly selected prompt.
+            gr.Markdown(
+                """This arena is designed to compare different prompts. Click the button to stream responses from randomly shuffled prompts. Each column represents a response generated from one randomly selected prompt.
 
-Once the streaming is complete, you can choose the best response.\u2764\ufe0f""")
+Once the streaming is complete, you can choose the best response.\u2764\ufe0f"""
+            )
 
-            data_textbox = gr.Textbox(label="Data", lines=10, placeholder="Datapoints to test...", value=datapoint)
+            data_textbox = gr.Textbox(
+                label="Data",
+                lines=10,
+                placeholder="Datapoints to test...",
+                value=datapoint,
+            )
             with gr.Row():
                 random_selection_button = gr.Button("Change Data")
                 stream_button = gr.Button("✨ Click to Streaming ✨")
 
             random_selection_button.click(
-                fn=random_data_selection,
-                inputs=[],
-                outputs=[data_textbox]
+                fn=random_data_selection, inputs=[], outputs=[data_textbox]
             )
 
             random.shuffle(prompts)
@@ -42,43 +48,56 @@ Once the streaming is complete, you can choose the best response.\u2764\ufe0f"""
             # Store prompts in state components
             state_prompts = gr.State(value=prompts)
             state_random_selected_prompts = gr.State(value=random_selected_prompts)
-
+
             with gr.Row():
-                columns = [gr.Textbox(label=f"Prompt {i+1}", lines=10) for i in range(len(random_selected_prompts))]
-
+                columns = [
+                    gr.Textbox(label=f"Prompt {i+1}", lines=10)
+                    for i in range(len(random_selected_prompts))
+                ]
+
             model = get_model_batch_generation("Qwen/Qwen2-1.5B-Instruct")
 
             def start_streaming(data, random_selected_prompts):
-                content_list = [prompt['prompt'] + '\n{' + data + '}\n\nsummary:' for prompt in random_selected_prompts]
+                content_list = [
+                    prompt["prompt"] + "\n{" + data + "}\n\nsummary:"
+                    for prompt in random_selected_prompts
+                ]
                 for response_data in stream_data(content_list, model):
-                    updates = [gr.update(value=response_data[i]) for i in range(len(columns))]
+                    updates = [
+                        gr.update(value=response_data[i]) for i in range(len(columns))
+                    ]
                     yield tuple(updates)
-
+
             stream_button.click(
                 fn=start_streaming,
                 inputs=[data_textbox, state_random_selected_prompts],
                 outputs=columns,
-                show_progress=False
+                show_progress=False,
+            )
+
+            choice = gr.Radio(
+                label="Choose the best response:",
+                choices=["Response 1", "Response 2", "Response 3"],
             )
 
-            choice = gr.Radio(label="Choose the best response:", choices=["Response 1", "Response 2", "Response 3"])
-
             submit_button = gr.Button("Submit")
 
             output = gr.Textbox(label="You selected:", visible=False)
 
-            def update_prompt_metrics(selected_choice, prompts, random_selected_prompts):
+            def update_prompt_metrics(
+                selected_choice, prompts, random_selected_prompts
+            ):
                 if selected_choice == "Response 1":
-                    prompt_id = random_selected_prompts[0]['id']
+                    prompt_id = random_selected_prompts[0]["id"]
                 elif selected_choice == "Response 2":
-                    prompt_id = random_selected_prompts[1]['id']
+                    prompt_id = random_selected_prompts[1]["id"]
                 elif selected_choice == "Response 3":
-                    prompt_id = random_selected_prompts[2]['id']
+                    prompt_id = random_selected_prompts[2]["id"]
                 else:
                     raise ValueError(f"No corresponding response of {selected_choice}")
 
                 for prompt in prompts:
-                    if prompt['id'] == prompt_id:
+                    if prompt["id"] == prompt_id:
                         prompt["metric"]["winning_number"] += 1
                         break
                 else:
@@ -87,7 +106,11 @@ Once the streaming is complete, you can choose the best response.\u2764\ufe0f"""
                 with open("prompt/prompt.json", "w") as f:
                     json.dump(prompts, f)
 
-                return gr.update(value=f"You selected: {selected_choice}", visible=True), gr.update(interactive=False), gr.update(interactive=False)
+                return (
+                    gr.update(value=f"You selected: {selected_choice}", visible=True),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                )
 
             submit_button.click(
                 fn=update_prompt_metrics,
@@ -97,6 +120,7 @@ Once the streaming is complete, you can choose the best response.\u2764\ufe0f"""
 
     return demo
 
+
 if __name__ == "__main__":
     demo = create_arena()
     demo.queue()

pages/batch_evaluation.py CHANGED
@@ -12,21 +12,22 @@ from utils.model import Model
 
 load_dotenv()
 
+
 def display_results(response_list):
-    overall_score = np.mean([r['metric_score']['rouge_score'] for r in response_list])
-
+    overall_score = np.mean([r["metric_score"]["rouge_score"] for r in response_list])
+
     html_output = f"<h2>Overall Score: {overall_score:.2f}</h2>"
-
+
     for i, item in enumerate(response_list, 1):
-        dialogue = item['dialogue']
-        summary = item['summary']
-        response = item['response']
-        rouge_score = item['metric_score']['rouge_score']
-
-        dialogue = html.escape(item['dialogue']).replace('\n', '<br>')
-        summary = html.escape(item['summary']).replace('\n', '<br>')
-        response = html.escape(item['response']).replace('\n', '<br>')
-
+        dialogue = item["dialogue"]
+        summary = item["summary"]
+        response = item["response"]
+        rouge_score = item["metric_score"]["rouge_score"]
+
+        dialogue = html.escape(item["dialogue"]).replace("\n", "<br>")
+        summary = html.escape(item["summary"]).replace("\n", "<br>")
+        response = html.escape(item["response"]).replace("\n", "<br>")
+
         html_output += f"""
         <details>
         <summary>Response {i} (Rouge Score: {rouge_score:.2f})</summary>
@@ -49,6 +50,7 @@ def display_results(response_list):
 
     return html_output
 
+
 def process(model_selection, prompt, num=10):
     response_list = []
     with open("test_samples/test_data.json", "r") as file:
@@ -57,21 +59,21 @@ def process(model_selection, prompt, num=10):
 
     for i, data in enumerate(dataset):
         logging.info(f"Start testing datapoint {i+1}")
-        dialogue = data['dialogue']
-        format = data['format']
-        summary = data['summary']
-        response = generate_answer(dialogue, model_selection, prompt + f' Output following {format} format.')
+        dialogue = data["dialogue"]
+        format = data["format"]
+        summary = data["summary"]
+        response = generate_answer(
+            dialogue, model_selection, prompt + f" Output following {format} format."
+        )
 
         rouge_score = metric_rouge_score(response, summary)
 
         response_list.append(
             {
-                'dialogue': dialogue,
-                'summary': summary,
-                'response': response,
-                'metric_score': {
-                    'rouge_score': rouge_score
-                }
+                "dialogue": dialogue,
+                "summary": summary,
+                "response": response,
+                "metric_score": {"rouge_score": rouge_score},
             }
         )
 
@@ -81,22 +83,34 @@ def process(model_selection, prompt, num=10):
 
 
 def create_batch_evaluation_interface():
-    with gr.Blocks(theme=gr.themes.Soft(spacing_size="sm",text_size="sm"), css=custom_css) as demo:
-        gr.Markdown("## Here are evaluation setups. It will run though datapoints in test_data.josn to generate and evaluate. Show results once finished.")
+    with gr.Blocks(
+        theme=gr.themes.Soft(spacing_size="sm", text_size="sm"), css=custom_css
+    ) as demo:
+        gr.Markdown(
+            "## Here are evaluation setups. It will run though datapoints in test_data.josn to generate and evaluate. Show results once finished."
+        )
 
-        model_dropdown = gr.Dropdown(choices=Model.__model_list__, label="Choose a model", value=Model.__model_list__[0])
-        Template_text = gr.Textbox(value="""Summarize the following dialogue""", label='Input Prompting Template', lines=8, placeholder='Input your prompts')
+        model_dropdown = gr.Dropdown(
+            choices=Model.__model_list__,
+            label="Choose a model",
+            value=Model.__model_list__[0],
+        )
+        Template_text = gr.Textbox(
+            value="""Summarize the following dialogue""",
+            label="Input Prompting Template",
+            lines=8,
+            placeholder="Input your prompts",
+        )
         submit_button = gr.Button("✨ Submit ✨")
         output = gr.HTML(label="Results")
 
         submit_button.click(
-            process,
-            inputs=[model_dropdown, Template_text],
-            outputs=output
+            process, inputs=[model_dropdown, Template_text], outputs=output
        )
 
    return demo
 
+
 if __name__ == "__main__":
     demo = create_batch_evaluation_interface()
-    demo.launch()
+    demo.launch()

pages/leaderboard.py CHANGED
@@ -9,72 +9,90 @@ import pandas as pd
 def create_html_with_tooltip(id, base_url):
     return f'<a href="{base_url}"target="_blank">{id}</a>'
 
+
 # Load prompts from JSON
 with open("prompt/prompt.json", "r") as file:
     json_data = file.read()
     prompts = json.loads(json_data)
 
 # Prepare leaderboard data
-winning_rate = [prompt['metric']['winning_number'] for prompt in prompts]
-winning_rate = [round(num / sum(winning_rate), 4)for num in winning_rate]
+winning_rate = [prompt["metric"]["winning_number"] for prompt in prompts]
+winning_rate = [round(num / sum(winning_rate), 4) for num in winning_rate]
 data = {
-    'Rank': [i+1 for i in range(len(prompts))],
-    'Methods': [create_html_with_tooltip(prompt['id'], prompt['url']) for prompt in prompts],
-    'Rouge Score': [prompt['metric']['Rouge'] for prompt in prompts],
-    'Winning Rate': winning_rate,
-    'Authors': [prompt['author'] for prompt in prompts],
+    "Rank": [i + 1 for i in range(len(prompts))],
+    "Methods": [
+        create_html_with_tooltip(prompt["id"], prompt["url"]) for prompt in prompts
+    ],
+    "Rouge Score": [prompt["metric"]["Rouge"] for prompt in prompts],
+    "Winning Rate": winning_rate,
+    "Authors": [prompt["author"] for prompt in prompts],
 }
 
 # Create DataFrame and sort by Rouge Score
 df = pd.DataFrame(data)
-df.sort_values(by='Rouge Score', ascending=False, inplace=True, ignore_index=True)
-df['Rank'] = range(1, len(df) + 1)
+df.sort_values(by="Rouge Score", ascending=False, inplace=True, ignore_index=True)
+df["Rank"] = range(1, len(df) + 1)
 
 # Assign medals for top 3 authors
-medals = ['🏅', '🥈', '🥉']
+medals = ["🏅", "🥈", "🥉"]
 for i in range(3):
-    df.loc[i, 'Authors'] = f"{medals[i]} {df.loc[i, 'Authors']}"
+    df.loc[i, "Authors"] = f"{medals[i]} {df.loc[i, 'Authors']}"
+
 
 # Function to update the leaderboard
 def update_leaderboard(sort_by):
     sorted_df = df.sort_values(by=sort_by, ascending=False, ignore_index=True)
-    sorted_df['Rank'] = range(1, len(sorted_df) + 1)
+    sorted_df["Rank"] = range(1, len(sorted_df) + 1)
 
     # Convert DataFrame to HTML with clickable headers for sorting
     table_html = sorted_df.to_html(index=False, escape=False)
 
     # Add sorting links to column headers
     for column in sorted_df.columns:
-        table_html = table_html.replace(f'<th>{column}</th>',
-            f'<th><a href="#" onclick="sortBy(\'{column}\'); return false;">{column}</a></th>')
+        table_html = table_html.replace(
+            f"<th>{column}</th>",
+            f'<th><a href="#" onclick="sortBy(\'{column}\'); return false;">{column}</a></th>',
+        )
 
     return table_html
 
+
 # Define Gradio interface
 def create_leaderboard():
-    with gr.Blocks(css="""
+    with gr.Blocks(
+        css="""
     .tooltip { cursor: pointer; color: blue; text-decoration: underline; }
     table { border-collapse: collapse; width: 100%; }
     th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
     th { background-color: #f2f2f2; }
     #prompt-display { display: none; }
-    """) as demo:
+    """
+    ) as demo:
         gr.Markdown("# 🏆 Summarization Arena Leaderboard")
         with gr.Row():
-            gr.Markdown("[Blog](placeholder) | [GitHub](placeholder) | [Paper](placeholder) | [Dataset](placeholder) | [Twitter](placeholder) | [Discord](placeholder)")
-            gr.Markdown("Welcome to our open platform for evaluating LLM summarization capabilities.")
-
+            gr.Markdown(
+                "[Blog](placeholder) | [GitHub](placeholder) | [Paper](placeholder) | [Dataset](placeholder) | [Twitter](placeholder) | [Discord](placeholder)"
+            )
+            gr.Markdown(
+                "Welcome to our open platform for evaluating LLM summarization capabilities."
+            )
+
         # Dropdown for sorting
         sort_by = gr.Dropdown(list(df.columns), label="Sort by", value="Rouge Score")
 
         # Display the leaderboard
        leaderboard = gr.HTML(update_leaderboard("Rouge Score"), elem_id="leaderboard")
-
+
         # Change sorting when dropdown is changed
-        sort_by.change(fn=lambda sort: update_leaderboard(sort), inputs=sort_by, outputs=leaderboard)
+        sort_by.change(
+            fn=lambda sort: update_leaderboard(sort),
+            inputs=sort_by,
+            outputs=leaderboard,
+        )
 
     return demo
 
+
 # Launch Gradio interface
 if __name__ == "__main__":
     demo = create_leaderboard()

pages/summarization_playground.py CHANGED
@@ -65,27 +65,26 @@ input-label {
 }
 """
 
-__model_on_gpu__ = ''
+__model_on_gpu__ = ""
 model = {model_name: None for model_name in Model.__model_list__}
 
-random_label = '🔀 Random dialogue from dataset'
+random_label = "🔀 Random dialogue from dataset"
 examples = {
     "example 1": """Boston's injury reporting for Kristaps Porziņģis has been fairly coy. He missed Game 3, but his coach told reporters just before Game 4 that was technically available, but with a catch.
 Joe Mazzulla said Porziņģis would "only be used in specific instances, if necessary." That sounds like the team doesn't want to risk further injury to his dislocated Posterior Tibialis (or some other body part, due to overcompensation for the ankle), unless it's in a desperate situation.
 Being up 3-1, with Game 5 at home, doesn't qualify as desperate. So, expect the Celtics to continue slow-playing KP's return.
 It'd obviously be nice for Boston to have his rim protection and jump shooting back. It was missed in the Game 4 blowout, but the Celtics have also demonstrated they can win without the big man throughout this campaign.
 On top of winning Game 3 of this series, Boston is plus-10.9 points per 100 possessions when Porziņģis has been off the floor this regular and postseason.""",
-
     "example 2": """Prior to the Finals, we predicted that Dereck Lively II's minutes would swell over the course of the series, and that's starting to play out.
 He averaged 18.8 minutes in Games 1 and 2 and was up to 26.2 in Games 3 and 4. That's with the regulars being pulled long before the final buzzer in Friday's game, too.
 Expect the rookie's playing time to continue to climb in Game 5. It seems increasingly clear that coach Jason Kidd trusts him over the rest of Dallas' bigs, and it's not hard to see why.
 Lively has been absolutely relentless on the offensive glass all postseason. He makes solid decisions as a passer when his rolls don't immediately lead to dunks. And he's not a liability when caught defending guards or wings outside.
 All of that has led to postseason averages of 8.2 points, 7.6 rebounds, 1.4 assists and 1.0 blocks in just 21.9 minutes, as well as a double-double in 22 minutes of Game 4.
 Back in Boston, Kidd is going to rely on Lively even more. He'll play close to 30 minutes and reach double-figures in both scoring and rebounding again.""",
-
-    random_label: ""
+    random_label: "",
 }
 
+
 def model_device_check(model_name):
     global __model_on_gpu__
 
@@ -106,56 +105,134 @@ def get_model_batch_generation(model_name):
     return model[model_name]
 
 
-def generate_answer(sources, model_name, prompt, temperature=0.0001, max_new_tokens=500, do_sample=True):
+def generate_answer(
+    sources, model_name, prompt, temperature=0.0001, max_new_tokens=500, do_sample=True
+):
     model_device_check(model_name)
-    content = prompt + '\n{' + sources + '}\n\nsummary:'
-    answer = model[model_name].gen(content,temperature,max_new_tokens,do_sample)[0].strip()
+    content = prompt + "\n{" + sources + "}\n\nsummary:"
+    answer = (
+        model[model_name]
+        .gen(content, temperature, max_new_tokens, do_sample)[0]
+        .strip()
+    )
 
     return answer
 
-def process_input(input_text, model_selection, prompt, temperature=0.0001, max_new_tokens=500, do_sample=True):
+
+def process_input(
+    input_text,
+    model_selection,
+    prompt,
+    temperature=0.0001,
+    max_new_tokens=500,
+    do_sample=True,
+):
     if input_text:
         logging.info("Start generation")
-        response = generate_answer(input_text, model_selection, prompt, temperature, max_new_tokens, do_sample)
-        return f"## Original Dialogue:\n\n{input_text}\n\n## Summarization:\n\n{response}"
+        response = generate_answer(
+            input_text, model_selection, prompt, temperature, max_new_tokens, do_sample
+        )
+        return (
+            f"## Original Dialogue:\n\n{input_text}\n\n## Summarization:\n\n{response}"
+        )
     else:
         return "Please fill the input to generate outputs."
 
+
 def update_input(example):
     if example == random_label:
         datapoint = random.choice(dataset)
-        return datapoint['section_text'] + '\n\nDialogue:\n' + datapoint['dialogue']
+        return datapoint["section_text"] + "\n\nDialogue:\n" + datapoint["dialogue"]
     return examples[example]
 
+
 def create_summarization_interface():
-    with gr.Blocks(theme=gr.themes.Soft(spacing_size="sm",text_size="sm"), css=custom_css) as demo:
-        gr.Markdown("## This is a playground to test prompts for clinical dialogue summarizations")
+    with gr.Blocks(
+        theme=gr.themes.Soft(spacing_size="sm", text_size="sm"), css=custom_css
+    ) as demo:
+        gr.Markdown(
+            "## This is a playground to test prompts for clinical dialogue summarizations"
+        )
 
         with gr.Row():
-            example_dropdown = gr.Dropdown(choices=list(examples.keys()), label="Choose an example", value=random_label)
-            model_dropdown = gr.Dropdown(choices=Model.__model_list__, label="Choose a model", value=Model.__model_list__[0])
-
-        gr.Markdown("<div style='border: 4px solid white; padding: 3px; border-radius: 5px;width:100px;padding-top: 0.5px;padding-bottom: 10px;'><h3>Prompt 👥</h3></center></div>")
-        Template_text = gr.Textbox(value="""Summarize the following dialogue""", label='Input Prompting Template', lines=4, placeholder='Input your prompts')
+            example_dropdown = gr.Dropdown(
+                choices=list(examples.keys()),
+                label="Choose an example",
+                value=random_label,
+            )
+            model_dropdown = gr.Dropdown(
+                choices=Model.__model_list__,
+                label="Choose a model",
+                value=Model.__model_list__[0],
+            )
+
+        gr.Markdown(
+            "<div style='border: 4px solid white; padding: 3px; border-radius: 5px;width:100px;padding-top: 0.5px;padding-bottom: 10px;'><h3>Prompt 👥</h3></center></div>"
+        )
+        Template_text = gr.Textbox(
+            value="""Summarize the following dialogue""",
+            label="Input Prompting Template",
+            lines=4,
+            placeholder="Input your prompts",
+        )
         datapoint = random.choice(dataset)
-        input_text = gr.Textbox(label="Input Dialogue", lines=7, placeholder="Enter text here...", value=datapoint['section_text'] + '\n\nDialogue:\n' + datapoint['dialogue'])
+        input_text = gr.Textbox(
+            label="Input Dialogue",
+            lines=7,
+            placeholder="Enter text here...",
+            value=datapoint["section_text"] + "\n\nDialogue:\n" + datapoint["dialogue"],
+        )
         submit_button = gr.Button("✨ Submit ✨")
 
         with gr.Row():
            with gr.Column(scale=1):
-                gr.Markdown("<div style='border: 4px solid white; padding: 2px; border-radius: 5px;width:130px;padding-bottom: 10px;'><b><h3>Parameters 📈</h3></center></b></div>")
+                gr.Markdown(
+                    "<div style='border: 4px solid white; padding: 2px; border-radius: 5px;width:130px;padding-bottom: 10px;'><b><h3>Parameters 📈</h3></center></b></div>"
+                )
                 with gr.Column():
-                    temperature = gr.Number(label="Temperature",elem_classes="parameter-text", value=0.0001, minimum=0.000001, maximum=1.0)
-                    max_new_tokens = gr.Number(label="Max New Tokens",elem_classes="parameter-text", value=500, precision=0, minimum=0, maximum=500)
-                    do_sample = gr.Dropdown([True,False],label="Do Sample",elem_classes="parameter-text", value=True)
+                    temperature = gr.Number(
+                        label="Temperature",
+                        elem_classes="parameter-text",
+                        value=0.0001,
+                        minimum=0.000001,
+                        maximum=1.0,
+                    )
+                    max_new_tokens = gr.Number(
+                        label="Max New Tokens",
+                        elem_classes="parameter-text",
+                        value=500,
+                        precision=0,
+                        minimum=0,
+                        maximum=500,
+                    )
+                    do_sample = gr.Dropdown(
+                        [True, False],
+                        label="Do Sample",
+                        elem_classes="parameter-text",
+                        value=True,
+                    )
             with gr.Column(scale=3):
                 output = gr.Markdown(line_breaks=True)
 
-        example_dropdown.change(update_input, inputs=[example_dropdown], outputs=[input_text])
-        submit_button.click(process_input, inputs=[input_text,model_dropdown,Template_text,temperature,max_new_tokens,do_sample], outputs=[output])
+        example_dropdown.change(
+            update_input, inputs=[example_dropdown], outputs=[input_text]
+        )
+        submit_button.click(
+            process_input,
+            inputs=[
+                input_text,
+                model_dropdown,
+                Template_text,
+                temperature,
+                max_new_tokens,
+                do_sample,
+            ],
+            outputs=[output],
+        )
 
     return demo
 
+
 if __name__ == "__main__":
     demo = create_summarization_interface()
     demo.launch()

utils/__init__.py CHANGED
@@ -1,4 +1,4 @@
 # This is the __init__.py file for the utils package
 # You can add any initialization code or import statements here
 
-__all__ = ['multiple_stream', 'model', 'data', 'metric']
+__all__ = ["multiple_stream", "model", "data", "metric"]

utils/data.py CHANGED
@@ -1,4 +1,4 @@
-from datasets import load_dataset
-dialogsum = load_dataset('har1/MTS_Dialogue-Clinical_Note')
-dataset = list(dialogsum['train'])
-
+from datasets import load_dataset
+
+dialogsum = load_dataset("har1/MTS_Dialogue-Clinical_Note")
+dataset = list(dialogsum["train"])

utils/metric.py CHANGED
@@ -1,6 +1,7 @@
 from rouge_score import rouge_scorer
 
-scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
+
 
 def metric_rouge_score(pred, ref):
-    return scorer.score(pred, ref)['rougeL'].fmeasure
+    return scorer.score(pred, ref)["rougeL"].fmeasure

utils/model.py CHANGED
@@ -6,7 +6,8 @@ from huggingface_hub import login
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
 from vllm import LLM, SamplingParams
 
-login(token=os.getenv('HF_TOKEN'))
+login(token=os.getenv("HF_TOKEN"))
+
 
 class Model(torch.nn.Module):
     number_of_models = 0
@@ -15,17 +16,17 @@ class Model(torch.nn.Module):
         "lmsys/vicuna-7b-v1.5",
         "google-t5/t5-large",
         "mistralai/Mistral-7B-Instruct-v0.1",
-        "meta-llama/Meta-Llama-3.1-8B-Instruct"
+        "meta-llama/Meta-Llama-3.1-8B-Instruct",
     ]
 
     def __init__(self, model_name="Qwen/Qwen2-1.5B-Instruct") -> None:
         super(Model, self).__init__()
-
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.name = model_name
         self.use_vllm = model_name != "google-t5/t5-large"
 
-        logging.info(f'Start loading model {self.name}')
+        logging.info(f"Start loading model {self.name}")
 
         if self.use_vllm:
             # 使用vLLM加载模型
@@ -33,18 +34,16 @@ class Model(torch.nn.Module):
                 model=model_name,
                 dtype="half",
                 tokenizer=model_name,
-                trust_remote_code=True
+                trust_remote_code=True,
             )
         else:
             # 加载原始transformers模型
             self.model = AutoModelForSeq2SeqLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map="auto"
+                model_name, torch_dtype=torch.bfloat16, device_map="auto"
             )
             self.model.eval()
 
-        logging.info(f'Loaded model {self.name}')
+        logging.info(f"Loaded model {self.name}")
         self.update()
 
     @classmethod
@@ -56,13 +55,15 @@ class Model(torch.nn.Module):
             sampling_params = SamplingParams(
                 temperature=temp,
                 max_tokens=max_length,
-                #top_p=0.95 if do_sample else 1.0,
-                stop_token_ids=[self.tokenizer.eos_token_id]
+                # top_p=0.95 if do_sample else 1.0,
+                stop_token_ids=[self.tokenizer.eos_token_id],
             )
             outputs = self.llm.generate(content_list, sampling_params)
             return [output.outputs[0].text for output in outputs]
         else:
-            input_ids = self.tokenizer(content_list, return_tensors="pt", padding=True, truncation=True).input_ids.to(self.model.device)
+            input_ids = self.tokenizer(
+                content_list, return_tensors="pt", padding=True, truncation=True
+            ).input_ids.to(self.model.device)
             outputs = self.model.generate(
                 input_ids,
                 max_new_tokens=max_length,
@@ -70,7 +71,9 @@ class Model(torch.nn.Module):
                 temperature=temp,
                 eos_token_id=self.tokenizer.eos_token_id,
             )
-            return self.tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True)
+            return self.tokenizer.batch_decode(
+                outputs[:, input_ids.shape[1] :], skip_special_tokens=True
+            )
 
     def streaming(self, content_list, temp=0.001, max_length=500, do_sample=True):
         if self.use_vllm:
@@ -78,24 +81,28 @@ class Model(torch.nn.Module):
                 temperature=temp,
                 max_tokens=max_length,
                 top_p=0.95 if do_sample else 1.0,
-                stop_token_ids=[self.tokenizer.eos_token_id]
+                stop_token_ids=[self.tokenizer.eos_token_id],
             )
             outputs = self.llm.generate(content_list, sampling_params, stream=True)
-
+
             prev_token_ids = [[] for _ in content_list]
-
+
             for output in outputs:
                 for i, request_output in enumerate(output.outputs):
                     current_token_ids = request_output.token_ids
-                    new_token_ids = current_token_ids[len(prev_token_ids[i]):]
+                    new_token_ids = current_token_ids[len(prev_token_ids[i]) :]
                     prev_token_ids[i] = current_token_ids.copy()
-
+
                     for token_id in new_token_ids:
-                        token_text = self.tokenizer.decode(token_id, skip_special_tokens=True)
+                        token_text = self.tokenizer.decode(
+                            token_id, skip_special_tokens=True
+                        )
                         yield i, token_text
         else:
-            input_ids = self.tokenizer(content_list, return_tensors="pt", padding=True, truncation=True).input_ids.to(self.model.device)
-
+            input_ids = self.tokenizer(
+                content_list, return_tensors="pt", padding=True, truncation=True
+            ).input_ids.to(self.model.device)
+
             gen_kwargs = {
                 "input_ids": input_ids,
                 "do_sample": do_sample,
@@ -103,7 +110,7 @@ class Model(torch.nn.Module):
                 "eos_token_id": self.tokenizer.eos_token_id,
                 "max_new_tokens": 1,
                 "return_dict_in_generate": True,
-                "output_scores": True
+                "output_scores": True,
             }
 
             generated_tokens = 0
@@ -113,16 +120,26 @@ class Model(torch.nn.Module):
             while generated_tokens < max_length and len(active_sequences) > 0:
                 with torch.no_grad():
                     output = self.model.generate(**gen_kwargs)
-
+
                 next_tokens = output.sequences[:, -1].unsqueeze(-1)
-
+
                 for i, token in zip(active_sequences, next_tokens):
-                    yield i.item(), self.tokenizer.decode(token[0], skip_special_tokens=True)
+                    yield i.item(), self.tokenizer.decode(
+                        token[0], skip_special_tokens=True
+                    )
 
-                gen_kwargs["input_ids"] = torch.cat([gen_kwargs["input_ids"], next_tokens], dim=-1)
+                gen_kwargs["input_ids"] = torch.cat(
+                    [gen_kwargs["input_ids"], next_tokens], dim=-1
+                )
                 generated_tokens += 1
 
-                completed = (next_tokens.squeeze(-1) == self.tokenizer.eos_token_id).nonzero().squeeze(-1)
-                active_sequences = torch.tensor([i for i in active_sequences if i not in completed])
+                completed = (
+                    (next_tokens.squeeze(-1) == self.tokenizer.eos_token_id)
+                    .nonzero()
+                    .squeeze(-1)
+                )
+                active_sequences = torch.tensor(
+                    [i for i in active_sequences if i not in completed]
+                )
                 if len(active_sequences) > 0:
-                    gen_kwargs["input_ids"] = gen_kwargs["input_ids"][active_sequences]
+                    gen_kwargs["input_ids"] = gen_kwargs["input_ids"][active_sequences]

utils/multiple_stream.py CHANGED
@@ -7,32 +7,36 @@ TEST = """ Test of Time. A Benchmark for Evaluating LLMs on Temporal Reasoning.
 showcased remarkable reasoning capabilities, yet they remain susceptible to errors, particularly in temporal
 reasoning tasks involving complex temporal logic. """
 
+
 def generate_data_test():
     """Generator to yield words"""
     temp = copy.deepcopy(TEST)
     l1 = temp.split()
     random.shuffle(l1)
-    temp = ' '.join(l1)
+    temp = " ".join(l1)
     for word in temp.split(" "):
         yield word + " "
 
+
 def stream_data(content_list, model):
     """Stream data to three columns"""
     outputs = ["" for _ in content_list]
 
     # Use the gen method to handle batch generation
     generator = model.streaming(content_list)
-
+
     while True:
         updated = False
 
         try:
-            id, word = next(generator)  # Get the next generated word for the corresponding content
+            id, word = next(
+                generator
+            )  # Get the next generated word for the corresponding content
            outputs[id] += f"{word} "
             updated = True
         except StopIteration:
             break
-
+
         if updated:
             yield tuple(outputs)
 
@@ -41,21 +45,22 @@ def create_interface():
     with gr.Blocks() as demo:
         with gr.Group():
             with gr.Row():
-                columns = [gr.Textbox(label=f"Column {i+1}", lines=10) for i in range(3)]
-
+                columns = [
+                    gr.Textbox(label=f"Column {i+1}", lines=10) for i in range(3)
+                ]
+
         start_btn = gr.Button("Start Streaming")
-
+
         def start_streaming():
-            content_list = [col.value for col in columns]  # Get input texts from text boxes
+            content_list = [
+                col.value for col in columns
+            ]  # Get input texts from text boxes
             for data in stream_data(content_list):
                 updates = [gr.update(value=data[i]) for i in range(len(columns))]
                 yield tuple(updates)
-
+
         start_btn.click(
-            fn=start_streaming,
-            inputs=[],
-            outputs=columns,
-            show_progress=False
+            fn=start_streaming, inputs=[], outputs=columns, show_progress=False
         )
 
     return demo
@@ -64,4 +69,4 @@ def create_interface():
 if __name__ == "__main__":
     demo = create_interface()
     demo.queue()
-    demo.launch()
+    demo.launch()