Spaces:

UshaKiranmai
/

text_summarization

Running

File size: 3,587 Bytes

57682dc

import csv
import html
import io
import json

import gradio as gr
import pandas as pd
import torch
from fpdf import FPDF
from transformers import pipeline

# Build the summarization pipeline once, at import time, with float32 weights.
text_summary = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.float32,
)

def chunk_text(input_text, max_chunk_size=1024):
    """
    Split `input_text` into whitespace-delimited chunks of bounded size.

    Words are packed greedily, joined by single spaces, so that each chunk is
    at most `max_chunk_size` characters long. Words are never split: a single
    word longer than `max_chunk_size` becomes a chunk of its own (and is the
    only case in which a chunk exceeds the limit).

    Args:
        input_text: Text to split. Any run of whitespace separates words.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings, in original order. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    # Length of " ".join(current_chunk), tracked incrementally so the chunk
    # does not have to be re-joined for every word.
    current_length = 0

    for word in input_text.split():
        # A separating space is only needed when the chunk already has words.
        added_length = len(word) + (1 if current_chunk else 0)

        # Only flush a chunk that actually holds words; otherwise an
        # oversized first word would emit an empty chunk.
        if current_chunk and current_length + added_length > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += added_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def summary(input_text, max_length=130, min_length=30, output_format="Plain Text"):
    """
    Summarize `input_text` and render the result in the requested format.

    The text is split with `chunk_text`, each chunk is summarized with the
    module-level `text_summary` pipeline, and the per-chunk summaries are
    joined with single spaces.

    Args:
        input_text: Text to summarize.
        max_length: `max_length` forwarded to the summarization pipeline.
        min_length: `min_length` forwarded to the summarization pipeline;
            clamped so it never exceeds `max_length`.
        output_format: One of "Plain Text", "JSON", "HTML", "CSV",
            "Markdown", "PDF" or "Excel".

    Returns:
        A string. For "PDF" and "Excel" the summary is written to
        `summary.pdf` / `summary.xlsx` in the working directory and a status
        message naming the file is returned; for every other format the
        rendered summary itself is returned.

    Raises:
        ValueError: If `output_format` is not one of the supported formats.
    """
    supported_formats = (
        "Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel",
    )
    # Reject unknown formats before running the model instead of silently
    # returning None after doing all the work.
    if output_format not in supported_formats:
        raise ValueError(
            f"Unsupported output format: {output_format!r}. "
            f"Expected one of: {', '.join(supported_formats)}"
        )

    # The two length controls are independent, so the minimum can be set
    # above the maximum; keep the pair consistent.
    min_length = min(min_length, max_length)

    # Skip empty chunks so the model is never invoked on an empty string.
    chunks = [chunk for chunk in chunk_text(input_text) if chunk]
    summarized_chunks = [
        text_summary(chunk, max_length=max_length, min_length=min_length)[0]["summary_text"]
        for chunk in chunks
    ]
    summary_text = " ".join(summarized_chunks)

    if output_format == "Plain Text":
        return summary_text

    if output_format == "JSON":
        result = {
            "summary": summary_text,
            "chunk_count": len(chunks),
            "original_length": len(input_text.split()),
            "summary_length": len(summary_text.split()),
        }
        return json.dumps(result, indent=4)

    if output_format == "HTML":
        # Escape the text so characters such as < and & cannot break the markup.
        return f"<html><body><h2>Summary</h2><p>{html.escape(summary_text)}</p></body></html>"

    if output_format == "CSV":
        # Let the csv module handle quoting; text containing double quotes or
        # newlines would otherwise produce malformed rows.
        buffer = io.StringIO()
        writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerow(["Original Text", "Summary"])
        writer.writerows(zip(chunks, summarized_chunks))
        return buffer.getvalue()

    if output_format == "Markdown":
        return f"## Summary\n\n{summary_text}"

    if output_format == "PDF":
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        # The built-in core fonts only cover Latin-1; substitute anything
        # else rather than failing on characters such as curly quotes.
        printable_text = summary_text.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, printable_text)
        pdf_output = "summary.pdf"
        pdf.output(pdf_output)
        return f"PDF generated: {pdf_output}"

    # Only "Excel" remains after the validation above.
    df = pd.DataFrame({
        "Original Text": chunks,
        "Summary": summarized_chunks,
    })
    excel_output = "summary.xlsx"
    df.to_excel(excel_output, index=False)
    return f"Excel file generated: {excel_output}"

# Gradio front end: a text box, two length sliders and an output-format
# dropdown are passed, in that order, to `summary`; the result is shown in a
# single text box.
_input_components = [
    gr.Textbox(lines=10, label="Input Text"),
    gr.Slider(minimum=30, maximum=300, value=130, step=10, label="Max Length"),
    gr.Slider(minimum=20, maximum=100, value=30, step=10, label="Min Length"),
    gr.Dropdown(
        choices=["Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel"],
        value="Plain Text",
        label="Output Format",
    ),
]
_output_component = gr.Textbox(label="Summarized Output")

iface = gr.Interface(
    fn=summary,
    inputs=_input_components,
    outputs=_output_component,
    title="Text Summarization with Advanced Output Formats",
)

iface.launch()