Spaces:

ikraamkb
/

Summarization

Running

App Files Files Community

ikraamkb committed 6 days ago

Commit

da10ca7

verified ·

1 Parent(s): 7cab805

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -70

app.py CHANGED Viewed

@@ -1,11 +1,5 @@
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException
-from fastapi.responses import JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
-import os
-import tempfile
-from gtts import gTTS
-from fpdf import FPDF
-import datetime
 import fitz  # PyMuPDF
 import docx
 import pptx
@@ -13,26 +7,22 @@ import openpyxl
 import re
 import nltk
 from nltk.tokenize import sent_tokenize
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import easyocr
-import shutil
 import hashlib
 nltk.download('punkt', quiet=True)
 app = FastAPI()
-# CORS Configuration
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Initialize models
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
@@ -40,6 +30,8 @@ model.eval()
 summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
 reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 summary_cache = {}
 def clean_text(text: str) -> str:
@@ -65,7 +57,7 @@ def extract_text(file_path: str, file_extension: str):
         elif file_extension == "docx":
             doc = docx.Document(file_path)
-            return clean_text("\n".join(p.text for p in doc.paragraphs), ""
         elif file_extension == "pptx":
             prs = pptx.Presentation(file_path)
@@ -77,6 +69,10 @@ def extract_text(file_path: str, file_extension: str):
             text = [" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True)]
             return clean_text("\n".join(text)), ""
         return "", "Unsupported file format"
     except Exception as e:
         return "", f"Error reading {file_extension.upper()} file: {str(e)}"
@@ -86,7 +82,7 @@ def chunk_text(text: str, max_tokens: int = 950):
         sentences = sent_tokenize(text)
     except:
         words = text.split()
-        sentences = [' '.join(words[i:i+20]) for i in range(0, len(words), 20]
     chunks = []
     current_chunk = ""
@@ -165,57 +161,71 @@ def create_pdf(summary: str, original_filename: str):
         print(f"Error creating PDF: {e}")
         return ""
-@app.post("/summarize/")
-async def summarize_api(file: UploadFile = File(...), length: str = Form("medium")):
-    # Validate file type
-    valid_types = [
-        'application/pdf',
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
-    ]
-    if file.content_type not in valid_types:
-        raise HTTPException(
-            status_code=400,
-            detail="Please upload a valid document (PDF, DOCX, PPTX, or XLSX)"
-        )
     try:
-        # Save temp file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp:
-            shutil.copyfileobj(file.file, temp)
-            temp_path = temp.name
-        # Process file
-        text, error = extract_text(temp_path, os.path.splitext(file.filename)[1][1:].lower())
-        if error:
-            raise HTTPException(status_code=400, detail=error)
-        summary = generate_summary(text, length)
-        audio_path = text_to_speech(summary)
-        pdf_path = create_pdf(summary, file.filename)
-        return {
-            "summary": summary,
-            "audio_url": f"/files/{os.path.basename(audio_path)}" if audio_path else None,
-            "pdf_url": f"/files/{os.path.basename(pdf_path)}" if pdf_path else None
-        }
-    except HTTPException:
-        raise
     except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=f"Summarization failed: {str(e)}"
         )
-    finally:
-        if 'temp_path' in locals() and os.path.exists(temp_path):
-            os.unlink(temp_path)
-@app.get("/files/{filename}")
-async def get_file(filename: str):
-    file_path = os.path.join(tempfile.gettempdir(), filename)
     if os.path.exists(file_path):
         return FileResponse(file_path)
-    raise HTTPException(status_code=404, detail="File not found")

+ import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import fitz  # PyMuPDF
 import docx
 import pptx
 import re
 import nltk
 from nltk.tokenize import sent_tokenize
 import torch
+from fastapi import FastAPI
+from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
+from gtts import gTTS
+import tempfile
+import os
 import easyocr
+from fpdf import FPDF
+import datetime
+from concurrent.futures import ThreadPoolExecutor
 import hashlib
 nltk.download('punkt', quiet=True)
 app = FastAPI()
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
 summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
 reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
+executor = ThreadPoolExecutor()
 summary_cache = {}
 def clean_text(text: str) -> str:
         elif file_extension == "docx":
             doc = docx.Document(file_path)
+            return clean_text("\n".join(p.text for p in doc.paragraphs)), ""
         elif file_extension == "pptx":
             prs = pptx.Presentation(file_path)
             text = [" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True)]
             return clean_text("\n".join(text)), ""
+        elif file_extension in ["jpg", "jpeg", "png"]:
+            ocr_result = reader.readtext(file_path, detail=0)
+            return clean_text("\n".join(ocr_result)), ""
         return "", "Unsupported file format"
     except Exception as e:
         return "", f"Error reading {file_extension.upper()} file: {str(e)}"
         sentences = sent_tokenize(text)
     except:
         words = text.split()
+        sentences = [' '.join(words[i:i+20]) for i in range(0, len(words), 20)]
     chunks = []
     current_chunk = ""
         print(f"Error creating PDF: {e}")
         return ""
def summarize_document(file, summary_length: str, enable_tts: bool = True):
    """Summarize an uploaded document and optionally produce audio and a PDF.

    Args:
        file: The upload to process. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes its path as ``.name``.
            ``None`` means nothing was uploaded.
        summary_length: Length preset forwarded to ``generate_summary``.
        enable_tts: When true, the summary is also passed to
            ``text_to_speech``; otherwise no audio is produced.

    Returns:
        A 4-tuple ``(message, "", audio_path, pdf_path)``. ``message`` is the
        summary on success, otherwise a human-readable error. The second
        element is always the empty string. ``audio_path`` and ``pdf_path``
        are ``None`` on every early-exit and error path.
    """
    if file is None:
        return "Please upload a document first", "", None, None

    # A plain path string has no ``.name`` attribute, so only fall back to
    # ``.name`` for file-like objects.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext yields "" for extension-less names instead of returning a
    # fragment of the directory path the way split(".")[-1] does.
    file_extension = os.path.splitext(file_path)[1].lstrip(".").lower()
    original_filename = os.path.basename(file_path)

    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "", None, None
    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "", None, None

    try:
        summary = generate_summary(text, summary_length)
        audio_path = text_to_speech(summary) if enable_tts else None
        pdf_path = create_pdf(summary, original_filename) if summary else None
        return summary, "", audio_path, pdf_path
    except Exception as e:
        # Report the failure in the summary slot so the caller can display it.
        return f"Summarization error: {str(e)}", "", None, None
# Gradio front end: upload a document, pick a summary length, and receive the
# summary text, an audio rendition, and a downloadable PDF.
with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Advanced Document Summarizer")
    gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")
    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".jpg", ".jpeg", ".png"],
                type="filepath"
            )
            length_radio = gr.Radio(
                ["short", "medium", "long"],
                value="medium",
                label="Summary Length"
            )
            submit_btn = gr.Button("Generate Summary", variant="primary")
        # Right column: outputs. The PDF widget stays hidden until a file exists.
        with gr.Column():
            output = gr.Textbox(label="Summary", lines=10)
            audio_output = gr.Audio(label="Audio Summary")
            pdf_download = gr.File(label="Download Summary as PDF", visible=False)

    def summarize_and_return_ui(file, summary_length):
        """Run ``summarize_document`` and map its result onto the three outputs.

        Returns a tuple ``(summary_text, audio_path_or_None, file_component)``
        matching ``outputs=[output, audio_output, pdf_download]``.
        """
        summary, _, audio_path, pdf_path = summarize_document(file, summary_length)
        # create_pdf reports failure with "" rather than None, so test
        # truthiness; otherwise the download widget would appear with no file.
        has_pdf = bool(pdf_path)
        return (
            summary,
            # text_to_speech is not visible here; normalise any falsy result
            # to None so the audio widget is simply left empty.
            audio_path or None,
            gr.File(visible=has_pdf, value=pdf_path if has_pdf else None)
        )

    submit_btn.click(
        fn=summarize_and_return_ui,
        inputs=[file_input, length_radio],
        outputs=[output, audio_output, pdf_download]
    )
@app.get("/files/{file_name}")
async def get_file(file_name: str):
    """Serve a file from the system temp directory by its bare file name.

    Args:
        file_name: Name of a file located directly inside
            ``tempfile.gettempdir()``. It comes from the request URL and is
            therefore untrusted.

    Returns:
        A ``FileResponse`` for an existing regular file, otherwise a JSON
        ``{"error": "File not found"}`` body with status 404.
    """
    # Accept only a plain file name: no directory components, no absolute
    # path (os.path.join would discard the temp dir), and no "." / "..".
    is_plain_name = (
        bool(file_name)
        and file_name == os.path.basename(file_name)
        and file_name not in (os.curdir, os.pardir)
    )
    if is_plain_name:
        file_path = os.path.join(tempfile.gettempdir(), file_name)
        # isfile rather than exists: a directory cannot be sent as a file.
        if os.path.isfile(file_path):
            return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)
# Mount the Gradio UI at the site root. Routes registered on `app` before this
# call (such as /files/{file_name}) are matched first; the mount handles every
# remaining path, including "/".
#
# The former `GET /` redirect handler is intentionally not registered: added
# after a root mount it could never be reached, and redirecting "/" to "/"
# would loop if it were.
app = gr.mount_gradio_app(app, demo, path="/")