ikraamkb committed on
Commit
da10ca7
·
verified ·
1 Parent(s): 7cab805

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -70
app.py CHANGED
@@ -1,11 +1,5 @@
1
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException
2
- from fastapi.responses import JSONResponse
3
- from fastapi.middleware.cors import CORSMiddleware
4
- import os
5
- import tempfile
6
- from gtts import gTTS
7
- from fpdf import FPDF
8
- import datetime
9
  import fitz # PyMuPDF
10
  import docx
11
  import pptx
@@ -13,26 +7,22 @@ import openpyxl
13
  import re
14
  import nltk
15
  from nltk.tokenize import sent_tokenize
16
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
17
  import torch
 
 
 
 
 
18
  import easyocr
19
- import shutil
 
 
20
  import hashlib
21
 
22
  nltk.download('punkt', quiet=True)
23
 
24
  app = FastAPI()
25
 
26
- # CORS Configuration
27
- app.add_middleware(
28
- CORSMiddleware,
29
- allow_origins=["*"],
30
- allow_credentials=True,
31
- allow_methods=["*"],
32
- allow_headers=["*"],
33
- )
34
-
35
- # Initialize models
36
  MODEL_NAME = "facebook/bart-large-cnn"
37
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
38
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
@@ -40,6 +30,8 @@ model.eval()
40
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
41
 
42
  reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
 
43
  summary_cache = {}
44
 
45
  def clean_text(text: str) -> str:
@@ -65,7 +57,7 @@ def extract_text(file_path: str, file_extension: str):
65
 
66
  elif file_extension == "docx":
67
  doc = docx.Document(file_path)
68
- return clean_text("\n".join(p.text for p in doc.paragraphs), ""
69
 
70
  elif file_extension == "pptx":
71
  prs = pptx.Presentation(file_path)
@@ -77,6 +69,10 @@ def extract_text(file_path: str, file_extension: str):
77
  text = [" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True)]
78
  return clean_text("\n".join(text)), ""
79
 
 
 
 
 
80
  return "", "Unsupported file format"
81
  except Exception as e:
82
  return "", f"Error reading {file_extension.upper()} file: {str(e)}"
@@ -86,7 +82,7 @@ def chunk_text(text: str, max_tokens: int = 950):
86
  sentences = sent_tokenize(text)
87
  except:
88
  words = text.split()
89
- sentences = [' '.join(words[i:i+20]) for i in range(0, len(words), 20]
90
 
91
  chunks = []
92
  current_chunk = ""
@@ -165,57 +161,71 @@ def create_pdf(summary: str, original_filename: str):
165
  print(f"Error creating PDF: {e}")
166
  return ""
167
 
168
- @app.post("/summarize/")
169
- async def summarize_api(file: UploadFile = File(...), length: str = Form("medium")):
170
- # Validate file type
171
- valid_types = [
172
- 'application/pdf',
173
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
174
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
175
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
176
- ]
177
-
178
- if file.content_type not in valid_types:
179
- raise HTTPException(
180
- status_code=400,
181
- detail="Please upload a valid document (PDF, DOCX, PPTX, or XLSX)"
182
- )
183
-
184
  try:
185
- # Save temp file
186
- with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp:
187
- shutil.copyfileobj(file.file, temp)
188
- temp_path = temp.name
189
-
190
- # Process file
191
- text, error = extract_text(temp_path, os.path.splitext(file.filename)[1][1:].lower())
192
- if error:
193
- raise HTTPException(status_code=400, detail=error)
194
-
195
- summary = generate_summary(text, length)
196
- audio_path = text_to_speech(summary)
197
- pdf_path = create_pdf(summary, file.filename)
198
-
199
- return {
200
- "summary": summary,
201
- "audio_url": f"/files/{os.path.basename(audio_path)}" if audio_path else None,
202
- "pdf_url": f"/files/{os.path.basename(pdf_path)}" if pdf_path else None
203
- }
204
-
205
- except HTTPException:
206
- raise
207
  except Exception as e:
208
- raise HTTPException(
209
- status_code=500,
210
- detail=f"Summarization failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  )
212
- finally:
213
- if 'temp_path' in locals() and os.path.exists(temp_path):
214
- os.unlink(temp_path)
215
 
216
- @app.get("/files/{filename}")
217
- async def get_file(filename: str):
218
- file_path = os.path.join(tempfile.gettempdir(), filename)
 
 
 
 
 
 
219
  if os.path.exists(file_path):
220
  return FileResponse(file_path)
221
- raise HTTPException(status_code=404, detail="File not found")
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 
 
 
 
 
 
3
  import fitz # PyMuPDF
4
  import docx
5
  import pptx
 
7
  import re
8
  import nltk
9
  from nltk.tokenize import sent_tokenize
 
10
  import torch
11
+ from fastapi import FastAPI
12
+ from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
13
+ from gtts import gTTS
14
+ import tempfile
15
+ import os
16
  import easyocr
17
+ from fpdf import FPDF
18
+ import datetime
19
+ from concurrent.futures import ThreadPoolExecutor
20
  import hashlib
21
 
22
  nltk.download('punkt', quiet=True)
23
 
24
  app = FastAPI()
25
 
 
 
 
 
 
 
 
 
 
 
26
  MODEL_NAME = "facebook/bart-large-cnn"
27
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
28
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
 
30
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
31
 
32
  reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
33
+ executor = ThreadPoolExecutor()
34
+
35
  summary_cache = {}
36
 
37
  def clean_text(text: str) -> str:
 
57
 
58
  elif file_extension == "docx":
59
  doc = docx.Document(file_path)
60
+ return clean_text("\n".join(p.text for p in doc.paragraphs)), ""
61
 
62
  elif file_extension == "pptx":
63
  prs = pptx.Presentation(file_path)
 
69
  text = [" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True)]
70
  return clean_text("\n".join(text)), ""
71
 
72
+ elif file_extension in ["jpg", "jpeg", "png"]:
73
+ ocr_result = reader.readtext(file_path, detail=0)
74
+ return clean_text("\n".join(ocr_result)), ""
75
+
76
  return "", "Unsupported file format"
77
  except Exception as e:
78
  return "", f"Error reading {file_extension.upper()} file: {str(e)}"
 
82
  sentences = sent_tokenize(text)
83
  except:
84
  words = text.split()
85
+ sentences = [' '.join(words[i:i+20]) for i in range(0, len(words), 20)]
86
 
87
  chunks = []
88
  current_chunk = ""
 
161
  print(f"Error creating PDF: {e}")
162
  return ""
163
 
164
def summarize_document(file, summary_length: str, enable_tts: bool = True):
    """Summarize an uploaded document and optionally produce audio and PDF copies.

    Args:
        file: The uploaded document. Either a filesystem path string or an
            object exposing its path through a ``name`` attribute. ``None``
            means nothing was uploaded.
        summary_length: Length preset, forwarded unchanged to
            ``generate_summary``.
        enable_tts: When true, the summary is also passed to
            ``text_to_speech``.

    Returns:
        A 4-tuple ``(message, "", audio_path, pdf_path)``. ``message`` is the
        summary on success, otherwise a human-readable error. The second slot
        is always an empty string. ``audio_path`` and ``pdf_path`` are ``None``
        whenever the corresponding artifact was not produced.
    """
    if file is None:
        return "Please upload a document first", "", None, None

    # Accept both a plain path string and a wrapper object carrying ``.name``.
    file_path = getattr(file, "name", file)
    # splitext only inspects the final path component, so a dot in a parent
    # directory name is never mistaken for the file extension.
    file_extension = os.path.splitext(file_path)[1].lstrip(".").lower()
    original_filename = os.path.basename(file_path)

    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "", None, None
    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "", None, None

    try:
        summary = generate_summary(text, summary_length)
        # create_pdf returns "" when PDF generation fails; text_to_speech is
        # assumed to behave similarly (TODO confirm). Normalise both to None so
        # callers only have to deal with a single "missing" value.
        audio_path = (text_to_speech(summary) or None) if enable_tts else None
        pdf_path = (create_pdf(summary, original_filename) or None) if summary else None
        return summary, "", audio_path, pdf_path
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the handler.
        return f"Summarization error: {str(e)}", "", None, None
182
+
183
# Gradio front end: upload + length selector on the left, results on the right.
with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Advanced Document Summarizer")
    gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".jpg", ".jpeg", ".png"],
                type="filepath",
            )
            length_radio = gr.Radio(
                ["short", "medium", "long"],
                value="medium",
                label="Summary Length",
            )
            submit_btn = gr.Button("Generate Summary", variant="primary")

        with gr.Column():
            output = gr.Textbox(label="Summary", lines=10)
            audio_output = gr.Audio(label="Audio Summary")
            # Hidden until a PDF has actually been produced.
            pdf_download = gr.File(label="Download Summary as PDF", visible=False)

    def summarize_and_return_ui(file, summary_length):
        """Run the summarizer and map its result onto the three output widgets.

        Returns the summary (or error) text, the audio path (``None`` when
        absent) and an updated PDF download component.
        """
        summary, _, audio_path, pdf_path = summarize_document(file, summary_length)
        # A failed PDF export can come back as an empty string rather than
        # None; treat any falsy path as "no PDF" so the download widget is
        # not shown with nothing to download.
        has_pdf = bool(pdf_path)
        return (
            summary,
            audio_path or None,
            gr.File(visible=has_pdf, value=pdf_path if has_pdf else None),
        )

    submit_btn.click(
        fn=summarize_and_return_ui,
        inputs=[file_input, length_radio],
        outputs=[output, audio_output, pdf_download],
    )
219
+
220
@app.get("/files/{file_name}")
async def get_file(file_name: str):
    """Serve a file from the system temporary directory by its bare name.

    Args:
        file_name: Name of a file located directly inside
            ``tempfile.gettempdir()``.

    Returns:
        A ``FileResponse`` for an existing regular file, otherwise a JSON
        ``{"error": "File not found"}`` body with status 404.
    """
    # Only a bare file name is acceptable: anything carrying a directory part,
    # or naming the directory itself / its parent, must not be resolved.
    is_bare_name = (
        file_name not in ("", ".", "..")
        and os.path.basename(file_name) == file_name
    )
    if is_bare_name:
        file_path = os.path.join(tempfile.gettempdir(), file_name)
        # isfile (not exists) so that a directory is never handed to FileResponse.
        if os.path.isfile(file_path):
            return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)
226
+
227
# Attach the Gradio interface to the FastAPI application at the site root.
app = gr.mount_gradio_app(app, demo, path="/")


# NOTE(review): this route is registered after the Gradio app has been mounted
# at the same path, and it redirects to its own URL. It looks like it is either
# shadowed by the mount or would redirect to itself - confirm whether it is
# still needed.
@app.get("/")
def redirect_to_interface():
    """Send the client to the root URL."""
    return RedirectResponse(url="/")