ikraamkb committed · Commit c98b8c8 · verified · 1 Parent(s): c114357

Update app.py

Files changed (1)
  1. app.py +39 -103
app.py CHANGED
@@ -17,42 +17,30 @@ import easyocr
 from fpdf import FPDF
 import datetime

-# Download required NLTK data
 nltk.download('punkt', quiet=True)

-# Initialize components
 app = FastAPI()

-# Load models (CPU optimized)
+# Load models
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-summarizer = pipeline(
-    "summarization",
-    model=model,
-    tokenizer=tokenizer,
-    device=-1,  # Force CPU usage
-    torch_dtype=torch.float32
-)
+summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1)

-# Initialize EasyOCR reader
-reader = easyocr.Reader(['en'])  # English only for faster initialization
+reader = easyocr.Reader(['en'])

 def clean_text(text: str) -> str:
-    """Clean and normalize document text"""
-    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
-    text = re.sub(r'•\s*|\d\.\s+', '', text)  # Remove bullets and numbering
-    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove brackets/parentheses
-    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)  # Remove page numbers
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'•\s*|\d\.\s+', '', text)
+    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
+    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
     return text.strip()

-def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
-    """Extract text from various document formats"""
+def extract_text(file_path: str, file_extension: str):
     try:
         if file_extension == "pdf":
             with fitz.open(file_path) as doc:
                 text = "\n".join(page.get_text("text") for page in doc)
-                # Try OCR for scanned PDFs if text extraction fails
                 if len(text.strip()) < 50:
                     images = [page.get_pixmap() for page in doc]
                     temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
@@ -61,71 +49,58 @@ def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
                     os.unlink(temp_img.name)
                     text = "\n".join(ocr_result) if ocr_result else text
             return clean_text(text), ""
-
+
         elif file_extension == "docx":
             doc = docx.Document(file_path)
             return clean_text("\n".join(p.text for p in doc.paragraphs)), ""
-
+
         elif file_extension == "pptx":
             prs = pptx.Presentation(file_path)
-            text = []
-            for slide in prs.slides:
-                for shape in slide.shapes:
-                    if hasattr(shape, "text"):
-                        text.append(shape.text)
+            text = [shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")]
             return clean_text("\n".join(text)), ""
-
+
         elif file_extension == "xlsx":
             wb = openpyxl.load_workbook(file_path, read_only=True)
-            text = []
-            for sheet in wb.sheetnames:
-                for row in wb[sheet].iter_rows(values_only=True):
-                    text.append(" ".join(str(cell) for cell in row if cell))
+            text = [" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True)]
             return clean_text("\n".join(text)), ""
-
+
         elif file_extension in ["jpg", "jpeg", "png"]:
             ocr_result = reader.readtext(file_path, detail=0)
             return clean_text("\n".join(ocr_result)), ""
-
+
         return "", "Unsupported file format"
     except Exception as e:
         return "", f"Error reading {file_extension.upper()} file: {str(e)}"

-def chunk_text(text: str, max_tokens: int = 768) -> list[str]:
-    """Split text into manageable chunks for summarization"""
+def chunk_text(text: str, max_tokens: int = 768):
     try:
         sentences = sent_tokenize(text)
     except:
-        # Fallback if sentence tokenization fails
         words = text.split()
         sentences = [' '.join(words[i:i+20]) for i in range(0, len(words), 20)]
-
+
     chunks = []
     current_chunk = ""
-
     for sentence in sentences:
         if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
             current_chunk += " " + sentence
         else:
             chunks.append(current_chunk.strip())
             current_chunk = sentence
-
+
     if current_chunk:
         chunks.append(current_chunk.strip())
-
+
     return chunks

 def generate_summary(text: str, length: str = "medium") -> str:
-    """Generate summary with appropriate length parameters"""
     length_params = {
         "short": {"max_length": 80, "min_length": 30},
         "medium": {"max_length": 200, "min_length": 80},
         "long": {"max_length": 300, "min_length": 210}
     }
-
     chunks = chunk_text(text)
     summaries = []
-
     for chunk in chunks:
         try:
             summary = summarizer(
@@ -141,14 +116,11 @@ def generate_summary(text: str, length: str = "medium") -> str:
             summaries.append(summary[0]['summary_text'])
         except Exception as e:
             summaries.append(f"[Chunk error: {str(e)}]")
-
-    # Combine and format the final summary
     final_summary = " ".join(summaries)
     final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
     return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"

-def text_to_speech(text: str) -> str:
-    """Convert text to speech and return temporary audio file path"""
+def text_to_speech(text: str):
     try:
         tts = gTTS(text)
         temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
@@ -158,28 +130,18 @@ def text_to_speech(text: str) -> str:
         print(f"Error in text-to-speech: {e}")
         return ""

-def create_pdf(summary: str, original_filename: str) -> str:
-    """Create a PDF file from the summary text"""
+def create_pdf(summary: str, original_filename: str):
     try:
-        # Create PDF object
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Arial", size=12)
-
-        # Add title
         pdf.set_font("Arial", 'B', 16)
         pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
         pdf.set_font("Arial", size=12)
-
-        # Add metadata
         pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
         pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
         pdf.ln(10)
-
-        # Add summary content
         pdf.multi_cell(0, 10, txt=summary)
-
-        # Save to temporary file
         temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
         pdf.output(temp_pdf.name)
         return temp_pdf.name
@@ -187,35 +149,29 @@ def create_pdf(summary: str, original_filename: str) -> str:
         print(f"Error creating PDF: {e}")
         return ""

-def summarize_document(file, summary_length: str, enable_tts: bool):
-    """Main processing function for Gradio interface"""
+def summarize_document(file, summary_length: str, enable_tts: bool = True):
     if file is None:
-        return "Please upload a document first", "Ready", None, None
-
+        return "Please upload a document first", "", None, None
     file_path = file.name
     file_extension = file_path.split(".")[-1].lower()
     original_filename = os.path.basename(file_path)
-
     text, error = extract_text(file_path, file_extension)
     if error:
-        return error, "Error", None, None
-
+        return error, "", None, None
     if not text or len(text.split()) < 30:
-        return "Document is too short or contains too little text to summarize", "Ready", None, None
-
+        return "Document is too short or contains too little text to summarize", "", None, None
     try:
         summary = generate_summary(text, summary_length)
         audio_path = text_to_speech(summary) if enable_tts else None
         pdf_path = create_pdf(summary, original_filename) if summary else None
-        return summary, "Summary complete", audio_path, pdf_path
+        return summary, "", audio_path, pdf_path
     except Exception as e:
-        return f"Summarization error: {str(e)}", "Error", None, None
+        return f"Summarization error: {str(e)}", "", None, None

-# Gradio Interface
 with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📄 Advanced Document Summarizer")
-    gr.Markdown("Upload a document to generate a summary with optional audio reading and PDF download")
-
+    gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")
+
     with gr.Row():
         with gr.Column():
             file_input = gr.File(
@@ -228,46 +184,27 @@ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
                 value="medium",
                 label="Summary Length"
             )
-            tts_checkbox = gr.Checkbox(
-                label="Enable Text-to-Speech",
-                value=False
-            )
             submit_btn = gr.Button("Generate Summary", variant="primary")
-
+
         with gr.Column():
             output = gr.Textbox(label="Summary", lines=10)
-            status = gr.Textbox(label="Status", interactive=False)
-            audio_output = gr.Audio(label="Audio Summary", visible=False)
+            audio_output = gr.Audio(label="Audio Summary")
             pdf_download = gr.File(label="Download Summary as PDF", visible=False)
-
-    def toggle_audio_visibility(enable_tts):
-        return gr.Audio(visible=enable_tts)
-
-    def update_ui(summary, status, audio_path, pdf_path):
+
+    def summarize_and_return_ui(file, summary_length):
+        summary, _, audio_path, pdf_path = summarize_document(file, summary_length)
         return (
             summary,
-            status,
-            gr.Audio(visible=audio_path is not None, value=audio_path),
+            audio_path,
             gr.File(visible=pdf_path is not None, value=pdf_path)
         )
-
-    tts_checkbox.change(
-        fn=toggle_audio_visibility,
-        inputs=tts_checkbox,
-        outputs=audio_output
-    )
-
+
     submit_btn.click(
-        fn=summarize_document,
-        inputs=[file_input, length_radio, tts_checkbox],
-        outputs=[output, status, audio_output, pdf_download]
-    ).then(
-        fn=update_ui,
-        inputs=[output, status, audio_output, pdf_download],
-        outputs=[output, status, audio_output, pdf_download]
+        fn=summarize_and_return_ui,
+        inputs=[file_input, length_radio],
+        outputs=[output, audio_output, pdf_download]
     )

-# FastAPI endpoints for files
 @app.get("/files/{file_name}")
 async def get_file(file_name: str):
     file_path = os.path.join(tempfile.gettempdir(), file_name)
@@ -275,7 +212,6 @@ async def get_file(file_name: str):
         return FileResponse(file_path)
     return JSONResponse({"error": "File not found"}, status_code=404)

-# Mount Gradio app to FastAPI
 app = gr.mount_gradio_app(app, demo, path="/")

 @app.get("/")
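
The updated file still mounts the Gradio Blocks UI onto the FastAPI instance at path "/", so everything is served as a single ASGI application. A minimal local run might look like the sketch below; it assumes the file is saved as app.py, that uvicorn is installed, and it uses port 7860 only because that is the usual Hugging Face Spaces convention (none of this is specified by the commit itself):

import uvicorn  # assumed to be installed; not part of app.py
from app import app  # the FastAPI instance with the Gradio UI mounted at "/"

if __name__ == "__main__":
    # Host and port are assumptions for local testing, not taken from the commit
    uvicorn.run(app, host="0.0.0.0", port=7860)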