ikraamkb committed on
Commit
bef3ff2
·
verified ·
1 Parent(s): 30a8162

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -14
app.py CHANGED
@@ -18,7 +18,6 @@ from fpdf import FPDF
18
  import datetime
19
  from concurrent.futures import ThreadPoolExecutor
20
  import hashlib
21
- import asyncio
22
 
23
  nltk.download('punkt', quiet=True)
24
 
@@ -36,18 +35,13 @@ executor = ThreadPoolExecutor()
36
  summary_cache = {}
37
 
38
  def clean_text(text: str) -> str:
39
- text = text.encode("utf-8", errors="ignore").decode("utf-8")
40
  text = re.sub(r'\s+', ' ', text)
41
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
42
  text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
43
  text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
44
  return text.strip()
45
 
46
- async def async_ocr(path):
47
- loop = asyncio.get_event_loop()
48
- return await loop.run_in_executor(executor, lambda: reader.readtext(path, detail=0))
49
-
50
- async def extract_text(file_path: str, file_extension: str):
51
  try:
52
  if file_extension == "pdf":
53
  with fitz.open(file_path) as doc:
@@ -56,7 +50,7 @@ async def extract_text(file_path: str, file_extension: str):
56
  images = [page.get_pixmap() for page in doc]
57
  temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
58
  images[0].save(temp_img.name)
59
- ocr_result = await async_ocr(temp_img.name)
60
  os.unlink(temp_img.name)
61
  text = "\n".join(ocr_result) if ocr_result else text
62
  return clean_text(text), ""
@@ -76,7 +70,7 @@ async def extract_text(file_path: str, file_extension: str):
76
  return clean_text("\n".join(text)), ""
77
 
78
  elif file_extension in ["jpg", "jpeg", "png"]:
79
- ocr_result = await async_ocr(file_path)
80
  return clean_text("\n".join(ocr_result)), ""
81
 
82
  return "", "Unsupported file format"
@@ -156,6 +150,7 @@ def create_pdf(summary: str, original_filename: str):
156
  pdf.set_font("Arial", 'B', 16)
157
  pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
158
  pdf.set_font("Arial", size=12)
 
159
  pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
160
  pdf.ln(10)
161
  pdf.multi_cell(0, 10, txt=summary)
@@ -172,9 +167,7 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
172
  file_path = file.name
173
  file_extension = file_path.split(".")[-1].lower()
174
  original_filename = os.path.basename(file_path)
175
-
176
- loop = asyncio.get_event_loop()
177
- text, error = loop.run_until_complete(extract_text(file_path, file_extension))
178
  if error:
179
  return error, "", None, None
180
  if not text or len(text.split()) < 30:
@@ -188,7 +181,7 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
188
  return f"Summarization error: {str(e)}", "", None, None
189
 
190
  with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
191
- gr.Markdown("# \ud83d\udcc4 Advanced Document Summarizer")
192
  gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")
193
 
194
  with gr.Row():
@@ -235,4 +228,4 @@ app = gr.mount_gradio_app(app, demo, path="/")
235
 
236
  @app.get("/")
237
  def redirect_to_interface():
238
- return RedirectResponse(url="/")
 
18
  import datetime
19
  from concurrent.futures import ThreadPoolExecutor
20
  import hashlib
 
21
 
22
  nltk.download('punkt', quiet=True)
23
 
 
35
  summary_cache = {}
36
 
37
  def clean_text(text: str) -> str:
 
38
  text = re.sub(r'\s+', ' ', text)
39
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
40
  text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
41
  text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
42
  return text.strip()
43
 
44
+ def extract_text(file_path: str, file_extension: str):
 
 
 
 
45
  try:
46
  if file_extension == "pdf":
47
  with fitz.open(file_path) as doc:
 
50
  images = [page.get_pixmap() for page in doc]
51
  temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
52
  images[0].save(temp_img.name)
53
+ ocr_result = reader.readtext(temp_img.name, detail=0)
54
  os.unlink(temp_img.name)
55
  text = "\n".join(ocr_result) if ocr_result else text
56
  return clean_text(text), ""
 
70
  return clean_text("\n".join(text)), ""
71
 
72
  elif file_extension in ["jpg", "jpeg", "png"]:
73
+ ocr_result = reader.readtext(file_path, detail=0)
74
  return clean_text("\n".join(ocr_result)), ""
75
 
76
  return "", "Unsupported file format"
 
150
  pdf.set_font("Arial", 'B', 16)
151
  pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
152
  pdf.set_font("Arial", size=12)
153
+ pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
154
  pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
155
  pdf.ln(10)
156
  pdf.multi_cell(0, 10, txt=summary)
 
167
  file_path = file.name
168
  file_extension = file_path.split(".")[-1].lower()
169
  original_filename = os.path.basename(file_path)
170
+ text, error = extract_text(file_path, file_extension)
 
 
171
  if error:
172
  return error, "", None, None
173
  if not text or len(text.split()) < 30:
 
181
  return f"Summarization error: {str(e)}", "", None, None
182
 
183
  with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
184
+ gr.Markdown("# 📄 Advanced Document Summarizer")
185
  gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")
186
 
187
  with gr.Row():
 
228
 
229
  @app.get("/")
230
  def redirect_to_interface():
231
+ return RedirectResponse(url="/")