Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -18,7 +18,6 @@ from fpdf import FPDF
|
|
18 |
import datetime
|
19 |
from concurrent.futures import ThreadPoolExecutor
|
20 |
import hashlib
|
21 |
-
import asyncio
|
22 |
|
23 |
nltk.download('punkt', quiet=True)
|
24 |
|
@@ -36,18 +35,13 @@ executor = ThreadPoolExecutor()
|
|
36 |
summary_cache = {}
|
37 |
|
38 |
def clean_text(text: str) -> str:
|
39 |
-
text = text.encode("utf-8", errors="ignore").decode("utf-8")
|
40 |
text = re.sub(r'\s+', ' ', text)
|
41 |
text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
|
42 |
text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
|
43 |
text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
|
44 |
return text.strip()
|
45 |
|
46 |
-
|
47 |
-
loop = asyncio.get_event_loop()
|
48 |
-
return await loop.run_in_executor(executor, lambda: reader.readtext(path, detail=0))
|
49 |
-
|
50 |
-
async def extract_text(file_path: str, file_extension: str):
|
51 |
try:
|
52 |
if file_extension == "pdf":
|
53 |
with fitz.open(file_path) as doc:
|
@@ -56,7 +50,7 @@ async def extract_text(file_path: str, file_extension: str):
|
|
56 |
images = [page.get_pixmap() for page in doc]
|
57 |
temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
|
58 |
images[0].save(temp_img.name)
|
59 |
-
ocr_result =
|
60 |
os.unlink(temp_img.name)
|
61 |
text = "\n".join(ocr_result) if ocr_result else text
|
62 |
return clean_text(text), ""
|
@@ -76,7 +70,7 @@ async def extract_text(file_path: str, file_extension: str):
|
|
76 |
return clean_text("\n".join(text)), ""
|
77 |
|
78 |
elif file_extension in ["jpg", "jpeg", "png"]:
|
79 |
-
ocr_result =
|
80 |
return clean_text("\n".join(ocr_result)), ""
|
81 |
|
82 |
return "", "Unsupported file format"
|
@@ -156,6 +150,7 @@ def create_pdf(summary: str, original_filename: str):
|
|
156 |
pdf.set_font("Arial", 'B', 16)
|
157 |
pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
|
158 |
pdf.set_font("Arial", size=12)
|
|
|
159 |
pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
|
160 |
pdf.ln(10)
|
161 |
pdf.multi_cell(0, 10, txt=summary)
|
@@ -172,9 +167,7 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
|
|
172 |
file_path = file.name
|
173 |
file_extension = file_path.split(".")[-1].lower()
|
174 |
original_filename = os.path.basename(file_path)
|
175 |
-
|
176 |
-
loop = asyncio.get_event_loop()
|
177 |
-
text, error = loop.run_until_complete(extract_text(file_path, file_extension))
|
178 |
if error:
|
179 |
return error, "", None, None
|
180 |
if not text or len(text.split()) < 30:
|
@@ -188,7 +181,7 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
|
|
188 |
return f"Summarization error: {str(e)}", "", None, None
|
189 |
|
190 |
with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
|
191 |
-
gr.Markdown("#
|
192 |
gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")
|
193 |
|
194 |
with gr.Row():
|
@@ -235,4 +228,4 @@ app = gr.mount_gradio_app(app, demo, path="/")
|
|
235 |
|
236 |
@app.get("/")
|
237 |
def redirect_to_interface():
|
238 |
-
return RedirectResponse(url="/")
|
|
|
18 |
import datetime
|
19 |
from concurrent.futures import ThreadPoolExecutor
|
20 |
import hashlib
|
|
|
21 |
|
22 |
nltk.download('punkt', quiet=True)
|
23 |
|
|
|
35 |
summary_cache = {}
|
36 |
|
37 |
def clean_text(text: str) -> str:
|
|
|
38 |
text = re.sub(r'\s+', ' ', text)
|
39 |
text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
|
40 |
text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
|
41 |
text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
|
42 |
return text.strip()
|
43 |
|
44 |
+
def extract_text(file_path: str, file_extension: str):
|
|
|
|
|
|
|
|
|
45 |
try:
|
46 |
if file_extension == "pdf":
|
47 |
with fitz.open(file_path) as doc:
|
|
|
50 |
images = [page.get_pixmap() for page in doc]
|
51 |
temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
|
52 |
images[0].save(temp_img.name)
|
53 |
+
ocr_result = reader.readtext(temp_img.name, detail=0)
|
54 |
os.unlink(temp_img.name)
|
55 |
text = "\n".join(ocr_result) if ocr_result else text
|
56 |
return clean_text(text), ""
|
|
|
70 |
return clean_text("\n".join(text)), ""
|
71 |
|
72 |
elif file_extension in ["jpg", "jpeg", "png"]:
|
73 |
+
ocr_result = reader.readtext(file_path, detail=0)
|
74 |
return clean_text("\n".join(ocr_result)), ""
|
75 |
|
76 |
return "", "Unsupported file format"
|
|
|
150 |
pdf.set_font("Arial", 'B', 16)
|
151 |
pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
|
152 |
pdf.set_font("Arial", size=12)
|
153 |
+
pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
|
154 |
pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
|
155 |
pdf.ln(10)
|
156 |
pdf.multi_cell(0, 10, txt=summary)
|
|
|
167 |
file_path = file.name
|
168 |
file_extension = file_path.split(".")[-1].lower()
|
169 |
original_filename = os.path.basename(file_path)
|
170 |
+
text, error = extract_text(file_path, file_extension)
|
|
|
|
|
171 |
if error:
|
172 |
return error, "", None, None
|
173 |
if not text or len(text.split()) < 30:
|
|
|
181 |
return f"Summarization error: {str(e)}", "", None, None
|
182 |
|
183 |
with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
|
184 |
+
gr.Markdown("# 📄 Advanced Document Summarizer")
|
185 |
gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")
|
186 |
|
187 |
with gr.Row():
|
|
|
228 |
|
229 |
@app.get("/")
|
230 |
def redirect_to_interface():
|
231 |
+
return RedirectResponse(url="/")
|