Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -16,25 +16,24 @@ import os
|
|
16 |
import easyocr
|
17 |
from fpdf import FPDF
|
18 |
import datetime
|
19 |
-
import asyncio
|
20 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
21 |
|
22 |
nltk.download('punkt', quiet=True)
|
23 |
|
24 |
app = FastAPI()
|
25 |
|
26 |
-
# Load models
|
27 |
MODEL_NAME = "facebook/bart-large-cnn"
|
28 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
29 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
|
30 |
-
model.eval()
|
31 |
-
|
32 |
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
|
33 |
|
34 |
-
reader = easyocr.Reader(['en'
|
35 |
-
|
36 |
executor = ThreadPoolExecutor()
|
37 |
|
|
|
|
|
38 |
def clean_text(text: str) -> str:
|
39 |
text = re.sub(r'\s+', ' ', text)
|
40 |
text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
|
@@ -101,6 +100,10 @@ def chunk_text(text: str, max_tokens: int = 950):
|
|
101 |
return chunks
|
102 |
|
103 |
def generate_summary(text: str, length: str = "medium") -> str:
|
|
|
|
|
|
|
|
|
104 |
length_params = {
|
105 |
"short": {"max_length": 80, "min_length": 30},
|
106 |
"medium": {"max_length": 200, "min_length": 80},
|
@@ -124,7 +127,10 @@ def generate_summary(text: str, length: str = "medium") -> str:
|
|
124 |
|
125 |
final_summary = " ".join(summary_texts)
|
126 |
final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
|
127 |
-
|
|
|
|
|
|
|
128 |
|
129 |
def text_to_speech(text: str):
|
130 |
try:
|
@@ -161,8 +167,7 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
|
|
161 |
file_path = file.name
|
162 |
file_extension = file_path.split(".")[-1].lower()
|
163 |
original_filename = os.path.basename(file_path)
|
164 |
-
|
165 |
-
text, error = loop.run_until_complete(loop.run_in_executor(executor, extract_text, file_path, file_extension))
|
166 |
if error:
|
167 |
return error, "", None, None
|
168 |
if not text or len(text.split()) < 30:
|
|
|
16 |
import easyocr
|
17 |
from fpdf import FPDF
|
18 |
import datetime
|
|
|
19 |
from concurrent.futures import ThreadPoolExecutor
|
20 |
+
import hashlib
|
21 |
|
22 |
nltk.download('punkt', quiet=True)
|
23 |
|
24 |
app = FastAPI()
|
25 |
|
|
|
26 |
MODEL_NAME = "facebook/bart-large-cnn"
|
27 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
28 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
|
29 |
+
model.eval()
|
|
|
30 |
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
|
31 |
|
32 |
+
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
|
|
|
33 |
executor = ThreadPoolExecutor()
|
34 |
|
35 |
+
summary_cache = {}
|
36 |
+
|
37 |
def clean_text(text: str) -> str:
|
38 |
text = re.sub(r'\s+', ' ', text)
|
39 |
text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
|
|
|
100 |
return chunks
|
101 |
|
102 |
def generate_summary(text: str, length: str = "medium") -> str:
|
103 |
+
cache_key = hashlib.md5((text + length).encode()).hexdigest()
|
104 |
+
if cache_key in summary_cache:
|
105 |
+
return summary_cache[cache_key]
|
106 |
+
|
107 |
length_params = {
|
108 |
"short": {"max_length": 80, "min_length": 30},
|
109 |
"medium": {"max_length": 200, "min_length": 80},
|
|
|
127 |
|
128 |
final_summary = " ".join(summary_texts)
|
129 |
final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
|
130 |
+
final_summary = final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
|
131 |
+
|
132 |
+
summary_cache[cache_key] = final_summary
|
133 |
+
return final_summary
|
134 |
|
135 |
def text_to_speech(text: str):
|
136 |
try:
|
|
|
167 |
file_path = file.name
|
168 |
file_extension = file_path.split(".")[-1].lower()
|
169 |
original_filename = os.path.basename(file_path)
|
170 |
+
text, error = extract_text(file_path, file_extension)
|
|
|
171 |
if error:
|
172 |
return error, "", None, None
|
173 |
if not text or len(text.split()) < 30:
|