fast but this is faster

app.py CHANGED
@@ -16,6 +16,8 @@ import os
 import easyocr
 from fpdf import FPDF
 import datetime
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 
 nltk.download('punkt', quiet=True)
 
@@ -25,10 +27,14 @@ app = FastAPI()
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+model.eval() # Optimization: inference mode
+
 summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
 
 reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
+executor = ThreadPoolExecutor()
+
 def clean_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
     text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
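A note on the model.eval() addition above: eval() switches the model out of training mode (disabling dropout), but it does not stop autograd from recording operations; that is handled separately by torch.no_grad() or torch.inference_mode(). The transformers pipeline already runs generation without gradients internally, so the following is only an illustrative sketch of the full pattern, not something this commit needs:

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
mdl.eval()  # inference mode for the module: dropout off

with torch.inference_mode():  # additionally skip autograd bookkeeping
    batch = tok("An example passage to summarize.", return_tensors="pt")
    out = mdl.generate(**batch, max_length=40)
print(tok.decode(out[0], skip_special_tokens=True))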
@@ -82,7 +88,8 @@ def chunk_text(text: str, max_tokens: int = 950):
     chunks = []
     current_chunk = ""
     for sentence in sentences:
-
+        token_length = len(tokenizer.encode(current_chunk + " " + sentence))
+        if token_length <= max_tokens:
             current_chunk += " " + sentence
         else:
             chunks.append(current_chunk.strip())
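The new condition above measures chunk size in actual model tokens (the removed line's content did not survive in this view), so each chunk is guaranteed to fit the max_tokens budget the summarizer expects. A self-contained sketch of the resulting function, with two assumptions flagged: sentence splitting via nltk.sent_tokenize (the app downloads the punkt model, but that code is outside this hunk), and the current_chunk = sentence reset in the else branch, which the hunk cuts off before showing:

import nltk
from transformers import AutoTokenizer

nltk.download('punkt', quiet=True)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def chunk_text(text: str, max_tokens: int = 950):
    # Greedily pack sentences into chunks of at most max_tokens model tokens.
    chunks, current_chunk = [], ""
    for sentence in nltk.sent_tokenize(text):
        token_length = len(tokenizer.encode(current_chunk + " " + sentence))
        if token_length <= max_tokens:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence  # assumption: next chunk starts here
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

One caveat for a commit aimed at speed: re-encoding the whole growing chunk on every sentence is quadratic in chunk length; encoding each sentence once and keeping a running token count would do the same job in linear time.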
@@ -154,7 +161,8 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
     file_path = file.name
     file_extension = file_path.split(".")[-1].lower()
     original_filename = os.path.basename(file_path)
-    text, error = extract_text(file_path, file_extension)
+    loop = asyncio.get_event_loop()
+    text, error = loop.run_until_complete(loop.run_in_executor(executor, extract_text, file_path, file_extension))
     if error:
         return error, "", None, None
     if not text or len(text.split()) < 30:
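On the extraction change above: loop.run_in_executor() schedules the blocking extract_text() call on the shared ThreadPoolExecutor, and run_until_complete() then waits for it, so a synchronous caller still blocks until extraction finishes; the pattern pays off when other coroutines can run on the same loop in the meantime. A minimal sketch of both the loop-based form and the plain-executor equivalent; slow_extract is a hypothetical stand-in for the app's extract_text, and the sketch uses asyncio.new_event_loop() because asyncio.get_event_loop() is deprecated outside a running loop on recent Python versions:

import asyncio
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor()

def slow_extract(path: str, ext: str):
    # hypothetical stand-in for the app's extract_text(file_path, file_extension)
    return f"text extracted from {path} ({ext})", None

# Form used in the commit, with an explicitly created loop.
loop = asyncio.new_event_loop()
try:
    text, error = loop.run_until_complete(
        loop.run_in_executor(executor, slow_extract, "doc.pdf", "pdf")
    )
finally:
    loop.close()

# Equivalent for purely synchronous callers: no event loop needed.
text, error = executor.submit(slow_extract, "doc.pdf", "pdf").result()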