Spaces:

ikraamkb
/

Summarization

Running

ikraamkb committed 7 days ago

Commit

231ece3

verified ·

1 Parent(s): 1f58079

fast but this is faster

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,6 +16,8 @@ import os
 import easyocr
 from fpdf import FPDF
 import datetime
 nltk.download('punkt', quiet=True)
@@ -25,10 +27,14 @@ app = FastAPI()
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
 summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
 reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 def clean_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
     text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
@@ -82,7 +88,8 @@ def chunk_text(text: str, max_tokens: int = 950):
     chunks = []
     current_chunk = ""
     for sentence in sentences:
-        if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
             current_chunk += " " + sentence
         else:
             chunks.append(current_chunk.strip())
@@ -154,7 +161,8 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
     file_path = file.name
     file_extension = file_path.split(".")[-1].lower()
     original_filename = os.path.basename(file_path)
-    text, error = extract_text(file_path, file_extension)
     if error:
         return error, "", None, None
     if not text or len(text.split()) < 30:

 import easyocr
 from fpdf import FPDF
 import datetime
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 nltk.download('punkt', quiet=True)
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+model.eval()  # Optimization: inference mode
 summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
 reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
+executor = ThreadPoolExecutor()
 def clean_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
     text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
     chunks = []
     current_chunk = ""
     for sentence in sentences:
+        token_length = len(tokenizer.encode(current_chunk + " " + sentence))
+        if token_length <= max_tokens:
             current_chunk += " " + sentence
         else:
             chunks.append(current_chunk.strip())
     file_path = file.name
     file_extension = file_path.split(".")[-1].lower()
     original_filename = os.path.basename(file_path)
+    loop = asyncio.get_event_loop()
+    text, error = loop.run_until_complete(loop.run_in_executor(executor, extract_text, file_path, file_extension))
     if error:
         return error, "", None, None
     if not text or len(text.split()) < 30: