ikraamkb committed on
Commit
231ece3
·
verified ·
1 Parent(s): 1f58079

fast but this is faster

Browse files
Files changed (1) hide show
  1. app.py +10 -2
app.py CHANGED
@@ -16,6 +16,8 @@ import os
16
  import easyocr
17
  from fpdf import FPDF
18
  import datetime
 
 
19
 
20
  nltk.download('punkt', quiet=True)
21
 
@@ -25,10 +27,14 @@ app = FastAPI()
25
  MODEL_NAME = "facebook/bart-large-cnn"
26
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
27
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
 
 
28
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
29
 
30
  reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
31
 
 
 
32
  def clean_text(text: str) -> str:
33
  text = re.sub(r'\s+', ' ', text)
34
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
@@ -82,7 +88,8 @@ def chunk_text(text: str, max_tokens: int = 950):
82
  chunks = []
83
  current_chunk = ""
84
  for sentence in sentences:
85
- if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
 
86
  current_chunk += " " + sentence
87
  else:
88
  chunks.append(current_chunk.strip())
@@ -154,7 +161,8 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
154
  file_path = file.name
155
  file_extension = file_path.split(".")[-1].lower()
156
  original_filename = os.path.basename(file_path)
157
- text, error = extract_text(file_path, file_extension)
 
158
  if error:
159
  return error, "", None, None
160
  if not text or len(text.split()) < 30:
 
16
  import easyocr
17
  from fpdf import FPDF
18
  import datetime
19
+ import asyncio
20
+ from concurrent.futures import ThreadPoolExecutor
21
 
22
  nltk.download('punkt', quiet=True)
23
 
 
27
  MODEL_NAME = "facebook/bart-large-cnn"
28
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
29
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
30
+ model.eval() # Optimization: inference mode
31
+
32
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
33
 
34
  reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
35
 
36
+ executor = ThreadPoolExecutor()
37
+
38
  def clean_text(text: str) -> str:
39
  text = re.sub(r'\s+', ' ', text)
40
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
 
88
  chunks = []
89
  current_chunk = ""
90
  for sentence in sentences:
91
+ token_length = len(tokenizer.encode(current_chunk + " " + sentence))
92
+ if token_length <= max_tokens:
93
  current_chunk += " " + sentence
94
  else:
95
  chunks.append(current_chunk.strip())
 
161
  file_path = file.name
162
  file_extension = file_path.split(".")[-1].lower()
163
  original_filename = os.path.basename(file_path)
164
+ loop = asyncio.get_event_loop()
165
+ text, error = loop.run_until_complete(loop.run_in_executor(executor, extract_text, file_path, file_extension))
166
  if error:
167
  return error, "", None, None
168
  if not text or len(text.split()) < 30: