ikraamkb committed on
Commit
d9c0a34
·
verified ·
1 Parent(s): 577b48b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -9
app.py CHANGED
@@ -16,25 +16,24 @@ import os
16
  import easyocr
17
  from fpdf import FPDF
18
  import datetime
19
- import asyncio
20
  from concurrent.futures import ThreadPoolExecutor
 
21
 
22
  nltk.download('punkt', quiet=True)
23
 
24
  app = FastAPI()
25
 
26
- # Load models
27
  MODEL_NAME = "facebook/bart-large-cnn"
28
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
29
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
30
- model.eval() # Optimization: inference mode
31
-
32
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
33
 
34
- reader = easyocr.Reader(['en','fr'], gpu=torch.cuda.is_available())
35
-
36
  executor = ThreadPoolExecutor()
37
 
 
 
38
  def clean_text(text: str) -> str:
39
  text = re.sub(r'\s+', ' ', text)
40
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
@@ -101,6 +100,10 @@ def chunk_text(text: str, max_tokens: int = 950):
101
  return chunks
102
 
103
  def generate_summary(text: str, length: str = "medium") -> str:
 
 
 
 
104
  length_params = {
105
  "short": {"max_length": 80, "min_length": 30},
106
  "medium": {"max_length": 200, "min_length": 80},
@@ -124,7 +127,10 @@ def generate_summary(text: str, length: str = "medium") -> str:
124
 
125
  final_summary = " ".join(summary_texts)
126
  final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
127
- return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
 
 
 
128
 
129
  def text_to_speech(text: str):
130
  try:
@@ -161,8 +167,7 @@ def summarize_document(file, summary_length: str, enable_tts: bool = True):
161
  file_path = file.name
162
  file_extension = file_path.split(".")[-1].lower()
163
  original_filename = os.path.basename(file_path)
164
- loop = asyncio.get_event_loop()
165
- text, error = loop.run_until_complete(loop.run_in_executor(executor, extract_text, file_path, file_extension))
166
  if error:
167
  return error, "", None, None
168
  if not text or len(text.split()) < 30:
 
16
  import easyocr
17
  from fpdf import FPDF
18
  import datetime
 
19
  from concurrent.futures import ThreadPoolExecutor
20
+ import hashlib
21
 
22
  nltk.download('punkt', quiet=True)
23
 
24
  app = FastAPI()
25
 
 
26
  MODEL_NAME = "facebook/bart-large-cnn"
27
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
28
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
29
+ model.eval()
 
30
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
31
 
32
+ reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
33
  executor = ThreadPoolExecutor()
34
 
35
+ summary_cache = {}
36
+
37
  def clean_text(text: str) -> str:
38
  text = re.sub(r'\s+', ' ', text)
39
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
 
100
  return chunks
101
 
102
  def generate_summary(text: str, length: str = "medium") -> str:
103
+ cache_key = hashlib.md5((text + length).encode()).hexdigest()
104
+ if cache_key in summary_cache:
105
+ return summary_cache[cache_key]
106
+
107
  length_params = {
108
  "short": {"max_length": 80, "min_length": 30},
109
  "medium": {"max_length": 200, "min_length": 80},
 
127
 
128
  final_summary = " ".join(summary_texts)
129
  final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
130
+ final_summary = final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
131
+
132
+ summary_cache[cache_key] = final_summary
133
+ return final_summary
134
 
135
  def text_to_speech(text: str):
136
  try:
 
167
  file_path = file.name
168
  file_extension = file_path.split(".")[-1].lower()
169
  original_filename = os.path.basename(file_path)
170
+ text, error = extract_text(file_path, file_extension)
 
171
  if error:
172
  return error, "", None, None
173
  if not text or len(text.split()) < 30: