ikraamkb committed on
Commit 1f58079 · verified · 1 Parent(s): c98b8c8

Update app.py

Files changed (1)
  1. app.py +20 -21
app.py CHANGED
@@ -25,13 +25,13 @@ app = FastAPI()
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1)
+summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
 
-reader = easyocr.Reader(['en'])
+reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
 def clean_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'•\s*|\d\.\s+', '', text)
+    text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
     text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
     text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
     return text.strip()
@@ -72,7 +72,7 @@ def extract_text(file_path: str, file_extension: str):
     except Exception as e:
         return "", f"Error reading {file_extension.upper()} file: {str(e)}"
 
-def chunk_text(text: str, max_tokens: int = 768):
+def chunk_text(text: str, max_tokens: int = 950):
     try:
         sentences = sent_tokenize(text)
     except:
@@ -100,23 +100,22 @@ def generate_summary(text: str, length: str = "medium") -> str:
         "long": {"max_length": 300, "min_length": 210}
     }
     chunks = chunk_text(text)
-    summaries = []
-    for chunk in chunks:
-        try:
-            summary = summarizer(
-                chunk,
-                max_length=length_params[length]["max_length"],
-                min_length=length_params[length]["min_length"],
-                do_sample=False,
-                truncation=True,
-                no_repeat_ngram_size=2,
-                num_beams=2,
-                early_stopping=True
-            )
-            summaries.append(summary[0]['summary_text'])
-        except Exception as e:
-            summaries.append(f"[Chunk error: {str(e)}]")
-    final_summary = " ".join(summaries)
+    try:
+        summaries = summarizer(
+            chunks,
+            max_length=length_params[length]["max_length"],
+            min_length=length_params[length]["min_length"],
+            do_sample=False,
+            truncation=True,
+            no_repeat_ngram_size=2,
+            num_beams=2,
+            early_stopping=True
+        )
+        summary_texts = [s['summary_text'] for s in summaries]
+    except Exception as e:
+        summary_texts = [f"[Batch error: {str(e)}]"]
+
+    final_summary = " ".join(summary_texts)
     final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
     return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
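
Below is a minimal, self-contained sketch (not part of the commit) of the batched summarization path this change switches to: the whole list of chunks is passed to the pipeline in a single call instead of looping chunk by chunk. The chunk strings and the generation lengths here are placeholders; only the facebook/bart-large-cnn checkpoint and the shape of the pipeline call come from app.py.

    # Sketch only: a list input is treated by the transformers summarization
    # pipeline as a batch, returning one {'summary_text': ...} dict per chunk.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

    MODEL_NAME = "facebook/bart-large-cnn"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)

    chunks = [
        "Placeholder text for the first document chunk ...",
        "Placeholder text for the second document chunk ...",
    ]
    results = summarizer(
        chunks,            # list input -> batched generation, one result per chunk
        max_length=130,    # placeholder lengths, not the app's length_params values
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    print(" ".join(r["summary_text"] for r in results))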