Update app.py
app.py CHANGED
@@ -25,13 +25,13 @@ app = FastAPI()
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1)
+summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
 
-reader = easyocr.Reader(['en'])
+reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
 def clean_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'
+    text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
     text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
     text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
     return text.strip()
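The updated Reader line calls torch.cuda.is_available(), so the OCR model loads onto the GPU whenever CUDA is visible, while device=-1 still pins the summarization pipeline to the CPU. That call only works if torch is imported at the top of app.py, which lies outside this diff; the block below is only a sketch of the imports the changed lines appear to rely on, not the file's actual header:

    # Assumed imports (not shown in this diff) that the changed lines depend on.
    import re
    import torch
    import easyocr
    from fastapi import FastAPI
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline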
@@ -72,7 +72,7 @@ def extract_text(file_path: str, file_extension: str):
     except Exception as e:
         return "", f"Error reading {file_extension.upper()} file: {str(e)}"
 
-def chunk_text(text: str, max_tokens: int =
+def chunk_text(text: str, max_tokens: int = 950):
     try:
         sentences = sent_tokenize(text)
     except:
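Only chunk_text's signature and first two lines fall inside this hunk; the new default of max_tokens = 950 presumably leaves headroom under bart-large-cnn's 1024-token input limit. As a rough illustration of a sentence-based chunker consistent with the visible lines (the helper name and the word-count heuristic are assumptions, not the app's actual body):

    from nltk.tokenize import sent_tokenize

    def chunk_text_sketch(text: str, max_tokens: int = 950):
        # Pack whole sentences into chunks whose approximate size stays under
        # max_tokens; word count is used here as a cheap stand-in for tokens.
        try:
            sentences = sent_tokenize(text)
        except Exception:
            sentences = text.split(". ")  # crude fallback if NLTK punkt data is missing
        chunks, current, count = [], [], 0
        for sentence in sentences:
            n = len(sentence.split())
            if current and count + n > max_tokens:
                chunks.append(" ".join(current))
                current, count = [], 0
            current.append(sentence)
            count += n
        if current:
            chunks.append(" ".join(current))
        return chunks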
@@ -100,23 +100,22 @@ def generate_summary(text: str, length: str = "medium") -> str:
         "long": {"max_length": 300, "min_length": 210}
     }
     chunks = chunk_text(text)
-    … (removed lines 103-118 are not shown in this view)
-    final_summary = " ".join(summaries)
+    try:
+        summaries = summarizer(
+            chunks,
+            max_length=length_params[length]["max_length"],
+            min_length=length_params[length]["min_length"],
+            do_sample=False,
+            truncation=True,
+            no_repeat_ngram_size=2,
+            num_beams=2,
+            early_stopping=True
+        )
+        summary_texts = [s['summary_text'] for s in summaries]
+    except Exception as e:
+        summary_texts = [f"[Batch error: {str(e)}]"]
+
+    final_summary = " ".join(summary_texts)
     final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
     return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
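The new code summarizes all chunks in a single batched pipeline call: passing a list of strings to a transformers pipeline returns one dict per input, each carrying a 'summary_text' key (which is what the list comprehension that builds summary_texts reads), and the batch_size=4 set when the pipeline was constructed controls how many chunks go through the model per forward pass. A small illustrative call, with placeholder input strings:

    outputs = summarizer(
        ["first chunk of the document ...", "second chunk ..."],
        max_length=130,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    print(outputs[0]["summary_text"])  # one result dict per input chunk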