Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -39,25 +39,71 @@ def allowed_file(filename):
|
|
39 |
"""Check if the uploaded file has an allowed extension."""
|
40 |
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
|
41 |
|
42 |
-
def summarize_text(text, max_length=
|
43 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
try:
|
45 |
if not text.strip():
|
46 |
return "No text found in the document to summarize."
|
47 |
|
48 |
-
#
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
except Exception as e:
|
62 |
logger.error(f"Error in T5 summarization: {str(e)}")
|
63 |
return f"Error summarizing text: {str(e)}"
|
|
|
39 |
"""Check if the uploaded file has an allowed extension."""
|
40 |
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
|
41 |
|
42 |
+
def summarize_text(text, max_length=300, min_length=100):
    """
    Generate an abstractive summary of *text* with the T5 model.

    Long documents are split into fixed-size character chunks, each chunk is
    summarized independently (at most five chunks, to bound processing time),
    and the per-chunk summaries are concatenated. If the combined result is
    very short, a second pass re-summarizes the first chunk alone with more
    generous generation settings.

    Args:
        text (str): The text to summarize.
        max_length (int): Maximum length of the summary (increased from 150).
        min_length (int): Minimum length of the summary (increased from 30).

    Returns:
        str: The generated summary, or a human-readable error message if
        the input is empty or generation fails.
    """
    try:
        if not text.strip():
            return "No text found in the document to summarize."

        # Split into character-based chunks. NOTE(review): the tokenizer call
        # below truncates each chunk to 512 tokens, so much of a 4000-char
        # chunk may be silently dropped — confirm chunk size vs. model limit.
        step = 4000
        chunks = [text[pos:pos + step] for pos in range(0, len(text), step)]

        # Per-chunk length budget so the combined output stays near the caller's
        # requested max_length / min_length.
        share = min(5, len(chunks))

        partial_summaries = []
        for chunk in chunks[:5]:
            encoded = tokenizer(
                "summarize: " + chunk,
                return_tensors="pt",
                max_length=512,
                truncation=True,
            )
            # Beam search with trigram blocking to curb repeated phrases.
            generated = model.generate(
                encoded["input_ids"],
                max_length=max_length // share,
                min_length=min_length // share,
                length_penalty=1.5,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3,
            )
            partial_summaries.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )

        # Only the first five chunks are processed, to avoid very long
        # processing times; mark that the document continues beyond them.
        if len(chunks) > 5:
            partial_summaries.append("... (Document continues)")

        combined_summary = " ".join(partial_summaries)

        # Fallback: if the combined summary came out very short, retry on the
        # first chunk alone with the full length budget, a stronger length
        # penalty, and a heavier repetition penalty.
        if len(combined_summary.split()) < 50 and chunks:
            encoded = tokenizer(
                "summarize: " + chunks[0],
                return_tensors="pt",
                max_length=512,
                truncation=True,
            )
            generated = model.generate(
                encoded["input_ids"],
                max_length=max_length,
                min_length=min_length,
                length_penalty=2.0,
                num_beams=5,
                early_stopping=True,
                repetition_penalty=2.5,
            )
            combined_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

        return combined_summary
    except Exception as e:
        logger.error(f"Error in T5 summarization: {str(e)}")
        return f"Error summarizing text: {str(e)}"