mike23415 committed
Commit b46017c · verified · 1 Parent(s): b4aa0e4

Update app.py

Files changed (1)
  1. app.py +61 -15
app.py CHANGED
@@ -39,25 +39,71 @@ def allowed_file(filename):
     """Check if the uploaded file has an allowed extension."""
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
 
-def summarize_text(text, max_length=150, min_length=30):
-    """Summarize text using T5-Base."""
+def summarize_text(text, max_length=300, min_length=100):
+    """
+    Summarize text using T5-Base with improved parameters for more comprehensive summaries.
+
+    Args:
+        text (str): The text to summarize
+        max_length (int): Maximum length of the summary (increased from 150)
+        min_length (int): Minimum length of the summary (increased from 30)
+
+    Returns:
+        str: The generated summary
+    """
     try:
         if not text.strip():
             return "No text found in the document to summarize."
 
-        # Limit text length to prevent tokenizer errors
-        input_text = "summarize: " + text[:10000]  # Limiting to 10K chars to be safe
-        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-        summary_ids = model.generate(
-            inputs["input_ids"],
-            max_length=max_length,
-            min_length=min_length,
-            length_penalty=2.0,
-            num_beams=4,
-            early_stopping=True
-        )
-        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-        return summary
+        # Break text into chunks if it's very long
+        chunks = []
+        chunk_size = 4000  # Characters per chunk
+        for i in range(0, len(text), chunk_size):
+            chunks.append(text[i:i + chunk_size])
+
+        summaries = []
+        for i, chunk in enumerate(chunks):
+            # Only process up to 5 chunks to avoid very long processing times
+            if i >= 5:
+                summaries.append("... (Document continues)")
+                break
+
+            input_text = "summarize: " + chunk
+            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+
+            # Generate with improved parameters
+            summary_ids = model.generate(
+                inputs["input_ids"],
+                max_length=max_length // min(5, len(chunks)),  # Adjust max_length based on chunks
+                min_length=min_length // min(5, len(chunks)),  # Adjust min_length based on chunks
+                length_penalty=1.5,  # Reduced to avoid overly verbose summaries
+                num_beams=4,
+                early_stopping=True,
+                no_repeat_ngram_size=3  # Avoid repeating trigrams
+            )
+
+            chunk_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            summaries.append(chunk_summary)
+
+        # Combine summaries from all chunks
+        combined_summary = " ".join(summaries)
+
+        # For very short summaries, try again with the first chunk but longer output
+        if len(combined_summary.split()) < 50 and chunks:
+            input_text = "summarize: " + chunks[0]
+            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+            summary_ids = model.generate(
+                inputs["input_ids"],
+                max_length=max_length,
+                min_length=min_length,
+                length_penalty=2.0,
+                num_beams=5,
+                early_stopping=True,
+                repetition_penalty=2.5  # Penalize repetition more heavily
+            )
+            combined_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+        return combined_summary
     except Exception as e:
         logger.error(f"Error in T5 summarization: {str(e)}")
        return f"Error summarizing text: {str(e)}"