mike23415 committed
Commit b46017c · verified · 1 Parent(s): b4aa0e4

Update app.py

Files changed (1)
  1. app.py +61 -15
app.py CHANGED
@@ -39,25 +39,71 @@ def allowed_file(filename):
     """Check if the uploaded file has an allowed extension."""
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
 
-def summarize_text(text, max_length=150, min_length=30):
-    """Summarize text using T5-Base."""
+def summarize_text(text, max_length=300, min_length=100):
+    """
+    Summarize text using T5-Base with improved parameters for more comprehensive summaries.
+
+    Args:
+        text (str): The text to summarize
+        max_length (int): Maximum length of the summary (increased from 150)
+        min_length (int): Minimum length of the summary (increased from 30)
+
+    Returns:
+        str: The generated summary
+    """
     try:
         if not text.strip():
             return "No text found in the document to summarize."
 
-        # Limit text length to prevent tokenizer errors
-        input_text = "summarize: " + text[:10000]  # Limiting to 10K chars to be safe
-        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-        summary_ids = model.generate(
-            inputs["input_ids"],
-            max_length=max_length,
-            min_length=min_length,
-            length_penalty=2.0,
-            num_beams=4,
-            early_stopping=True
-        )
-        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-        return summary
+        # Break text into chunks if it's very long
+        chunks = []
+        chunk_size = 4000  # Characters per chunk
+        for i in range(0, len(text), chunk_size):
+            chunks.append(text[i:i + chunk_size])
+
+        summaries = []
+        for i, chunk in enumerate(chunks):
+            # Only process up to 5 chunks to avoid very long processing times
+            if i >= 5:
+                summaries.append("... (Document continues)")
+                break
+
+            input_text = "summarize: " + chunk
+            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+
+            # Generate with improved parameters
+            summary_ids = model.generate(
+                inputs["input_ids"],
+                max_length=max_length // min(5, len(chunks)),  # Adjust max_length based on chunks
+                min_length=min_length // min(5, len(chunks)),  # Adjust min_length based on chunks
+                length_penalty=1.5,  # Reduced to avoid overly verbose summaries
+                num_beams=4,
+                early_stopping=True,
+                no_repeat_ngram_size=3  # Avoid repeating trigrams
+            )
+
+            chunk_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            summaries.append(chunk_summary)
+
+        # Combine summaries from all chunks
+        combined_summary = " ".join(summaries)
+
+        # For very short summaries, try again with the first chunk but longer output
+        if len(combined_summary.split()) < 50 and chunks:
+            input_text = "summarize: " + chunks[0]
+            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+            summary_ids = model.generate(
+                inputs["input_ids"],
+                max_length=max_length,
+                min_length=min_length,
+                length_penalty=2.0,
+                num_beams=5,
+                early_stopping=True,
+                repetition_penalty=2.5  # Penalize repetition more heavily
+            )
+            combined_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+        return combined_summary
     except Exception as e:
         logger.error(f"Error in T5 summarization: {str(e)}")
        return f"Error summarizing text: {str(e)}"