Spaces:

adithya747
/

website-summarizer

Running

App Files Files Community

adithya747 committed on Feb 12

Commit

501ba1b

verified ·

1 Parent(s): 34cb312

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -21

app.py CHANGED Viewed

@@ -3,38 +3,68 @@ import requests
 from bs4 import BeautifulSoup
 from transformers import pipeline
-# Load summarization pipeline from Hugging Face
-summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 def scrape_website(url):
-    """Extracts text from a website."""
     try:
-        response = requests.get(url, timeout=10)
         soup = BeautifulSoup(response.text, "html.parser")
-        paragraphs = soup.find_all("p")
-        text = " ".join([p.get_text() for p in paragraphs])
-        return text if text else "No content found."
     except Exception as e:
-        return f"Error: {str(e)}"
 def summarize_website(url):
-    """Scrapes website and summarizes the extracted content."""
-    extracted_text = scrape_website(url)
-    if "Error:" in extracted_text or len(extracted_text.split()) < 50:
-        return "Could not extract enough text to summarize."
-    # Summarize using Hugging Face model
-    summary = summarizer(extracted_text, max_length=200, min_length=50, do_sample=False)
-    return f"**Summary:**\n\n{summary[0]['summary_text']}"
-# Gradio interface
 iface = gr.Interface(
     fn=summarize_website,
-    inputs="text",
-    outputs="markdown",
     title="AI-Powered Website Summarizer",
-    description="Enter a website URL, and this tool will summarize its content using an AI model."
 )
 iface.launch()

 from bs4 import BeautifulSoup
 from transformers import pipeline
+# Build the summarization pipeline once at module load, using the sshleifer/distilbart-cnn-12-6 checkpoint as a more lightweight model for Hugging Face Spaces
+summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 def scrape_website(url):
     """Fetch a web page and return the text of its main content elements.

     The page is downloaded with a browser-like User-Agent and a 10 second
     timeout, parsed with BeautifulSoup, and the text of every outermost
     <p>, <article>, <main> and <section> element is joined with spaces.

     Args:
         url: Address of the page to fetch (full URL, including the scheme).

     Returns:
         The extracted text, "No content found" when the page yields no
         text, or a string starting with "Scraping Error:" when the URL is
         empty, the request fails, or the response has an HTTP error status.
         Exceptions are caught and reported through the return value.
     """
     content_tags = ['p', 'article', 'main', 'section']
     try:
         # Pasted URLs often carry stray whitespace or a trailing newline.
         url = (url or "").strip()
         if not url:
             return "Scraping Error: no URL provided"

         headers = {'User-Agent': 'Mozilla/5.0'}  # Add headers to prevent 403 errors
         response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

         soup = BeautifulSoup(response.text, "html.parser")

         # Content tags are usually nested (<main> > <article> > <p>). Taking
         # the text of every match would repeat the same paragraph once per
         # enclosing tag, so keep only matches with no matching ancestor.
         outermost = [
             element
             for element in soup.find_all(content_tags)
             if element.find_parent(content_tags) is None
         ]
         text = " ".join(
             element.get_text(strip=True, separator=' ') for element in outermost
         )
         return text if text.strip() else "No content found"
     except Exception as e:
         # Deliberate catch-all: callers expect a string result, never a raise.
         return f"Scraping Error: {str(e)}"
 def summarize_website(url):
     """Scrape *url* and return a Markdown-formatted summary of its text.

     Args:
         url: Address of the page to summarize.

     Returns:
         A Markdown string beginning with "**Summary:**" on success.
         Otherwise an error message: the "Scraping Error: ..." string from
         scrape_website, an "Error: Insufficient content ..." message when
         fewer than 50 words were extracted, or "Summarization Error: ..."
         when anything else fails.
     """
     # Rough word budget for the model input. The model's input limit is
     # counted in tokens (1024 for DistilBART), not characters, so trim on
     # word boundaries here and let truncation=True below enforce the exact
     # token limit.
     max_input_words = 700
     try:
         extracted_text = scrape_website(url)

         # scrape_website reports failures with this exact prefix. A plain
         # substring test for "Error" would also reject every page whose
         # content merely mentions the word.
         if extracted_text.startswith("Scraping Error:"):
             return extracted_text

         words = extracted_text.split()

         # Check minimum text length
         if len(words) < 50:
             return "Error: Insufficient content for summarization (minimum 50 words required)"

         truncated_text = " ".join(words[:max_input_words])

         # Generate summary
         summary = summarizer(
             truncated_text,
             max_length=200,
             min_length=50,
             do_sample=False,
             truncation=True  # Hard cap at the model's maximum input length
         )
         return f"**Summary:**\n\n{summary[0]['summary_text']}"
     except Exception as e:
         return f"Summarization Error: {str(e)}"
+# Gradio interface: a single URL textbox wired to summarize_website, with the result rendered as Markdown
 iface = gr.Interface(
     fn=summarize_website,
+    inputs=gr.Textbox(label="Website URL", placeholder="Enter full URL (including https://)..."),
+    outputs=gr.Markdown(),
     title="AI-Powered Website Summarizer",
+    description="Enter a website URL to get an AI-generated summary of its content",
+    examples=[
+        ["https://en.wikipedia.org/wiki/Large_language_model"],
+        ["https://www.bbc.com/news/technology-66510295"]
+    ]
 )
 # Called at module level, so the interface is launched as soon as this file is executed or imported.
 iface.launch()