adithya747 committed on
Commit
501ba1b
·
verified ·
1 Parent(s): 34cb312

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -21
app.py CHANGED
@@ -3,38 +3,68 @@ import requests
3
  from bs4 import BeautifulSoup
4
  from transformers import pipeline
5
 
6
# Load summarization pipeline from Hugging Face
# NOTE(review): bart-large-cnn is a large checkpoint downloaded and loaded at
# import time, so app startup blocks until the model is ready — confirm the
# hosting environment (e.g. a CPU-only Space) can afford this.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
8
 
9
def scrape_website(url):
    """Extract visible paragraph text from a web page.

    Parameters
    ----------
    url : str
        Full URL (including scheme) of the page to fetch.

    Returns
    -------
    str
        Concatenated <p> text, "No content found." when the page has no
        paragraph text, or an "Error: ..." message on any failure.
    """
    try:
        # A browser-like User-Agent avoids 403 responses from many sites.
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        # Surface HTTP 4xx/5xx as errors instead of summarizing error pages.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = " ".join(p.get_text() for p in paragraphs)
        return text if text else "No content found."
    except Exception as e:
        # Callers (summarize_website) detect failures via the "Error:" prefix.
        return f"Error: {str(e)}"
19
 
20
def summarize_website(url):
    """Scrape a website and summarize the extracted content.

    Parameters
    ----------
    url : str
        URL to scrape and summarize.

    Returns
    -------
    str
        Markdown-formatted summary, or an explanatory message when the page
        could not be scraped or contains too little text.
    """
    extracted_text = scrape_website(url)

    # scrape_website signals failure with an "Error:" prefix; check the
    # prefix only, so pages that merely contain the word "Error:" somewhere
    # in their body text are not rejected.
    if extracted_text.startswith("Error:") or len(extracted_text.split()) < 50:
        return "Could not extract enough text to summarize."

    # Summarize using Hugging Face model. BART-style models accept at most
    # ~1024 tokens; truncation=True makes the tokenizer clip long pages
    # instead of raising on oversized input.
    summary = summarizer(
        extracted_text,
        max_length=200,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return f"**Summary:**\n\n{summary[0]['summary_text']}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
# Gradio interface: a single URL textbox in, rendered markdown summary out.
# "text" / "markdown" are Gradio's string shortcuts for Textbox / Markdown
# components.
iface = gr.Interface(
    fn=summarize_website,
    inputs="text",
    outputs="markdown",
    title="AI-Powered Website Summarizer",
    description="Enter a website URL, and this tool will summarize its content using an AI model."
)

# Start the Gradio server (blocks until the app is stopped).
iface.launch()
 
3
  from bs4 import BeautifulSoup
4
  from transformers import pipeline
5
 
6
# Use a more lightweight model for Hugging Face Spaces:
# distilbart-cnn-12-6 is a distilled BART summarizer, smaller and faster on
# CPU than bart-large-cnn. Loaded once at import so requests don't reload it.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
8
 
9
def scrape_website(url):
    """Extracts text from a website with error handling.

    Parameters
    ----------
    url : str
        Full URL (including scheme) of the page to fetch.

    Returns
    -------
    str
        Extracted page text, "No content found" when nothing was extracted,
        or a "Scraping Error: ..." message on any failure.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}  # Add headers to prevent 403 errors
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise HTTP errors

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract text from common content-containing tags. Keep only
        # elements that are NOT nested inside another matched tag (e.g. a
        # <p> inside an <article>), otherwise the same text would be
        # collected twice — once via the container and once via the child.
        content_tags = ['p', 'article', 'main', 'section']
        text_elements = [
            el for el in soup.find_all(content_tags)
            if el.find_parent(content_tags) is None
        ]
        text = " ".join(el.get_text(strip=True, separator=' ') for el in text_elements)

        return text if text.strip() else "No content found"

    except Exception as e:
        # Callers detect failures via the "Scraping Error:" prefix.
        return f"Scraping Error: {str(e)}"
26
 
27
def summarize_website(url):
    """Handles the full summarization pipeline.

    Parameters
    ----------
    url : str
        URL to scrape and summarize.

    Returns
    -------
    str
        Markdown summary on success, otherwise an error message describing
        the scraping or summarization failure.
    """
    try:
        extracted_text = scrape_website(url)

        # scrape_website reports failures with a "Scraping Error:" prefix.
        # Match the prefix only — a substring test would wrongly reject any
        # page whose body text merely contains the word "Error".
        if extracted_text.startswith("Scraping Error:"):
            return extracted_text

        # Check minimum text length
        if len(extracted_text.split()) < 50:
            return "Error: Insufficient content for summarization (minimum 50 words required)"

        # Pre-trim the text handed to the tokenizer. DistilBART accepts at
        # most 1024 tokens, roughly 4000 characters of English text (~4
        # chars per token); truncation=True below enforces the exact token
        # limit, so this slice only bounds tokenizer work.
        max_input_length = 4000
        truncated_text = extracted_text[:max_input_length]

        # Generate summary
        summary = summarizer(
            truncated_text,
            max_length=200,
            min_length=50,
            do_sample=False,
            truncation=True  # Ensure token-level truncation is enabled
        )

        return f"**Summary:**\n\n{summary[0]['summary_text']}"

    except Exception as e:
        return f"Summarization Error: {str(e)}"
56
 
57
# Gradio interface with improved configuration: explicit Textbox/Markdown
# components instead of string shortcuts, plus clickable example URLs.
# NOTE(review): examples are fetched live when clicked (no cache_examples),
# so they hit the network — confirm that is intended.
iface = gr.Interface(
    fn=summarize_website,
    inputs=gr.Textbox(label="Website URL", placeholder="Enter full URL (including https://)..."),
    outputs=gr.Markdown(),
    title="AI-Powered Website Summarizer",
    description="Enter a website URL to get an AI-generated summary of its content",
    examples=[
        ["https://en.wikipedia.org/wiki/Large_language_model"],
        ["https://www.bbc.com/news/technology-66510295"]
    ]
)

# Start the Gradio server (blocks until the app is stopped).
iface.launch()