adithya747 committed
Commit 4a1e457 · verified · 1 Parent(s): f175faf

Update app.py

Files changed (1):
  1. app.py +43 -93
app.py CHANGED
@@ -3,20 +3,23 @@ import requests
 from bs4 import BeautifulSoup
 from transformers import pipeline
 
-# Load summarization pipeline
+# Use a more lightweight model for Hugging Face Spaces
 summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 
 def scrape_website(url):
     """Extracts text from a website with error handling"""
     try:
-        headers = {'User-Agent': 'Mozilla/5.0'}
+        headers = {'User-Agent': 'Mozilla/5.0'}  # Add headers to prevent 403 errors
         response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()
+        response.raise_for_status()  # Raise HTTP errors
 
         soup = BeautifulSoup(response.text, "html.parser")
+
+        # Extract text from common content-containing tags
         text_elements = soup.find_all(['p', 'article', 'main', 'section'])
         text = " ".join([e.get_text(strip=True, separator=' ') for e in text_elements])
-        return text.strip() if text.strip() else "No content found"
+
+        return text if text.strip() else "No content found"
 
     except Exception as e:
         return f"Scraping Error: {str(e)}"
@@ -24,97 +27,44 @@ def scrape_website(url):
 def summarize_website(url):
     """Handles the full summarization pipeline"""
     try:
-        with gr.Column(variant="panel"):
-            gr.Markdown("## ⚡ Processing...")
-
-        extracted_text = scrape_website(url)
-
-        if "Error" in extracted_text:
-            return f"❌ {extracted_text}"
-
-        if len(extracted_text.split()) < 50:
-            return "⚠️ Error: Insufficient content for summarization (minimum 50 words required)"
-
-        max_input_length = 1000
-        truncated_text = extracted_text[:max_input_length]
-
-        summary = summarizer(
-            truncated_text,
-            max_length=200,
-            min_length=50,
-            do_sample=False,
-            truncation=True
-        )
-
-        return f"## 📝 Summary\n\n{summary[0]['summary_text']}"
+        extracted_text = scrape_website(url)
+
+        if "Error" in extracted_text:
+            return extracted_text
+
+        # Check minimum text length
+        if len(extracted_text.split()) < 50:
+            return "Error: Insufficient content for summarization (minimum 50 words required)"
+
+        # Truncate text to model's max input length (1024 tokens for DistilBART)
+        max_input_length = 1000  # Conservative estimate for token count
+        truncated_text = extracted_text[:max_input_length]
+
+        # Generate summary
+        summary = summarizer(
+            truncated_text,
+            max_length=200,
+            min_length=50,
+            do_sample=False,
+            truncation=True  # Ensure truncation is enabled
+        )
+
+        return f"**Summary:**\n\n{summary[0]['summary_text']}"
 
     except Exception as e:
-        return f"⛔ Summarization Error: {str(e)}"
-
-# Custom CSS for mobile optimization
-css = """
-@media screen and (max-width: 600px) {
-    .container {
-        padding: 10px !important;
-    }
-    .input-box textarea {
-        font-size: 16px !important;
-    }
-}
-"""
-
-# Mobile-optimized interface with Blocks API
-with gr.Blocks(theme=gr.themes.Soft(), css=css, title="Website Summarizer") as app:
-    gr.Markdown("# 🌐 AI Website Summarizer")
-    gr.Markdown("Paste any website URL below to get an instant AI-powered summary!")
-
-    with gr.Row():
-        url_input = gr.Textbox(
-            label="Website URL",
-            placeholder="Enter full URL (https://...)",
-            lines=1,
-            max_lines=1,
-            elem_id="input-box"
-        )
-
-    with gr.Row():
-        submit_btn = gr.Button("Generate Summary 🚀", variant="primary")
-        clear_btn = gr.Button("Clear 🔄")
-
-    output = gr.Markdown()
-
-    # Example section
-    gr.Examples(
-        examples=[
-            ["https://en.wikipedia.org/wiki/Large_language_model"],
-            ["https://www.bbc.com/news/technology-66510295"]
-        ],
-        inputs=url_input,
-        label="Try these examples:",
-        examples_per_page=2
-    )
-
-    # Progress indicator
-    progress = gr.Textbox(visible=False)
-
-    # Event handlers
-    submit_btn.click(
-        fn=summarize_website,
-        inputs=url_input,
-        outputs=output,
-        api_name="summarize"
-    )
-
-    clear_btn.click(
-        fn=lambda: ("", ""),
-        inputs=None,
-        outputs=[url_input, output],
-        queue=False
-    )
-
-# Mobile-friendly configuration
-app.launch(
-    server_name="0.0.0.0",
-    server_port=7860,
-    favicon_path="https://www.svgrepo.com/show/355037/huggingface.svg"
-)
+        return f"Summarization Error: {str(e)}"
+
+# Gradio interface with improved configuration
+iface = gr.Interface(
+    fn=summarize_website,
+    inputs=gr.Textbox(label="Website URL", placeholder="Enter full URL (including https://)..."),
+    outputs=gr.Markdown(),
+    title="AI-Powered Website Summarizer",
+    description="Enter a website URL to get an AI-generated summary of its content",
+    examples=[
+        ["https://en.wikipedia.org/wiki/Large_language_model"],
+        ["https://www.bbc.com/news/technology-66510295"]
+    ]
+)
+
+iface.launch()
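Note on the truncation step: the committed code slices the scraped text to 1000 characters, while the comment refers to DistilBART's 1024-token window; 1000 characters is usually only a few hundred tokens, so much of the model's input window goes unused on long pages. A minimal token-aware sketch, not part of this commit (truncate_to_model_limit is a hypothetical helper that reuses the pipeline's own tokenizer):

from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def truncate_to_model_limit(text, max_tokens=1024):
    # Hypothetical helper: encode with the pipeline's tokenizer, cut at the
    # model's window, then decode back to plain text before summarizing.
    tokenizer = summarizer.tokenizer
    ids = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(ids, skip_special_tokens=True)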
 
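The new summarize_website detects scraping failures by checking whether the returned text contains the substring "Error", so a page whose own content includes that word would be misreported as a failure. One possible alternative, sketched here only for illustration (scrape_website_checked is a hypothetical name, not part of this commit), is to return an explicit status flag and branch on that instead:

import requests
from bs4 import BeautifulSoup

def scrape_website_checked(url):
    """Returns (ok, payload): the extracted text on success, else an error message."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        elements = soup.find_all(['p', 'article', 'main', 'section'])
        text = " ".join(e.get_text(strip=True, separator=' ') for e in elements)
        return (True, text) if text.strip() else (False, "No content found")
    except Exception as e:
        # Any network or parsing problem is reported via the flag, not the text
        return False, f"Scraping Error: {e}"

The caller can then check the boolean instead of scanning the message string.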