import gradio as gr
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Custom HTTP Session and Response Classes
class CustomSession:
    """Minimal requests-like session backed by a urllib3 PoolManager."""

    def __init__(self):
        self.pool_manager = urllib3.PoolManager()

    def get(self, url):
        response = self.pool_manager.request('GET', url)
        return CustomResponse(response)


class CustomResponse:
    """Wraps a urllib3 response, exposing status, headers, body, and a parsed soup."""

    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def soup(self):
        return BeautifulSoup(self.content, 'lxml')


def get(url):
    """Fetches a URL with a fresh CustomSession and returns a CustomResponse."""
    session = CustomSession()
    return session.get(url)
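
# A minimal usage sketch for the wrapper above (commented out, not executed here);
# it assumes the target URL is reachable and returns HTML:
#
#   resp = get("https://example.com")
#   print(resp.status_code)           # e.g. 200
#   print(resp.soup().title.string)   # e.g. "Example Domain"
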
# Utility Functions
def extract_texts(soup, title):
    """Extracts all text content from the soup, excluding the title."""
    texts = list(soup.stripped_strings)
    # Remove the title from the texts if it exists
    if title in texts:
        texts.remove(title)
    return texts

def extract_links(soup, base_url):
    """Extracts all valid links from the soup."""
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
        link_text = link.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links

def extract_images(soup, base_url):
    """Extracts all valid image URLs and their alt text from the soup."""
    images = []
    for img in soup.find_all('img', src=True):
        img_url = img['src']
        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images

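# Shape of the extractor results (illustrative values only, not real output):
#
#   extract_links(soup, "https://example.com")
#   # -> [{"Text": "More information...", "URL": "https://www.iana.org/domains/example"}]
#
#   extract_images(soup, "https://example.com")
#   # -> [{"Alt Text": "No Alt Text", "Image URL": "https://example.com/logo.png"}]
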
def format_detailed_output(structured_data, title):
    """Formats the structured data into a Markdown string."""
    result = f"### Title\n\n{title}\n\n"
    result += "### Texts\n\n"
    result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
    result += "\n\n### Links\n\n"
    if structured_data["Links"]:
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"])
    else:
        result += "No links found."
    result += "\n\n### Images\n\n"
    if structured_data["Images"]:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"])
    else:
        result += "No images found."
    return result

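# Illustrative shape of the resulting Markdown (values are made up):
#
#   ### Title
#
#   Example Domain
#
#   ### Texts
#
#   This domain is for use in illustrative examples ...
#
#   ### Links
#
#   [More information...](https://www.iana.org/domains/example)
#
#   ### Images
#
#   No images found.
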
# Web Page Processing Function
def download_and_process_web_page(url):
    """Downloads a web page and returns its structured content as Markdown."""
    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # Prepend "http://" if no scheme is present
    try:
        response = get(url)
        soup = response.soup()
        # Extract the title, guarding against a missing or empty <title> tag
        title = soup.title.string.strip() if soup.title and soup.title.string else "No Title Found"
        structured_data = {
            "Texts": extract_texts(soup, title),
            "Links": extract_links(soup, url),
            "Images": extract_images(soup, url)
        }
        return format_detailed_output(structured_data, title)
    except urllib3.exceptions.HTTPError as e:
        return f"Error: {e}"
    except Exception as e:
        return f"Error processing web page: {e}"

# Gradio Interface
iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
    outputs=gr.Markdown(label="Web Page Content"),
    title="Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools."
)

# Launch the interface without sharing
iface.launch()
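
# Note: Gradio can expose a temporary public URL via iface.launch(share=True);
# the default is kept here so the tool stays local-only.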