import gradio as gr
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin


class CustomSession:
    """Minimal requests-like session backed by a urllib3 connection pool."""

    def __init__(self):
        self.pool_manager = urllib3.PoolManager()

    def get(self, url):
        # Issue a GET request and wrap the raw urllib3 response.
        response = self.pool_manager.request('GET', url)
        return CustomResponse(response)


class CustomResponse:
    """Wraps a urllib3 response in a requests-like interface with a soup() helper."""

    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def soup(self):
        # Prefer the faster lxml parser; fall back to the stdlib parser
        # when lxml is not installed.
        try:
            return BeautifulSoup(self.content, 'lxml')
        except Exception:
            return BeautifulSoup(self.content, 'html.parser')


def get(url):
    """requests-style convenience wrapper: fetch a URL with a fresh session."""
    session = CustomSession()
    return session.get(url)
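
# A quick sketch of how the wrapper is used (example.com is purely
# illustrative; assumes network access):
#
#   resp = get("https://example.com")
#   resp.status_code          # e.g. 200
#   resp.soup().title.string  # e.g. "Example Domain"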


def extract_texts(soup, title):
    """Extracts all text content from the soup, excluding the title."""
    texts = list(soup.stripped_strings)
    if title in texts:
        texts.remove(title)
    return texts


def extract_links(soup, base_url):
    """Extracts all links from the soup, resolving relative URLs against base_url."""
    links = []
    for link in soup.find_all('a', href=True):
        # urljoin leaves absolute URLs untouched, so no scheme check is needed.
        full_url = urljoin(base_url, link['href'])
        link_text = link.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links


def extract_images(soup, base_url):
    """Extracts all image URLs and their alt text from the soup."""
    images = []
    for img in soup.find_all('img', src=True):
        # As above, urljoin resolves relative src values safely.
        full_img_url = urljoin(base_url, img['src'])
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images
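
# Illustrative return shape (values made up):
#   [{"Alt Text": "Site logo", "Image URL": "https://example.com/logo.png"}, ...]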


def format_detailed_output(structured_data, title):
    """Formats the structured data into a Markdown string."""
    result = f"### Title\n\n{title}\n\n"
    result += "### Texts\n\n"
    result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
    result += "\n\n### Links\n\n"
    if structured_data["Links"]:
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"])
    else:
        result += "No links found."
    result += "\n\n### Images\n\n"
    if structured_data["Images"]:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"])
    else:
        result += "No images found."
    return result
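
# Sketch of the Markdown this produces (contents are illustrative):
#
#   ### Title
#
#   Example Domain
#
#   ### Texts
#
#   This domain is for use in illustrative examples ...
#
#   ### Links
#
#   [More information...](https://www.iana.org/domains/example)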


def download_and_process_web_page(url):
    """Downloads a web page and returns its structured content as Markdown."""
    # Default to http:// when the user omits a scheme.
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    try:
        response = get(url)
        soup = response.soup()

        # soup.title.string is None for an empty <title>, so guard both cases.
        title = soup.title.string if soup.title and soup.title.string else "No Title Found"

        structured_data = {
            "Texts": extract_texts(soup, title),
            "Links": extract_links(soup, url),
            "Images": extract_images(soup, url)
        }
        return format_detailed_output(structured_data, title)

    except urllib3.exceptions.HTTPError as e:
        return f"Error: {e}"
    except Exception as e:
        return f"Error processing web page: {e}"


iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
    outputs=gr.Markdown(label="Web Page Content"),
    title="Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools."
)

# Launch only when the script is run directly, not when it is imported.
if __name__ == "__main__":
    iface.launch()