import gradio as gr
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin


# Custom HTTP Session and Response Classes
class CustomSession:
    """Thin wrapper around a urllib3 PoolManager that issues GET requests."""

    def __init__(self):
        self.pool_manager = urllib3.PoolManager()

    def get(self, url):
        response = self.pool_manager.request('GET', url)
        return CustomResponse(response)


class CustomResponse:
    """Adapts a urllib3 response to expose status, headers, body, and a parsed soup."""

    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def soup(self):
        return BeautifulSoup(self.content, 'lxml')


def get(url):
    """Convenience helper: fetch a URL with a fresh CustomSession."""
    session = CustomSession()
    return session.get(url)


# Utility Functions
def extract_texts(soup, title):
    """Extracts all text content from the soup, excluding the title."""
    texts = list(soup.stripped_strings)
    # Remove the title from the texts if it exists
    if title in texts:
        texts.remove(title)
    return texts


def extract_links(soup, base_url):
    """Extracts all valid links from the soup."""
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Resolve relative URLs against the page URL; keep absolute URLs as-is
        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
        link_text = link.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links


def extract_images(soup, base_url):
    """Extracts all valid image URLs and their alt text from the soup."""
    images = []
    for img in soup.find_all('img', src=True):
        img_url = img['src']
        # Resolve relative image URLs against the page URL; keep absolute URLs as-is
        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images


def format_detailed_output(structured_data, title):
    """Formats the structured data into a Markdown string."""
    result = f"### Title\n\n{title}\n\n"
    result += "### Texts\n\n"
    result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
    result += "\n\n### Links\n\n"
    if structured_data["Links"]:
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"])
    else:
        result += "No links found."
    result += "\n\n### Images\n\n"
    if structured_data["Images"]:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"])
    else:
        result += "No images found."
    return result


# Web Page Processing Function
def download_and_process_web_page(url):
    """Downloads a web page and returns the structured content."""
    if not url.startswith("http://") and not url.startswith("https://"):
        url = "http://" + url  # Prepend "http://" if no scheme is present

    try:
        response = get(url)
        soup = response.soup()

        # Extract the page title, guarding against a missing or empty <title> tag
        title = soup.title.string.strip() if soup.title and soup.title.string else "No Title Found"

        structured_data = {
            "Texts": extract_texts(soup, title),
            "Links": extract_links(soup, url),
            "Images": extract_images(soup, url)
        }
        return format_detailed_output(structured_data, title)
    except urllib3.exceptions.HTTPError as e:
        return f"Error: {e}"
    except Exception as e:
        return f"Error processing web page: {e}"


# Gradio Interface
iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
    outputs=gr.Markdown(label="Web Page Content"),
    title="Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools."
)

# Launch the interface without sharing
iface.launch()