import gradio as gr
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin


class CustomSession:
    """Minimal requests-like session backed by a urllib3 connection pool."""

    def __init__(self):
        self.pool_manager = urllib3.PoolManager()

    def get(self, url):
        # Issue a GET request and wrap the raw urllib3 response.
        response = self.pool_manager.request('GET', url)
        return CustomResponse(response)


class CustomResponse:
    """Wraps a urllib3 response in a requests-like interface with a soup() helper."""

    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def soup(self):
        # Prefer the faster lxml parser; fall back to the stdlib parser
        # when lxml is not installed.
        try:
            return BeautifulSoup(self.content, 'lxml')
        except Exception:
            return BeautifulSoup(self.content, 'html.parser')


def get(url):
    """requests-style convenience wrapper: fetch a URL with a fresh session."""
    session = CustomSession()
    return session.get(url)
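
# A quick sketch of how the wrapper is used (example.com is purely
# illustrative; assumes network access):
#
#   resp = get("https://example.com")
#   resp.status_code          # e.g. 200
#   resp.soup().title.string  # e.g. "Example Domain"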


def extract_texts(soup, title):
    """Extracts all text content from the soup, excluding the title."""
    texts = list(soup.stripped_strings)
    if title in texts:
        texts.remove(title)
    return texts


def extract_links(soup, base_url):
    """Extracts all links from the soup, resolving relative URLs against base_url."""
    links = []
    for link in soup.find_all('a', href=True):
        # urljoin leaves absolute URLs untouched, so no scheme check is needed.
        full_url = urljoin(base_url, link['href'])
        link_text = link.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links


def extract_images(soup, base_url):
    """Extracts all image URLs and their alt text from the soup."""
    images = []
    for img in soup.find_all('img', src=True):
        # As above, urljoin resolves relative src values safely.
        full_img_url = urljoin(base_url, img['src'])
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images
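
# Illustrative return shape (values made up):
#   [{"Alt Text": "Site logo", "Image URL": "https://example.com/logo.png"}, ...]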


def format_detailed_output(structured_data, title):
    """Formats the structured data into a Markdown string."""
    result = f"### Title\n\n{title}\n\n"
    result += "### Texts\n\n"
    result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
    result += "\n\n### Links\n\n"
    if structured_data["Links"]:
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"])
    else:
        result += "No links found."
    result += "\n\n### Images\n\n"
    if structured_data["Images"]:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"])
    else:
        result += "No images found."
    return result
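
# Sketch of the Markdown this produces (contents are illustrative):
#
#   ### Title
#
#   Example Domain
#
#   ### Texts
#
#   This domain is for use in illustrative examples ...
#
#   ### Links
#
#   [More information...](https://www.iana.org/domains/example)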


def download_and_process_web_page(url):
    """Downloads a web page and returns its structured content as Markdown."""
    # Default to http:// when the user omits a scheme.
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    try:
        response = get(url)
        soup = response.soup()

        # soup.title.string is None for an empty <title>, so guard both cases.
        title = soup.title.string if soup.title and soup.title.string else "No Title Found"

        structured_data = {
            "Texts": extract_texts(soup, title),
            "Links": extract_links(soup, url),
            "Images": extract_images(soup, url)
        }
        return format_detailed_output(structured_data, title)

    except urllib3.exceptions.HTTPError as e:
        return f"Error: {e}"
    except Exception as e:
        return f"Error processing web page: {e}"


iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
    outputs=gr.Markdown(label="Web Page Content"),
    title="Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools."
)

# Launch only when the script is run directly, not when it is imported.
if __name__ == "__main__":
    iface.launch()