""" | |
Web scraping and processing utilities. | |
""" | |
from typing import Dict, Any, List, Optional | |
import requests | |
from bs4 import BeautifulSoup | |
import re | |
from urllib.parse import urlparse, urljoin | |
from tenacity import retry, stop_after_attempt, wait_exponential | |
class WebUtils: | |
def __init__(self): | |
self.session = requests.Session() | |
self.session.headers.update({ | |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" | |
}) | |
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    def _get(self, url: str, timeout: int) -> str:
        """Issue a GET request, retrying with exponential backoff on transient failures."""
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text

    async def fetch_url(self, url: str, timeout: int = 10) -> Optional[str]:
        """Fetch content from a URL, or return None if every retry fails."""
        try:
            # Note: requests is blocking, so this coroutine does not yield while the request runs.
            return self._get(url, timeout)
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None
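    # Illustrative usage (hypothetical caller, not part of the original module):
    # fetch_url is a coroutine, so it must be awaited or run in an event loop, e.g.
    #     utils = WebUtils()
    #     html = asyncio.run(utils.fetch_url("https://example.com"))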
    def extract_text(self, html: str) -> str:
        """Extract clean text from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        # Get text and clean it
        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank lines into a single blank line
        text = re.sub(r"\n\s*\n", "\n\n", text)
        return text.strip()
    def extract_metadata(self, html: str, url: str) -> Dict[str, Any]:
        """Extract metadata from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        metadata = {
            "url": url,
            "title": None,
            "description": None,
            "keywords": None,
            "author": None,
            "published_date": None
        }
        # Extract title
        metadata["title"] = soup.title.string if soup.title else None
        # Extract meta tags
        for tag in soup.find_all("meta"):
            name = tag.get("name", "").lower()
            if name == "description":
                metadata["description"] = tag.get("content")
            elif name == "keywords":
                metadata["keywords"] = tag.get("content")
            elif name == "author":
                metadata["author"] = tag.get("content")
            elif name in ("published_time", "publication_date"):
                metadata["published_date"] = tag.get("content")
        return metadata
    def extract_links(self, html: str, base_url: str) -> List[str]:
        """Extract all links from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        links = []
        for link in soup.find_all("a"):
            href = link.get("href")
            if href:
                # Convert relative URLs to absolute
                absolute_url = urljoin(base_url, href)
                # Only include http(s) URLs
                if absolute_url.startswith(("http://", "https://")):
                    links.append(absolute_url)
        return list(set(links))  # Remove duplicates
    def is_valid_url(self, url: str) -> bool:
        """Check whether a URL has both a scheme and a network location."""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False
    def clean_url(self, url: str) -> str:
        """Clean and normalize a URL by stripping common tracking parameters."""
        parsed = urlparse(url)
        path = parsed.path
        # Keep only query parameters that do not look like tracking parameters
        query_params = []
        if parsed.query:
            for param in parsed.query.split("&"):
                if "=" in param:
                    key = param.split("=")[0].lower()
                    if not any(track in key for track in ["utm_", "ref_", "source", "campaign"]):
                        query_params.append(param)
        # Rebuild the URL without the filtered parameters (fragments are dropped)
        cleaned_url = f"{parsed.scheme}://{parsed.netloc}{path}"
        if query_params:
            cleaned_url += "?" + "&".join(query_params)
        return cleaned_url
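
# Minimal offline sketch of how these helpers might be exercised. The HTML snippet
# and URLs below are made up for illustration and are not part of the original module;
# only the methods that need no network access are called here.
if __name__ == "__main__":
    utils = WebUtils()
    sample_html = """
    <html>
      <head>
        <title>Example Page</title>
        <meta name="description" content="A small demo page.">
        <meta name="author" content="Jane Doe">
      </head>
      <body>
        <script>console.log("ignored");</script>
        <p>Hello, world.</p>
        <a href="/docs">Docs</a>
        <a href="https://example.com/about?utm_source=newsletter&amp;page=2">About</a>
      </body>
    </html>
    """
    print(utils.extract_text(sample_html))
    print(utils.extract_metadata(sample_html, "https://example.com"))
    print(utils.extract_links(sample_html, "https://example.com"))
    # Tracking parameters are stripped while ordinary ones are kept.
    print(utils.clean_url("https://example.com/about?utm_source=newsletter&page=2"))
    print(utils.is_valid_url("not a url"))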