""" Web scraping and processing utilities. """ from typing import Dict, Any, List, Optional import requests from bs4 import BeautifulSoup import re from urllib.parse import urlparse, urljoin from tenacity import retry, stop_after_attempt, wait_exponential class WebUtils: def __init__(self): self.session = requests.Session() self.session.headers.update({ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" }) @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) async def fetch_url(self, url: str, timeout: int = 10) -> Optional[str]: """Fetch content from a URL.""" try: response = self.session.get(url, timeout=timeout) response.raise_for_status() return response.text except Exception as e: print(f"Error fetching {url}: {e}") return None def extract_text(self, html: str) -> str: """Extract clean text from HTML content.""" soup = BeautifulSoup(html, "html.parser") # Remove unwanted elements for element in soup(["script", "style", "nav", "footer", "header"]): element.decompose() # Get text and clean it text = soup.get_text(separator="\n", strip=True) # Remove excessive newlines text = re.sub(r"\n\s*\n", "\n\n", text) return text.strip() def extract_metadata(self, html: str, url: str) -> Dict[str, Any]: """Extract metadata from HTML content.""" soup = BeautifulSoup(html, "html.parser") metadata = { "url": url, "title": None, "description": None, "keywords": None, "author": None, "published_date": None } # Extract title metadata["title"] = ( soup.title.string if soup.title else None ) # Extract meta tags meta_tags = soup.find_all("meta") for tag in meta_tags: # Description if tag.get("name", "").lower() == "description": metadata["description"] = tag.get("content") # Keywords elif tag.get("name", "").lower() == "keywords": metadata["keywords"] = tag.get("content") # Author elif tag.get("name", "").lower() == "author": metadata["author"] = tag.get("content") # Published date elif tag.get("name", "").lower() in ["published_time", "publication_date"]: metadata["published_date"] = tag.get("content") return metadata def extract_links(self, html: str, base_url: str) -> List[str]: """Extract all links from HTML content.""" soup = BeautifulSoup(html, "html.parser") links = [] for link in soup.find_all("a"): href = link.get("href") if href: # Convert relative URLs to absolute absolute_url = urljoin(base_url, href) # Only include http(s) URLs if absolute_url.startswith(("http://", "https://")): links.append(absolute_url) return list(set(links)) # Remove duplicates def is_valid_url(self, url: str) -> bool: """Check if a URL is valid.""" try: result = urlparse(url) return all([result.scheme, result.netloc]) except Exception: return False def clean_url(self, url: str) -> str: """Clean and normalize a URL.""" # Remove tracking parameters parsed = urlparse(url) path = parsed.path # Remove common tracking parameters query_params = [] if parsed.query: for param in parsed.query.split("&"): if "=" in param: key = param.split("=")[0].lower() if not any(track in key for track in ["utm_", "ref_", "source", "campaign"]): query_params.append(param) # Rebuild URL clean_url = f"{parsed.scheme}://{parsed.netloc}{path}" if query_params: clean_url += "?" + "&".join(query_params) return clean_url