from curl_cffi import requests as req
from bs4 import BeautifulSoup
import logging
from typing import Union, List, Dict, Optional
from urllib.parse import urljoin, urlparse

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ScrapingError(Exception):
    """Custom exception for scraping errors"""
    pass


def validate_url(url: str) -> bool:
    """Validate if the given URL is properly formatted"""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


def clean_url(url: str) -> str:
    """Clean and normalize URL (e.g. protocol-relative '//host/img.png')"""
    if url.startswith('//'):
        return f'https:{url}'
    return url


def scrape_html(url: str) -> Union[str, Dict[str, str]]:
    """
    Fetch HTML content from a URL with improved error handling

    Args:
        url (str): The URL to scrape

    Returns:
        str: HTML content if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}

        response = req.get(
            url,
            impersonate='chrome110',
            timeout=30,
            max_redirects=5
        )

        # Check if response is HTML
        content_type = response.headers.get('content-type', '').lower()
        if 'text/html' not in content_type:
            return {"error": f"Unexpected content type: {content_type}"}

        return response.text

    except Exception as e:
        logger.error(f"Unexpected error while scraping {url}: {str(e)}")
        return {"error": f"Unexpected error: {str(e)}"}


def scrape_images(data: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Extract image URLs from HTML content with improved filtering and validation

    Args:
        data (str): HTML content
        filter (str): Optional filter string for URLs

    Returns:
        list: List of image URLs if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}

        soup = BeautifulSoup(data, 'html.parser')
        images = []

        # Look for img tags (including lazy-loaded data-src attributes)
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                src = clean_url(src)
                if validate_url(src) and (not filter or filter.lower() in src.lower()):
                    images.append(src)

        # Look for background images in style attributes
        for elem in soup.find_all(style=True):
            style = elem['style']
            if 'background-image' in style:
                url_start = style.find('url(') + 4
                url_end = style.find(')', url_start)
                # find() returns -1 when 'url(' is absent, so url_start > 3 means it was found
                if url_start > 3 and url_end != -1:
                    src = style[url_start:url_end].strip('"\'')
                    src = clean_url(src)
                    if validate_url(src) and (not filter or filter.lower() in src.lower()):
                        images.append(src)

        return list(set(images))  # Remove duplicates

    except Exception as e:
        logger.error(f"Error extracting images: {str(e)}")
        return {"error": f"Failed to extract images: {str(e)}"}


def scrape_links(url: str, filter: str = "") -> Union[List[str], Dict[str, str]]:
    """
    Extract links from a webpage with improved validation and error handling

    Args:
        url (str): URL to scrape
        filter (str): Optional filter for links

    Returns:
        list: List of links if successful
        dict: Error information if failed
    """
    try:
        if not validate_url(url):
            return {"error": "Invalid URL format"}

        logger.info(f"Scraping links from {url}")
        response = req.get(url, impersonate='chrome110', timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        base_url = url

        try:
            for a in soup.find_all('a', href=True):
                href = a['href']
                # Convert relative URLs to absolute
                full_url = urljoin(base_url, href)
                if validate_url(full_url) and (not filter or filter.lower() in full_url.lower()):
                    links.append(full_url)

            return list(set(links))  # Remove duplicates

        except Exception as e:
            logger.error(f"Error processing links: {str(e)}")
            return {"error": f"Failed to process links: {str(e)}"}

    except Exception as e:
        logger.error(f"Error extracting links: {str(e)}")
        return {"error": f"Failed to extract links: {str(e)}"}


def scrape_text(data: str) -> Union[str, Dict[str, str]]:
    """
    Extract clean text content from HTML

    Args:
        data (str): HTML content

    Returns:
        str: Extracted text if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}

        soup = BeautifulSoup(data, 'html.parser')

        # Remove script and style elements
        for element in soup(['script', 'style', 'head']):
            element.decompose()

        # Get text and clean it
        text = soup.get_text(separator='\n')
        # Remove excessive newlines and whitespace
        text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())

        return text

    except Exception as e:
        logger.error(f"Error extracting text: {str(e)}")
        return {"error": f"Failed to extract text: {str(e)}"}


def scrape_div(data: str, div: str) -> Union[List[Dict[str, str]], Dict[str, str]]:
    """
    Extract content from specific div elements

    Args:
        data (str): HTML content
        div (str): Class or ID of the div to scrape

    Returns:
        list: List of div contents (text and HTML) if successful
        dict: Error information if failed
    """
    try:
        if not data:
            return {"error": "No HTML content provided"}
        if not div:
            return {"error": "No div selector provided"}

        soup = BeautifulSoup(data, 'html.parser')
        results = []

        # Try class first
        elements = soup.find_all(class_=div)
        if not elements:
            # Try ID if no class found
            elements = soup.find_all(id=div)

        if not elements:
            return {"error": f"No elements found with class or ID: {div}"}

        for element in elements:
            # Get both text and HTML content
            content = {
                "text": element.get_text(strip=True),
                "html": str(element)
            }
            results.append(content)

        return results

    except Exception as e:
        logger.error(f"Error extracting div content: {str(e)}")
        return {"error": f"Failed to extract div content: {str(e)}"}


def scrape_metadata(data: str) -> Dict[str, str]:
    """
    Extract metadata from HTML meta tags

    Args:
        data (str): HTML content

    Returns:
        dict: Mapping of meta tag name/property to its content
    """
    soup = BeautifulSoup(data, 'html.parser')
    metadata = {}
    for meta in soup.find_all('meta'):
        name = meta.get('name') or meta.get('property')
        content = meta.get('content')
        if name and content:
            metadata[name] = content
    return metadata


def scrape_tables(data: str) -> List[List[List[str]]]:
    """
    Extract table data from HTML

    Args:
        data (str): HTML content

    Returns:
        list: One entry per table, each a list of rows of cell text
    """
    soup = BeautifulSoup(data, 'html.parser')
    tables = []
    for table in soup.find_all('table'):
        rows = []
        for row in table.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            rows.append(cells)
        tables.append(rows)
    return tables