from typing import Dict, List, Any, Optional
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import ddg
from transformers import pipeline
from langchain.embeddings import HuggingFaceEmbeddings


class ModelManager:
    """Manages AI models for text processing."""

    def __init__(self):
        # Initialize with smaller, CPU-friendly models
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-base",
            device=-1  # Use CPU
        )
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def generate_summary(self, text: str, max_length: int = 150) -> str:
        """Generate a concise summary of the text."""
        # Short texts don't need summarization
        if not text or len(text.split()) < 50:
            return text
        try:
            summary = self.summarizer(
                text,
                max_length=max_length,
                min_length=30,
                do_sample=False
            )[0]['summary_text']
            return summary
        except Exception as e:
            print(f"Error in summarization: {e}")
            return text[:500] + "..."


class ContentProcessor:
    """Processes and analyzes different types of content."""

    def __init__(self):
        self.model_manager = ModelManager()

    def process_content(self, content: str) -> Dict[str, Any]:
        """Process content and generate insights."""
        if not content:
            return {"summary": "", "insights": []}
        try:
            summary = self.model_manager.generate_summary(content)
            return {
                "summary": summary,
                "insights": []  # Simplified for CPU deployment
            }
        except Exception as e:
            print(f"Error processing content: {e}")
            return {"summary": content[:500] + "...", "insights": []}


class WebSearchEngine:
    """Main search engine class."""

    def __init__(self):
        self.processor = ContentProcessor()
        self.session = requests.Session()
        self.request_delay = 1.0  # Minimum seconds between requests
        self.last_request_time = 0.0

    def is_valid_url(self, url: str) -> bool:
        """Check if URL is valid for crawling."""
        try:
            parsed = urlparse(url)
            return bool(parsed.netloc and parsed.scheme in ('http', 'https'))
        except ValueError:
            return False

    def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract metadata from page."""
        metadata = {}

        # Get title
        title = soup.find('title')
        if title:
            metadata['title'] = title.text.strip()

        # Get meta description
        desc = soup.find('meta', attrs={'name': 'description'})
        if desc:
            metadata['description'] = desc.get('content', '')

        # Get publication date
        date = soup.find('meta', attrs={'property': 'article:published_time'})
        if date:
            metadata['published_date'] = date.get('content', '').split('T')[0]

        return metadata

    def process_url(self, url: str) -> Optional[Dict[str, Any]]:
        """Process a single URL."""
        if not self.is_valid_url(url):
            return None
        try:
            # Rate limiting: sleep off the remaining delay if the last
            # request was too recent
            elapsed = time.time() - self.last_request_time
            if elapsed < self.request_delay:
                time.sleep(self.request_delay - elapsed)

            response = self.session.get(url, timeout=10)
            self.last_request_time = time.time()

            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.text, 'lxml')
            metadata = self.get_metadata(soup)

            # Extract main content (simplified): concatenate all paragraph text
            content = ' '.join(p.text for p in soup.find_all('p'))
            processed = self.processor.process_content(content)

            return {
                'url': url,
                'title': metadata.get('title', url),
                'summary': processed['summary'],
                'published_date': metadata.get('published_date', '')
            }
        except Exception as e:
            print(f"Error processing URL {url}: {e}")
            return None

    def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """Perform search and process results."""
        try:
            # Perform DuckDuckGo search; the legacy ddg() helper returns
            # dicts keyed by 'title', 'href', and 'body'
            search_results = ddg(query, max_results=max_results)

            results = []
            for result in search_results or []:
                processed = self.process_url(result.get('href', ''))
                if processed:
                    results.append(processed)

            return results[:max_results]
        except Exception as e:
            print(f"Error in search: {e}")
            return []


# Main search function
def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """Main search function."""
    engine = WebSearchEngine()
    return engine.search(query, max_results)
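

# Example usage: a minimal sketch, assuming network access is available and
# the Hugging Face models above can be downloaded on first run. The query
# string is illustrative only.
if __name__ == "__main__":
    for item in search("open source web crawlers", max_results=3):
        print(item['title'])
        print(item['url'])
        print(item['summary'])
        print('-' * 40)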