# ISE / search_engine.py
import time
from typing import Any, Dict, List
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import ddg  # legacy helper; newer duckduckgo_search releases expose DDGS instead
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

class ModelManager:
"""Manages AI models for text processing"""
def __init__(self):
# Initialize with smaller, CPU-friendly models
self.summarizer = pipeline(
"summarization",
model="facebook/bart-base",
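            # Note: bart-base has no summarization fine-tuning; a checkpoint
            # such as sshleifer/distilbart-cnn-12-6 usually summarizes better on CPU.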
device=-1 # Use CPU
)
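        # The embedding model below is loaded but not referenced elsewhere in
        # this module, presumably reserved for later semantic-ranking work.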
self.embeddings = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-MiniLM-L6-v2"
)
def generate_summary(self, text: str, max_length: int = 150) -> str:
"""Generate a concise summary of the text"""
if not text or len(text.split()) < 50:
return text
try:
summary = self.summarizer(
text,
max_length=max_length,
min_length=30,
do_sample=False
)[0]['summary_text']
return summary
except Exception as e:
print(f"Error in summarization: {e}")
return text[:500] + "..."
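
# Example usage of ModelManager (hypothetical input), assuming the models load:
#   ModelManager().generate_summary("<several hundred words of article text>")
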
class ContentProcessor:
"""Processes and analyzes different types of content"""
def __init__(self):
self.model_manager = ModelManager()
def process_content(self, content: str) -> Dict[str, Any]:
"""Process content and generate insights"""
if not content:
return {"summary": "", "insights": []}
try:
summary = self.model_manager.generate_summary(content)
return {
"summary": summary,
"insights": [] # Simplified for CPU deployment
}
except Exception as e:
print(f"Error processing content: {e}")
return {"summary": content[:500] + "...", "insights": []}
class WebSearchEngine:
"""Main search engine class"""
def __init__(self):
self.processor = ContentProcessor()
self.session = requests.Session()
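        # Identify the client; many sites reject requests' default User-Agent.
        # The UA string below is a placeholder, not a project-defined value.
        self.session.headers.setdefault("User-Agent", "ISE-SearchEngine/1.0")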
self.request_delay = 1.0
self.last_request_time = 0
def is_valid_url(self, url: str) -> bool:
"""Check if URL is valid for crawling"""
try:
parsed = urlparse(url)
return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
        except ValueError:
            return False
def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
"""Extract metadata from page"""
metadata = {}
# Get title
title = soup.find('title')
if title:
metadata['title'] = title.text.strip()
# Get meta description
desc = soup.find('meta', attrs={'name': 'description'})
if desc:
metadata['description'] = desc.get('content', '')
# Get publication date
date = soup.find('meta', attrs={'property': 'article:published_time'})
if date:
metadata['published_date'] = date.get('content', '').split('T')[0]
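        # metadata may now contain any of: 'title', 'description', 'published_date' (YYYY-MM-DD)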
return metadata
def process_url(self, url: str) -> Dict[str, Any]:
"""Process a single URL"""
if not self.is_valid_url(url):
return None
try:
            # Rate limiting: sleep only for the unelapsed remainder of the delay
            elapsed = time.time() - self.last_request_time
            if elapsed < self.request_delay:
                time.sleep(self.request_delay - elapsed)
            response = self.session.get(url, timeout=10)
            self.last_request_time = time.time()
if response.status_code != 200:
return None
soup = BeautifulSoup(response.text, 'lxml')
metadata = self.get_metadata(soup)
# Extract main content (simplified)
content = ' '.join([p.text for p in soup.find_all('p')])
processed = self.processor.process_content(content)
return {
'url': url,
'title': metadata.get('title', url),
'summary': processed['summary'],
'published_date': metadata.get('published_date', '')
}
except Exception as e:
print(f"Error processing URL {url}: {e}")
return None
def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
"""Perform search and process results"""
try:
            # Perform DuckDuckGo search; ddg() returns dicts keyed 'title', 'href', 'body'
            search_results = ddg(query, max_results=max_results) or []
            results = []
            for result in search_results:
                processed = self.process_url(result['href'])
if processed:
results.append(processed)
return results[:max_results]
except Exception as e:
print(f"Error in search: {e}")
return []
# Main search function
def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
"""Main search function"""
engine = WebSearchEngine()
return engine.search(query, max_results)
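

if __name__ == "__main__":
    # Minimal smoke test, assuming network access and successful model
    # downloads; the query and result count are arbitrary examples.
    for item in search("python web scraping", max_results=3):
        print(item["title"])
        print(item["url"])
        print(item["summary"][:200])
        print("-" * 40)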