fikird committed
Commit dcc91e6 · 0 parents

Initial commit with all files

Files changed (6):
  1. README.md +59 -0
  2. app.py +57 -0
  3. packages.txt +4 -0
  4. requirements.txt +10 -0
  5. search_engine.py +156 -0
  6. space.yml +11 -0
README.md ADDED
@@ -0,0 +1,59 @@
+ # 🔍 Intelligent Search Engine
+ 
+ An AI-powered search engine that provides intelligent summaries and insights from web content.
+ 
+ ## Features
+ 
+ - 🌐 Web search powered by DuckDuckGo
+ - 🤖 AI-powered content summarization
+ - 📊 Semantic search capabilities
+ - 📱 Clean, responsive UI
+ 
+ ## Technical Details
+ 
+ ### Core Components
+ 
+ 1. **Search Engine (`search_engine.py`)**
+    - DuckDuckGo integration for web search
+    - Content processing and summarization
+    - URL validation and metadata extraction
+ 
+ 2. **Web Interface (`app.py`)**
+    - Gradio-based UI
+    - Error handling
+    - Result formatting
+ 
+ ### Models
+ 
+ - Summarization: facebook/bart-base
+ - Embeddings: sentence-transformers/all-MiniLM-L6-v2
+ 
+ ### Dependencies
+ 
+ - Python 3.10
+ - Gradio 4.14.0
+ - Transformers
+ - DuckDuckGo Search
+ - BeautifulSoup4
+ - LangChain
+ - Sentence Transformers
+ 
+ ## Usage
+ 
+ 1. Enter your search query in the text box
+ 2. Adjust the number of results using the slider
+ 3. Click "Submit" to see the results
+ 
+ ## Example Queries
+ 
+ - "Latest developments in artificial intelligence"
+ - "Climate change solutions"
+ - "Space exploration news"
+ 
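+ ## Programmatic Use
+ 
+ The Gradio UI is a thin wrapper around a plain Python function, so the engine can also be called directly. A minimal sketch, assuming the module-level `search` helper exported by `search_engine.py` (the query string is only an illustration):
+ 
+ ```python
+ from search_engine import search
+ 
+ # Each result is a dict with 'title', 'url', 'summary' and 'published_date' keys
+ for result in search("climate change solutions", max_results=3):
+     print(result["title"], "->", result["url"])
+ ```
+ 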
+ ## Deployment
+ 
+ This project is deployed on Hugging Face Spaces, optimized for CPU environments.
+ 
+ ## License
+ 
+ Apache 2.0
app.py ADDED
@@ -0,0 +1,57 @@
+ import gradio as gr
+ 
+ from search_engine import search
+ 
+ 
+ def safe_search(query, max_results=5):
+     """Run a search and format the results as Markdown, catching any errors."""
+     try:
+         results = search(query, max_results)
+         if not results:
+             return "No results found."
+ 
+         formatted_results = []
+         for result in results:
+             # Build each entry without leading indentation so the
+             # Markdown renders as headings and text, not code blocks.
+             formatted_results.append(
+                 f"### [{result['title']}]({result['url']})\n\n"
+                 f"{result['summary']}\n\n"
+                 f"**Source:** {result['url']}\n\n"
+                 f"**Published:** {result.get('published_date', 'N/A')}"
+             )
+ 
+         return "\n\n---\n\n".join(formatted_results)
+     except Exception as e:
+         return f"Error: {e}"
+ 
+ 
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=safe_search,
+     inputs=[
+         gr.Textbox(
+             label="Search Query",
+             placeholder="Enter your search query...",
+             lines=2
+         ),
+         gr.Slider(
+             minimum=1,
+             maximum=10,
+             value=5,
+             step=1,
+             label="Number of Results"
+         )
+     ],
+     outputs=gr.Markdown(label="Search Results"),
+     title="🔍 Intelligent Search Engine",
+     description=(
+         "An AI-powered search engine that provides intelligent summaries "
+         "and insights from web content.\n\n"
+         "Features:\n"
+         "- Smart content summarization\n"
+         "- Semantic search capabilities\n"
+         "- Clean, readable results"
+     ),
+     examples=[
+         ["Latest developments in artificial intelligence", 3],
+         ["Climate change solutions", 5],
+         ["Space exploration news", 4]
+     ],
+     theme=gr.themes.Soft()
+ )
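+ 
+ # A launch guard is the usual pattern for Gradio apps run as a script
+ # (e.g. on Hugging Face Spaces): without a launch() call the interface
+ # is only constructed, never served.
+ if __name__ == "__main__":
+     demo.launch()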
packages.txt ADDED
@@ -0,0 +1,4 @@
+ python3-dev
+ build-essential
+ git
+ libgomp1
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio==4.14.0
+ torch==2.1.0
+ transformers==4.35.2
+ duckduckgo-search==3.9.3
+ beautifulsoup4==4.12.2
+ langchain==0.0.335
+ sentence-transformers==2.2.2
+ lxml==4.9.3
+ requests==2.31.0
+ protobuf==4.25.1
search_engine.py ADDED
@@ -0,0 +1,156 @@
+ from typing import Any, Dict, List, Optional
+ import time
+ from urllib.parse import urlparse
+ 
+ import requests
+ from bs4 import BeautifulSoup
+ from duckduckgo_search import DDGS
+ from transformers import pipeline
+ from langchain.embeddings import HuggingFaceEmbeddings
+ 
+ 
+ class ModelManager:
+     """Manages AI models for text processing"""
+ 
+     def __init__(self):
+         # Initialize with smaller, CPU-friendly models. Note that
+         # facebook/bart-base is a general-purpose base model; a
+         # summarization fine-tune such as sshleifer/distilbart-cnn-12-6
+         # would likely give better summaries at a similar CPU cost.
+         self.summarizer = pipeline(
+             "summarization",
+             model="facebook/bart-base",
+             device=-1  # Use CPU
+         )
+         # Embeddings back the semantic-search features (not yet used by
+         # the simplified pipeline below)
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2"
+         )
+ 
+     def generate_summary(self, text: str, max_length: int = 150) -> str:
+         """Generate a concise summary of the text"""
+         # Texts shorter than ~50 words are returned unchanged
+         if not text or len(text.split()) < 50:
+             return text
+ 
+         try:
+             summary = self.summarizer(
+                 text,
+                 max_length=max_length,
+                 min_length=30,
+                 do_sample=False
+             )[0]['summary_text']
+             return summary
+         except Exception as e:
+             print(f"Error in summarization: {e}")
+             return text[:500] + "..."
+ 
+ 
+ class ContentProcessor:
+     """Processes and analyzes different types of content"""
+ 
+     def __init__(self):
+         self.model_manager = ModelManager()
+ 
+     def process_content(self, content: str) -> Dict[str, Any]:
+         """Process content and generate insights"""
+         if not content:
+             return {"summary": "", "insights": []}
+ 
+         try:
+             summary = self.model_manager.generate_summary(content)
+             return {
+                 "summary": summary,
+                 "insights": []  # Simplified for CPU deployment
+             }
+         except Exception as e:
+             print(f"Error processing content: {e}")
+             return {"summary": content[:500] + "...", "insights": []}
+ 
+ 
+ class WebSearchEngine:
+     """Main search engine class"""
+ 
+     def __init__(self):
+         self.processor = ContentProcessor()
+         self.session = requests.Session()
+         # A descriptive User-Agent makes some sites less likely to block requests
+         self.session.headers.update(
+             {"User-Agent": "Mozilla/5.0 (compatible; IntelligentSearchEngine/1.0)"}
+         )
+         self.request_delay = 1.0
+         self.last_request_time = 0.0
+ 
+     def is_valid_url(self, url: str) -> bool:
+         """Check if URL is valid for crawling"""
+         try:
+             parsed = urlparse(url)
+             return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
+         except ValueError:
+             return False
+ 
+     def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
+         """Extract metadata from page"""
+         metadata = {}
+ 
+         # Get title
+         title = soup.find('title')
+         if title:
+             metadata['title'] = title.text.strip()
+ 
+         # Get meta description
+         desc = soup.find('meta', attrs={'name': 'description'})
+         if desc:
+             metadata['description'] = desc.get('content', '')
+ 
+         # Get publication date
+         date = soup.find('meta', attrs={'property': 'article:published_time'})
+         if date:
+             metadata['published_date'] = date.get('content', '').split('T')[0]
+ 
+         return metadata
+ 
+     def process_url(self, url: str) -> Optional[Dict[str, Any]]:
+         """Process a single URL; returns None if it cannot be fetched."""
+         if not self.is_valid_url(url):
+             return None
+ 
+         try:
+             # Rate limiting: sleep only for the remaining part of the delay
+             elapsed = time.time() - self.last_request_time
+             if elapsed < self.request_delay:
+                 time.sleep(self.request_delay - elapsed)
+ 
+             response = self.session.get(url, timeout=10)
+             self.last_request_time = time.time()
+ 
+             if response.status_code != 200:
+                 return None
+ 
+             soup = BeautifulSoup(response.text, 'lxml')
+             metadata = self.get_metadata(soup)
+ 
+             # Extract main content (simplified: concatenate all paragraphs)
+             content = ' '.join(p.text for p in soup.find_all('p'))
+             processed = self.processor.process_content(content)
+ 
+             return {
+                 'url': url,
+                 'title': metadata.get('title', url),
+                 'summary': processed['summary'],
+                 'published_date': metadata.get('published_date', '')
+             }
+ 
+         except Exception as e:
+             print(f"Error processing URL {url}: {e}")
+             return None
+ 
+     def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
+         """Perform search and process results"""
+         try:
+             # Perform DuckDuckGo search. duckduckgo-search 3.9 removed the
+             # legacy ddg() helper, so use the DDGS class; each hit is a dict
+             # with 'title', 'href' and 'body' keys.
+             with DDGS() as ddgs:
+                 search_results = list(ddgs.text(query, max_results=max_results))
+ 
+             results = []
+             for result in search_results:
+                 processed = self.process_url(result['href'])
+                 if processed:
+                     results.append(processed)
+ 
+             return results[:max_results]
+ 
+         except Exception as e:
+             print(f"Error in search: {e}")
+             return []
+ 
+ 
+ # Module-level engine, created lazily so the models are loaded only once
+ # rather than on every query
+ _engine: Optional[WebSearchEngine] = None
+ 
+ 
+ def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
+     """Main search function"""
+     global _engine
+     if _engine is None:
+         _engine = WebSearchEngine()
+     return _engine.search(query, max_results)
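+ 
+ 
+ if __name__ == "__main__":
+     # Minimal manual smoke test; the query is only an illustration.
+     for r in search("space exploration news", max_results=2):
+         print(r["title"], "->", r["url"])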
space.yml ADDED
@@ -0,0 +1,11 @@
+ title: Intelligent Search Engine
+ emoji: 🔍
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.14.0
+ python_version: "3.10"
+ app_file: app.py
+ app_port: 7860
+ pinned: false
+ license: apache-2.0