Commit dcc91e6 · fikird committed · Parent(s): none (initial commit)

Initial commit with all files

Files changed:
- README.md (+59, -0)
- app.py (+57, -0)
- packages.txt (+4, -0)
- requirements.txt (+10, -0)
- search_engine.py (+156, -0)
- space.yml (+11, -0)
README.md
ADDED
@@ -0,0 +1,59 @@
```markdown
# 🔍 Intelligent Search Engine

An AI-powered search engine that provides intelligent summaries and insights from web content.

## Features

- 🌐 Web search powered by DuckDuckGo
- 🤖 AI-powered content summarization
- 📊 Semantic search capabilities
- 📱 Clean, responsive UI

## Technical Details

### Core Components

1. **Search Engine (`search_engine.py`)**
   - DuckDuckGo integration for web search
   - Content processing and summarization
   - URL validation and metadata extraction

2. **Web Interface (`app.py`)**
   - Gradio-based UI
   - Error handling
   - Result formatting

### Models

- Summarization: facebook/bart-base
- Embeddings: sentence-transformers/all-MiniLM-L6-v2

### Dependencies

- Python 3.10
- Gradio 4.14.0
- Transformers
- DuckDuckGo Search
- BeautifulSoup4
- LangChain
- Sentence Transformers

## Usage

1. Enter your search query in the text box
2. Adjust the number of results using the slider
3. Click "Submit" to see the results

## Example Queries

- "Latest developments in artificial intelligence"
- "Climate change solutions"
- "Space exploration news"

## Deployment

This project is deployed on Hugging Face Spaces, optimized for CPU environments.

## License

Apache 2.0
```
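For a quick sanity check outside the UI, the `search` function that `search_engine.py` exports (shown later in this commit) can also be called directly. A minimal sketch, assuming it runs from the repo root with the dependencies from requirements.txt installed:

```python
# Minimal smoke test: query the engine programmatically instead of via the UI.
from search_engine import search

for result in search("Climate change solutions", max_results=3):
    print(result["title"], "->", result["url"])
    print(result["summary"][:200], "\n")
```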
app.py
ADDED
@@ -0,0 +1,57 @@
```python
import gradio as gr
from search_engine import search


def safe_search(query, max_results=5):
    """Run a search and format each result as a Markdown card, trapping errors."""
    try:
        results = search(query, max_results)
        formatted_results = []

        for result in results:
            formatted_result = f"""
### [{result['title']}]({result['url']})

{result['summary']}

**Source:** {result['url']}
**Published:** {result.get('published_date', 'N/A')}
"""
            formatted_results.append(formatted_result)

        return "\n---\n".join(formatted_results)
    except Exception as e:
        return f"Error: {str(e)}"


# Create Gradio interface
demo = gr.Interface(
    fn=safe_search,
    inputs=[
        gr.Textbox(
            label="Search Query",
            placeholder="Enter your search query...",
            lines=2
        ),
        gr.Slider(
            minimum=1,
            maximum=10,
            value=5,
            step=1,
            label="Number of Results"
        )
    ],
    outputs=gr.Markdown(label="Search Results"),
    title="🔍 Intelligent Search Engine",
    description="""
An AI-powered search engine that provides intelligent summaries and insights from web content.

Features:
- Smart content summarization
- Semantic search capabilities
- Clean, readable results
""",
    examples=[
        ["Latest developments in artificial intelligence", 3],
        ["Climate change solutions", 5],
        ["Space exploration news", 4]
    ],
    theme=gr.themes.Soft()
)

# Launch the app when run as a script. The standard Gradio Spaces template
# includes this call; without it the process just defines `demo` and exits.
if __name__ == "__main__":
    demo.launch()
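The result formatter can also be exercised without starting the Gradio server. A minimal sketch under the same environment (the `__main__` guard above keeps the import from launching the app):

```python
# Render one query's results as Markdown without serving the UI.
from app import safe_search

print(safe_search("Space exploration news", max_results=2))
```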
packages.txt
ADDED
@@ -0,0 +1,4 @@
```text
python3-dev
build-essential
git
libgomp1
```
requirements.txt
ADDED
@@ -0,0 +1,10 @@
```text
gradio==4.14.0
torch==2.1.0
transformers==4.35.2
duckduckgo-search==3.9.3
beautifulsoup4==4.12.2
langchain==0.0.335
sentence-transformers==2.2.2
lxml==4.9.3
requests==2.31.0
protobuf==4.25.1
```
search_engine.py
ADDED
@@ -0,0 +1,156 @@
```python
from typing import Any, Dict, List, Optional
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS  # 3.9.x removed the legacy ddg() helper
from transformers import pipeline
from langchain.embeddings import HuggingFaceEmbeddings


class ModelManager:
    """Manages AI models for text processing"""
    def __init__(self):
        # Initialize with smaller, CPU-friendly models
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-base",
            device=-1  # use CPU
        )
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def generate_summary(self, text: str, max_length: int = 150) -> str:
        """Generate a concise summary of the text"""
        # Very short texts are returned unchanged; summarizing adds nothing.
        if not text or len(text.split()) < 50:
            return text

        try:
            summary = self.summarizer(
                text,
                max_length=max_length,
                min_length=30,
                do_sample=False
            )[0]['summary_text']
            return summary
        except Exception as e:
            print(f"Error in summarization: {e}")
            return text[:500] + "..."


class ContentProcessor:
    """Processes and analyzes different types of content"""
    def __init__(self):
        self.model_manager = ModelManager()

    def process_content(self, content: str) -> Dict[str, Any]:
        """Process content and generate insights"""
        if not content:
            return {"summary": "", "insights": []}

        try:
            summary = self.model_manager.generate_summary(content)
            return {
                "summary": summary,
                "insights": []  # simplified for CPU deployment
            }
        except Exception as e:
            print(f"Error processing content: {e}")
            return {"summary": content[:500] + "...", "insights": []}


class WebSearchEngine:
    """Main search engine class"""
    def __init__(self):
        self.processor = ContentProcessor()
        self.session = requests.Session()
        self.request_delay = 1.0
        self.last_request_time = 0.0

    def is_valid_url(self, url: str) -> bool:
        """Check if URL is valid for crawling"""
        try:
            parsed = urlparse(url)
            return bool(parsed.netloc and parsed.scheme in ('http', 'https'))
        except ValueError:
            return False

    def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract metadata from page"""
        metadata = {}

        # Title
        title = soup.find('title')
        if title:
            metadata['title'] = title.text.strip()

        # Meta description
        desc = soup.find('meta', attrs={'name': 'description'})
        if desc:
            metadata['description'] = desc.get('content', '')

        # Publication date (Open Graph article tag)
        date = soup.find('meta', attrs={'property': 'article:published_time'})
        if date:
            metadata['published_date'] = date.get('content', '').split('T')[0]

        return metadata

    def process_url(self, url: str) -> Optional[Dict[str, Any]]:
        """Fetch, parse, and summarize a single URL; returns None on failure."""
        if not self.is_valid_url(url):
            return None

        try:
            # Rate limiting: wait out the remainder of the delay window
            elapsed = time.time() - self.last_request_time
            if elapsed < self.request_delay:
                time.sleep(self.request_delay - elapsed)

            response = self.session.get(url, timeout=10)
            self.last_request_time = time.time()

            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.text, 'lxml')
            metadata = self.get_metadata(soup)

            # Extract main content (simplified: concatenate paragraph text)
            content = ' '.join(p.text for p in soup.find_all('p'))
            processed = self.processor.process_content(content)

            return {
                'url': url,
                'title': metadata.get('title', url),
                'summary': processed['summary'],
                'published_date': metadata.get('published_date', '')
            }

        except Exception as e:
            print(f"Error processing URL {url}: {e}")
            return None

    def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """Perform search and process results"""
        try:
            # DuckDuckGo search; DDGS.text yields dicts with 'title', 'href', 'body'
            with DDGS() as ddgs:
                search_results = list(ddgs.text(query, max_results=max_results))

            results = []
            for result in search_results:
                processed = self.process_url(result['href'])
                if processed:
                    results.append(processed)

            return results[:max_results]

        except Exception as e:
            print(f"Error in search: {e}")
            return []


# Module-level engine, created once so models are not reloaded on every query
_engine: Optional[WebSearchEngine] = None


def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """Main search function"""
    global _engine
    if _engine is None:
        _engine = WebSearchEngine()
    return _engine.search(query, max_results)
```
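A note on the search call above: the pinned duckduckgo-search 3.9.3 removed the legacy `ddg()` helper that the original code imported, and its results expose the URL as `href` rather than `link`, so the reconstruction uses the `DDGS` class instead. A minimal standalone check of the raw result shape:

```python
# Inspect raw DuckDuckGo results (duckduckgo-search 3.9.x API).
from duckduckgo_search import DDGS

with DDGS() as ddgs:
    for r in ddgs.text("hugging face spaces", max_results=2):
        print(r["title"], r["href"])  # snippet text is under r["body"]
```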
space.yml
ADDED
@@ -0,0 +1,11 @@
```yaml
title: Intelligent Search Engine
emoji: 🔍
colorFrom: blue
colorTo: indigo
sdk: gradio
sdk_version: 4.14.0
python_version: "3.10"
app_file: app.py
app_port: 7860
pinned: false
license: apache-2.0
```
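One caveat worth flagging: Hugging Face Spaces reads this configuration from the YAML front matter at the top of README.md, not from a standalone space.yml, so this file is most likely ignored at deploy time. A sketch of the equivalent front matter for README.md (`app_port` is a Docker-SDK setting, so it is dropped here; Gradio Spaces serve on 7860 by default):

```yaml
---
title: Intelligent Search Engine
emoji: 🔍
colorFrom: blue
colorTo: indigo
sdk: gradio
sdk_version: 4.14.0
python_version: "3.10"
app_file: app.py
pinned: false
license: apache-2.0
---
```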