fikird committed
Commit dcc91e6 · 0 parents

Initial commit with all files

Files changed (6):
  1. README.md +59 -0
  2. app.py +57 -0
  3. packages.txt +4 -0
  4. requirements.txt +10 -0
  5. search_engine.py +156 -0
  6. space.yml +11 -0
README.md ADDED
@@ -0,0 +1,59 @@
+ # 🔍 Intelligent Search Engine
+ 
+ An AI-powered search engine that provides intelligent summaries and insights from web content.
+ 
+ ## Features
+ 
+ - 🌐 Web search powered by DuckDuckGo
+ - 🤖 AI-powered content summarization
+ - 📊 Semantic search capabilities
+ - 📱 Clean, responsive UI
+ 
+ ## Technical Details
+ 
+ ### Core Components
+ 
+ 1. **Search Engine (`search_engine.py`)**
+    - DuckDuckGo integration for web search
+    - Content processing and summarization
+    - URL validation and metadata extraction
+ 
+ 2. **Web Interface (`app.py`)**
+    - Gradio-based UI
+    - Error handling
+    - Result formatting
+ 
+ ### Models
+ 
+ - Summarization: facebook/bart-base
+ - Embeddings: sentence-transformers/all-MiniLM-L6-v2
+ 
+ ### Dependencies
+ 
+ - Python 3.10
+ - Gradio 4.14.0
+ - Transformers
+ - DuckDuckGo Search
+ - BeautifulSoup4
+ - LangChain
+ - Sentence Transformers
+ 
+ ## Usage
+ 
+ 1. Enter your search query in the text box
+ 2. Adjust the number of results using the slider
+ 3. Click "Submit" to see the results
+ 
+ ## Example Queries
+ 
+ - "Latest developments in artificial intelligence"
+ - "Climate change solutions"
+ - "Space exploration news"
+ 
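+ ## Programmatic Use
+ 
+ The Gradio UI is a thin wrapper around a plain Python function, so the engine can also be called directly. A minimal sketch, assuming the module-level `search` helper exported by `search_engine.py` (the query string is only an illustration):
+ 
+ ```python
+ from search_engine import search
+ 
+ # Each result is a dict with 'title', 'url', 'summary' and 'published_date' keys
+ for result in search("climate change solutions", max_results=3):
+     print(result["title"], "->", result["url"])
+ ```
+ 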
+ ## Deployment
+ 
+ This project is deployed on Hugging Face Spaces, optimized for CPU environments.
+ 
+ ## License
+ 
+ Apache 2.0
app.py ADDED
@@ -0,0 +1,57 @@
+ import gradio as gr
+ 
+ from search_engine import search
+ 
+ 
+ def safe_search(query, max_results=5):
+     """Run a search and format the results as Markdown, catching any errors."""
+     try:
+         results = search(query, max_results)
+         if not results:
+             return "No results found."
+ 
+         formatted_results = []
+         for result in results:
+             # Build each entry without leading indentation so the
+             # Markdown renders as headings and text, not code blocks.
+             formatted_results.append(
+                 f"### [{result['title']}]({result['url']})\n\n"
+                 f"{result['summary']}\n\n"
+                 f"**Source:** {result['url']}\n\n"
+                 f"**Published:** {result.get('published_date', 'N/A')}"
+             )
+ 
+         return "\n\n---\n\n".join(formatted_results)
+     except Exception as e:
+         return f"Error: {e}"
+ 
+ 
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=safe_search,
+     inputs=[
+         gr.Textbox(
+             label="Search Query",
+             placeholder="Enter your search query...",
+             lines=2
+         ),
+         gr.Slider(
+             minimum=1,
+             maximum=10,
+             value=5,
+             step=1,
+             label="Number of Results"
+         )
+     ],
+     outputs=gr.Markdown(label="Search Results"),
+     title="🔍 Intelligent Search Engine",
+     description=(
+         "An AI-powered search engine that provides intelligent summaries "
+         "and insights from web content.\n\n"
+         "Features:\n"
+         "- Smart content summarization\n"
+         "- Semantic search capabilities\n"
+         "- Clean, readable results"
+     ),
+     examples=[
+         ["Latest developments in artificial intelligence", 3],
+         ["Climate change solutions", 5],
+         ["Space exploration news", 4]
+     ],
+     theme=gr.themes.Soft()
+ )
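+ 
+ # A launch guard is the usual pattern for Gradio apps run as a script
+ # (e.g. on Hugging Face Spaces): without a launch() call the interface
+ # is only constructed, never served.
+ if __name__ == "__main__":
+     demo.launch()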
packages.txt ADDED
@@ -0,0 +1,4 @@
+ python3-dev
+ build-essential
+ git
+ libgomp1
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio==4.14.0
+ torch==2.1.0
+ transformers==4.35.2
+ duckduckgo-search==3.9.3
+ beautifulsoup4==4.12.2
+ langchain==0.0.335
+ sentence-transformers==2.2.2
+ lxml==4.9.3
+ requests==2.31.0
+ protobuf==4.25.1
search_engine.py ADDED
@@ -0,0 +1,156 @@
+ from typing import Any, Dict, List, Optional
+ import time
+ from urllib.parse import urlparse
+ 
+ import requests
+ from bs4 import BeautifulSoup
+ from duckduckgo_search import DDGS
+ from transformers import pipeline
+ from langchain.embeddings import HuggingFaceEmbeddings
+ 
+ 
+ class ModelManager:
+     """Manages AI models for text processing"""
+ 
+     def __init__(self):
+         # Initialize with smaller, CPU-friendly models. Note that
+         # facebook/bart-base is a general-purpose base model; a
+         # summarization fine-tune such as sshleifer/distilbart-cnn-12-6
+         # would likely give better summaries at a similar CPU cost.
+         self.summarizer = pipeline(
+             "summarization",
+             model="facebook/bart-base",
+             device=-1  # Use CPU
+         )
+         # Embeddings back the semantic-search features (not yet used by
+         # the simplified pipeline below)
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2"
+         )
+ 
+     def generate_summary(self, text: str, max_length: int = 150) -> str:
+         """Generate a concise summary of the text"""
+         # Texts shorter than ~50 words are returned unchanged
+         if not text or len(text.split()) < 50:
+             return text
+ 
+         try:
+             summary = self.summarizer(
+                 text,
+                 max_length=max_length,
+                 min_length=30,
+                 do_sample=False
+             )[0]['summary_text']
+             return summary
+         except Exception as e:
+             print(f"Error in summarization: {e}")
+             return text[:500] + "..."
+ 
+ 
+ class ContentProcessor:
+     """Processes and analyzes different types of content"""
+ 
+     def __init__(self):
+         self.model_manager = ModelManager()
+ 
+     def process_content(self, content: str) -> Dict[str, Any]:
+         """Process content and generate insights"""
+         if not content:
+             return {"summary": "", "insights": []}
+ 
+         try:
+             summary = self.model_manager.generate_summary(content)
+             return {
+                 "summary": summary,
+                 "insights": []  # Simplified for CPU deployment
+             }
+         except Exception as e:
+             print(f"Error processing content: {e}")
+             return {"summary": content[:500] + "...", "insights": []}
+ 
+ 
+ class WebSearchEngine:
+     """Main search engine class"""
+ 
+     def __init__(self):
+         self.processor = ContentProcessor()
+         self.session = requests.Session()
+         # A descriptive User-Agent makes some sites less likely to block requests
+         self.session.headers.update(
+             {"User-Agent": "Mozilla/5.0 (compatible; IntelligentSearchEngine/1.0)"}
+         )
+         self.request_delay = 1.0
+         self.last_request_time = 0.0
+ 
+     def is_valid_url(self, url: str) -> bool:
+         """Check if URL is valid for crawling"""
+         try:
+             parsed = urlparse(url)
+             return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
+         except ValueError:
+             return False
+ 
+     def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
+         """Extract metadata from page"""
+         metadata = {}
+ 
+         # Get title
+         title = soup.find('title')
+         if title:
+             metadata['title'] = title.text.strip()
+ 
+         # Get meta description
+         desc = soup.find('meta', attrs={'name': 'description'})
+         if desc:
+             metadata['description'] = desc.get('content', '')
+ 
+         # Get publication date
+         date = soup.find('meta', attrs={'property': 'article:published_time'})
+         if date:
+             metadata['published_date'] = date.get('content', '').split('T')[0]
+ 
+         return metadata
+ 
+     def process_url(self, url: str) -> Optional[Dict[str, Any]]:
+         """Process a single URL; returns None if it cannot be fetched."""
+         if not self.is_valid_url(url):
+             return None
+ 
+         try:
+             # Rate limiting: sleep only for the remaining part of the delay
+             elapsed = time.time() - self.last_request_time
+             if elapsed < self.request_delay:
+                 time.sleep(self.request_delay - elapsed)
+ 
+             response = self.session.get(url, timeout=10)
+             self.last_request_time = time.time()
+ 
+             if response.status_code != 200:
+                 return None
+ 
+             soup = BeautifulSoup(response.text, 'lxml')
+             metadata = self.get_metadata(soup)
+ 
+             # Extract main content (simplified: concatenate all paragraphs)
+             content = ' '.join(p.text for p in soup.find_all('p'))
+             processed = self.processor.process_content(content)
+ 
+             return {
+                 'url': url,
+                 'title': metadata.get('title', url),
+                 'summary': processed['summary'],
+                 'published_date': metadata.get('published_date', '')
+             }
+ 
+         except Exception as e:
+             print(f"Error processing URL {url}: {e}")
+             return None
+ 
+     def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
+         """Perform search and process results"""
+         try:
+             # Perform DuckDuckGo search. duckduckgo-search 3.9 removed the
+             # legacy ddg() helper, so use the DDGS class; each hit is a dict
+             # with 'title', 'href' and 'body' keys.
+             with DDGS() as ddgs:
+                 search_results = list(ddgs.text(query, max_results=max_results))
+ 
+             results = []
+             for result in search_results:
+                 processed = self.process_url(result['href'])
+                 if processed:
+                     results.append(processed)
+ 
+             return results[:max_results]
+ 
+         except Exception as e:
+             print(f"Error in search: {e}")
+             return []
+ 
+ 
+ # Module-level engine, created lazily so the models are loaded only once
+ # rather than on every query
+ _engine: Optional[WebSearchEngine] = None
+ 
+ 
+ def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
+     """Main search function"""
+     global _engine
+     if _engine is None:
+         _engine = WebSearchEngine()
+     return _engine.search(query, max_results)
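+ 
+ 
+ if __name__ == "__main__":
+     # Minimal manual smoke test; the query is only an illustration.
+     for r in search("space exploration news", max_results=2):
+         print(r["title"], "->", r["url"])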
space.yml ADDED
@@ -0,0 +1,11 @@
+ title: Intelligent Search Engine
+ emoji: 🔍
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.14.0
+ python_version: "3.10"
+ app_file: app.py
+ app_port: 7860
+ pinned: false
+ license: apache-2.0