fikird committed
Commit a3440c5 · 1 Parent(s): 48922fa

feat: Enhanced search engine with caching and metadata

- Added result caching with TTL
- Improved content extraction
- Enhanced metadata collection
- Optimized dependencies
- Removed unnecessary files
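
As a minimal sketch of what the new caching means for a caller, based on the methods this commit adds to engines/search.py (the `engines.search` import path and the demo wrapper are assumptions for illustration, not part of the commit):

import asyncio

from engines.search import SearchEngine  # assumed package layout for engines/search.py

async def demo() -> None:
    engine = SearchEngine()

    # First call: search_web() runs (throttled by search_delay), pages are
    # fetched and chunked, and the final dict is stored in engine.cache under
    # an MD5 key built by _get_cache_key(query).
    first = await engine.process_search_results("open source RAG search")

    # A repeat of the same query within cache_ttl (24 hours) is answered from
    # the in-memory cache via _get_cached_result(); no new web search happens.
    second = await engine.process_search_results("open source RAG search")

    print(sorted(first))    # ['answer', 'metadata', 'sources']
    assert first is second  # the cached dict object is returned as-is

asyncio.run(demo())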

Files changed (7)
  1. apt.txt +0 -8
  2. engines/search.py +159 -13
  3. osint_engine.py +0 -489
  4. packages.txt +0 -25
  5. requirements.txt +32 -35
  6. search_engine.py +0 -219
  7. space.yml +0 -11
apt.txt DELETED
@@ -1,8 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1
engines/search.py CHANGED
@@ -1,5 +1,5 @@
 """
-RAG-based search engine with intelligent answer synthesis.
+Advanced RAG-based search engine with multi-source intelligence.
 """
 from typing import List, Dict, Any, Optional
 import asyncio
@@ -13,6 +13,12 @@ from googlesearch import search as gsearch
 import requests
 from bs4 import BeautifulSoup
 from tenacity import retry, stop_after_attempt, wait_exponential
+import json
+import time
+from datetime import datetime, timedelta
+import hashlib
+from urllib.parse import urlparse
+import re
 
 class SearchEngine:
     def __init__(self):
@@ -23,12 +29,42 @@ class SearchEngine:
             chunk_size=500,
             chunk_overlap=50
         )
+        self.cache = {}
+        self.cache_ttl = timedelta(hours=24)
+        self.search_delay = 2  # seconds between searches
+        self.last_search_time = datetime.min
+
+    def _get_cache_key(self, query: str, **kwargs) -> str:
+        """Generate cache key from query and kwargs."""
+        cache_data = {
+            "query": query,
+            **kwargs
+        }
+        return hashlib.md5(json.dumps(cache_data, sort_keys=True).encode()).hexdigest()
+
+    def _get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
+        """Get result from cache if valid."""
+        if cache_key in self.cache:
+            result, timestamp = self.cache[cache_key]
+            if datetime.now() - timestamp < self.cache_ttl:
+                return result
+            del self.cache[cache_key]
+        return None
+
+    def _set_cached_result(self, cache_key: str, result: Dict[str, Any]):
+        """Store result in cache."""
+        self.cache[cache_key] = (result, datetime.now())
 
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def search_web(self, query: str, max_results: int = 10) -> List[Dict[str, str]]:
         """Perform web search using multiple search engines."""
         results = []
 
+        # Respect rate limiting
+        time_since_last = datetime.now() - self.last_search_time
+        if time_since_last.total_seconds() < self.search_delay:
+            await asyncio.sleep(self.search_delay - time_since_last.total_seconds())
+
         # DuckDuckGo Search
         try:
             with DDGS() as ddgs:
@@ -44,8 +80,26 @@ class SearchEngine:
         except Exception as e:
             print(f"Google search error: {e}")
 
+        self.last_search_time = datetime.now()
         return results[:max_results]
 
+    def _clean_html(self, html: str) -> str:
+        """Clean HTML content."""
+        # Remove script and style elements
+        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)
+        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)
+
+        # Remove comments
+        html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
+
+        # Remove remaining tags
+        html = re.sub(r'<[^>]+>', ' ', html)
+
+        # Clean whitespace
+        html = re.sub(r'\s+', ' ', html).strip()
+
+        return html
+
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def fetch_content(self, url: str) -> Optional[str]:
         """Fetch and extract content from a webpage."""
@@ -56,25 +110,90 @@ class SearchEngine:
             response = requests.get(url, headers=headers, timeout=10)
             response.raise_for_status()
 
+            # Extract main content
             soup = BeautifulSoup(response.text, "html.parser")
 
             # Remove unwanted elements
-            for element in soup(["script", "style", "nav", "footer", "header"]):
+            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                 element.decompose()
 
-            text = soup.get_text(separator="\n", strip=True)
+            # Try to find main content
+            main_content = None
+
+            # Look for article tag
+            if soup.find("article"):
+                main_content = soup.find("article")
+
+            # Look for main tag
+            elif soup.find("main"):
+                main_content = soup.find("main")
+
+            # Look for div with common content class names
+            elif soup.find("div", class_=re.compile(r"content|article|post|entry")):
+                main_content = soup.find("div", class_=re.compile(r"content|article|post|entry"))
+
+            # Use body if no main content found
+            if not main_content:
+                main_content = soup.body
+
+            # Extract text
+            if main_content:
+                text = self._clean_html(str(main_content))
+            else:
+                text = self._clean_html(response.text)
+
             return text
         except Exception as e:
             print(f"Error fetching {url}: {e}")
             return None
 
+    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
+        """Extract metadata from webpage."""
+        metadata = {
+            "url": url,
+            "domain": urlparse(url).netloc,
+            "title": None,
+            "description": None,
+            "published_date": None,
+            "author": None,
+            "keywords": None
+        }
+
+        # Extract title
+        if soup.title:
+            metadata["title"] = soup.title.string
+
+        # Extract meta tags
+        for meta in soup.find_all("meta"):
+            name = meta.get("name", "").lower()
+            property = meta.get("property", "").lower()
+            content = meta.get("content")
+
+            if name == "description" or property == "og:description":
+                metadata["description"] = content
+            elif name == "author":
+                metadata["author"] = content
+            elif name == "keywords":
+                metadata["keywords"] = content
+            elif name in ["published_time", "article:published_time"]:
+                metadata["published_date"] = content
+
+        return metadata
+
     async def process_search_results(self, query: str) -> Dict[str, Any]:
         """Process search results and create a RAG-based answer."""
+        cache_key = self._get_cache_key(query)
+        cached_result = self._get_cached_result(cache_key)
+        if cached_result:
+            return cached_result
+
         # Perform web search
         search_results = await self.search_web(query)
 
         # Fetch content from search results
         documents = []
+        metadata_list = []
+
         for result in search_results:
             url = result.get("link")
             if not url:
@@ -84,17 +203,28 @@ class SearchEngine:
             if content:
                 # Split content into chunks
                 chunks = self.text_splitter.split_text(content)
+
+                # Store metadata
+                metadata = {
+                    "source": url,
+                    "title": result.get("title", url),
+                    **result
+                }
+                metadata_list.append(metadata)
+
+                # Create documents
                 for chunk in chunks:
                     doc = Document(
                         page_content=chunk,
-                        metadata={"source": url, "title": result.get("title", url)}
+                        metadata=metadata
                     )
                     documents.append(doc)
 
         if not documents:
             return {
                 "answer": "I couldn't find any relevant information.",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }
 
         # Create vector store
@@ -109,18 +239,33 @@ class SearchEngine:
         # Get relevant documents
         relevant_docs = chain.retriever.get_relevant_documents(query)
 
-        # For now, return the most relevant chunks and sources
+        # Extract unique sources and content
         sources = []
         content = []
+        used_metadata = []
 
-        for doc in relevant_docs[:3]:
-            if doc.metadata["source"] not in sources:
-                sources.append(doc.metadata["source"])
-                content.append(doc.page_content)
+        for doc in relevant_docs[:5]:  # Limit to top 5 most relevant docs
+            source = doc.metadata["source"]
+            if source not in sources:
+                sources.append(source)
+                content.append(doc.page_content)
+
+                # Find corresponding metadata
+                for meta in metadata_list:
+                    if meta["source"] == source:
+                        used_metadata.append(meta)
+                        break
+
+        result = {
             "answer": "\n\n".join(content),
-            "sources": sources
+            "sources": sources,
+            "metadata": used_metadata
         }
+
+        # Cache the result
+        self._set_cached_result(cache_key, result)
+
+        return result
 
     async def search(self, query: str) -> Dict[str, Any]:
         """Main search interface."""
@@ -129,5 +274,6 @@ class SearchEngine:
         except Exception as e:
             return {
                 "answer": f"An error occurred: {str(e)}",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }
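
The improved content extraction above funnels page HTML through the regex-based _clean_html() helper added in this diff. A small illustrative sketch of its behaviour follows; it calls the internal helper directly purely for demonstration, and the import path is an assumption:

from engines.search import SearchEngine  # assumed package layout

html = (
    "<html><head><style>p { color: red }</style></head>"
    "<body><script>alert('x')</script><!-- nav markup -->"
    "<article><p>First paragraph.</p><p>Second    paragraph.</p></article>"
    "</body></html>"
)

engine = SearchEngine()
print(engine._clean_html(html))
# -> "First paragraph. Second paragraph."
# Script/style blocks, HTML comments, and remaining tags are stripped with
# re.sub, then runs of whitespace collapse to single spaces.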
osint_engine.py DELETED
@@ -1,489 +0,0 @@
-import os
-import re
-import json
-import time
-import asyncio
-import aiohttp
-import requests
-import httpx
-from PIL import Image
-from io import BytesIO
-from typing import Dict, List, Any, Union, Optional
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
-from geopy.geocoders import Nominatim
-from waybackpy import WaybackMachineCDXServerAPI
-import whois
-from datetime import datetime
-from googlesearch import search as google_search
-import base64
-import io
-
-class OSINTEngine:
-    """OSINT capabilities for advanced information gathering"""
-
-    def __init__(self):
-        self.chrome_options = Options()
-        self.chrome_options.add_argument('--headless')
-        self.chrome_options.add_argument('--no-sandbox')
-        self.chrome_options.add_argument('--disable-dev-shm-usage')
-        self.setup_apis()
-        self.session = None
-        self.platforms = {
-            "twitter": "https://twitter.com/{}",
-            "instagram": "https://instagram.com/{}",
-            "facebook": "https://facebook.com/{}",
-            "linkedin": "https://linkedin.com/in/{}",
-            "github": "https://github.com/{}",
-            "reddit": "https://reddit.com/user/{}",
-            "youtube": "https://youtube.com/@{}",
-            "tiktok": "https://tiktok.com/@{}",
-            "pinterest": "https://pinterest.com/{}",
-            "snapchat": "https://snapchat.com/add/{}",
-            "twitch": "https://twitch.tv/{}",
-            "medium": "https://medium.com/@{}",
-            "devto": "https://dev.to/{}",
-            "stackoverflow": "https://stackoverflow.com/users/{}"
-        }
-
-    def setup_apis(self):
-        """Initialize API clients"""
-        self.geolocator = Nominatim(user_agent="intelligent_search")
-        self.http_client = httpx.AsyncClient()
-
-    async def initialize(self):
-        if not self.session:
-            self.session = aiohttp.ClientSession()
-
-    async def close(self):
-        if self.session:
-            await self.session.close()
-            self.session = None
-
-    async def search_username(self, username: str) -> Dict[str, Any]:
-        """Search for username across multiple platforms"""
-        results = {
-            'platforms': [],
-            'social_media': {},
-            'websites': []
-        }
-
-        # Common social media platforms
-        platforms = [
-            {'name': 'GitHub', 'url': f'https://github.com/{username}'},
-            {'name': 'Twitter', 'url': f'https://twitter.com/{username}'},
-            {'name': 'Instagram', 'url': f'https://instagram.com/{username}'},
-            {'name': 'LinkedIn', 'url': f'https://linkedin.com/in/{username}'},
-            {'name': 'Facebook', 'url': f'https://facebook.com/{username}'},
-            {'name': 'YouTube', 'url': f'https://youtube.com/@{username}'},
-        ]
-
-        async with aiohttp.ClientSession() as session:
-            tasks = []
-            for platform in platforms:
-                task = self.check_profile(session, platform['url'], platform['name'])
-                tasks.append(task)
-
-            platform_results = await asyncio.gather(*tasks)
-            results['platforms'] = [r for r in platform_results if r is not None]
-
-        # Google search for additional mentions
-        try:
-            search_query = f'"{username}" OR "@{username}" -site:twitter.com -site:facebook.com -site:instagram.com'
-            web_results = list(google_search(search_query, num_results=5))
-            results['websites'] = web_results
-        except Exception as e:
-            results['websites'] = [str(e)]
-
-        return results
-
-    async def check_profile(self, session, url: str, platform: str) -> Dict[str, str]:
-        """Check if a profile exists on a platform"""
-        try:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    return {
-                        'platform': platform,
-                        'url': url,
-                        'exists': True
-                    }
-        except:
-            pass
-        return None
-
-    async def check_username(self, username: str, platform: str = "all") -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        platforms_to_check = [platform] if platform != "all" else self.platforms.keys()
-
-        for platform_name in platforms_to_check:
-            if platform_name in self.platforms:
-                url = self.platforms[platform_name].format(username)
-                try:
-                    async with self.session.get(url) as response:
-                        exists = response.status == 200
-                        results.append({
-                            "platform": platform_name,
-                            "url": url,
-                            "exists": exists
-                        })
-                except:
-                    results.append({
-                        "platform": platform_name,
-                        "url": url,
-                        "exists": False,
-                        "error": "Connection failed"
-                    })
-
-        return results
-
-    async def search_image(self, image_url: str) -> Dict[str, Any]:
-        """Image analysis and reverse search"""
-        results = {
-            'analysis': {},
-            'similar_images': [],
-            'error': None
-        }
-
-        try:
-            # Download and analyze image
-            response = requests.get(image_url)
-            img = Image.open(BytesIO(response.content))
-
-            # Basic image analysis
-            results['analysis'] = {
-                'format': img.format,
-                'size': img.size,
-                'mode': img.mode
-            }
-
-            # Perform reverse image search using Google Lens
-            search_url = f"https://lens.google.com/uploadbyurl?url={image_url}"
-            results['similar_images'].append({
-                'source': 'Google Lens',
-                'url': search_url
-            })
-
-        except Exception as e:
-            results['error'] = str(e)
-
-        return results
-
-    async def gather_personal_info(self, data: Dict[str, str]) -> Dict[str, Any]:
-        """Gather personal information from various sources"""
-        results = {}
-
-        if 'location' in data:
-            results['location'] = await self.analyze_location(data['location'])
-
-        if 'domain' in data:
-            results['domain'] = self.analyze_domain(data['domain'])
-
-        return results
-
-    async def analyze_location(self, location: str) -> Dict[str, Any]:
-        """Analyze location information"""
-        try:
-            location_data = self.geolocator.geocode(location)
-            if location_data:
-                return {
-                    'address': location_data.address,
-                    'latitude': location_data.latitude,
-                    'longitude': location_data.longitude,
-                    'raw': location_data.raw
-                }
-        except Exception as e:
-            return {'error': str(e)}
-        return None
-
-    def analyze_domain(self, domain: str) -> Dict[str, Any]:
-        """Analyze domain information"""
-        try:
-            domain_info = whois.whois(domain)
-            return {
-                'registrar': domain_info.registrar,
-                'creation_date': domain_info.creation_date,
-                'expiration_date': domain_info.expiration_date,
-                'last_updated': domain_info.updated_date,
-                'status': domain_info.status
-            }
-        except Exception as e:
-            return {'error': str(e)}
-
-    async def search_historical_data(self, url: str) -> List[Dict[str, Any]]:
-        """Search for historical data using Wayback Machine"""
-        results = []
-
-        try:
-            user_agent = "Mozilla/5.0"
-            cdx = WaybackMachineCDXServerAPI(url, user_agent)
-
-            for snapshot in cdx.snapshots():
-                results.append({
-                    'timestamp': snapshot.timestamp,
-                    'url': snapshot.archive_url,
-                    'status': snapshot.status_code,
-                    'mime_type': snapshot.mime_type
-                })
-
-        except Exception as e:
-            results.append({'error': str(e)})
-
-        return results
-
-    async def search_person(self, name: str, location: Optional[str] = None) -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        # Format search query
-        query = f"{name}"
-        if location:
-            query += f" {location}"
-
-        # Simulate searching various sources
-        sources = ["social_media", "news", "public_records", "professional"]
-
-        for source in sources:
-            # Simulate different data sources
-            if source == "social_media":
-                profile = {
-                    "name": name,
-                    "location": location,
-                    "source": "Social Media",
-                    "profile_image": "https://example.com/profile.jpg",
-                    "social_links": [
-                        {"platform": "LinkedIn", "url": f"https://linkedin.com/in/{name.lower().replace(' ', '-')}"},
-                        {"platform": "Twitter", "url": f"https://twitter.com/{name.lower().replace(' ', '')}"}
-                    ],
-                    "occupation": "Professional",
-                    "last_seen": datetime.now().strftime("%Y-%m-%d")
-                }
-                results.append(profile)
-
-            elif source == "news":
-                news = {
-                    "name": name,
-                    "source": "News Articles",
-                    "mentions": [
-                        {
-                            "title": f"Article about {name}",
-                            "url": "https://example.com/news",
-                            "date": "2023-01-01"
-                        }
-                    ]
-                }
-                results.append(news)
-
-            elif source == "public_records":
-                record = {
-                    "name": name,
-                    "source": "Public Records",
-                    "location": location,
-                    "age_range": "25-35",
-                    "possible_relatives": ["Jane Doe", "John Doe Sr."],
-                    "previous_locations": ["New York, NY", "Los Angeles, CA"]
-                }
-                results.append(record)
-
-            elif source == "professional":
-                prof = {
-                    "name": name,
-                    "source": "Professional Records",
-                    "education": ["University Example"],
-                    "work_history": ["Company A", "Company B"],
-                    "skills": ["Leadership", "Management"]
-                }
-                results.append(prof)
-
-        return results
-
-    async def get_person_details(self, person_id: str) -> Dict:
-        """Get detailed information about a specific person"""
-        await self.initialize()
-
-        # Simulate gathering detailed information
-        details = {
-            "personal": {
-                "name": person_id,
-                "age_range": "25-35",
-                "locations": ["Current City, Country", "Previous City, Country"],
-                "education": ["University Name", "High School Name"],
-                "occupation": "Current Occupation"
-            },
-            "social_media": {
-                "profiles": [
-                    {
-                        "platform": "LinkedIn",
-                        "url": f"https://linkedin.com/in/{person_id}",
-                        "last_active": "2023-01-01"
-                    },
-                    {
-                        "platform": "Twitter",
-                        "url": f"https://twitter.com/{person_id}",
-                        "last_active": "2023-01-01"
-                    }
-                ]
-            },
-            "contact": {
-                "email_pattern": "j***@example.com",
-                "phone_pattern": "+1 (***) ***-**89"
-            },
-            "images": [
-                {
-                    "url": "https://example.com/profile1.jpg",
-                    "source": "LinkedIn",
-                    "date": "2023-01-01"
-                }
-            ],
-            "activities": {
-                "recent_posts": [
-                    {
-                        "platform": "Twitter",
-                        "content": "Example post content",
-                        "date": "2023-01-01"
-                    }
-                ],
-                "mentions": [
-                    {
-                        "source": "News Article",
-                        "title": "Article Title",
-                        "url": "https://example.com/article",
-                        "date": "2023-01-01"
-                    }
-                ]
-            }
-        }
-
-        return details
-
-    async def analyze_image(self, image_path: str) -> Dict:
-        """Analyze an image and return information about it"""
-        try:
-            # Open and analyze the image
-            img = Image.open(image_path if os.path.exists(image_path) else io.BytesIO(requests.get(image_path).content))
-
-            analysis = {
-                "format": img.format,
-                "size": f"{img.size[0]}x{img.size[1]}",
-                "mode": img.mode,
-                "metadata": {},
-            }
-
-            # Extract EXIF data if available
-            if hasattr(img, '_getexif') and img._getexif():
-                exif = img._getexif()
-                if exif:
-                    analysis["metadata"] = {
-                        "datetime": exif.get(306, "Unknown"),
-                        "make": exif.get(271, "Unknown"),
-                        "model": exif.get(272, "Unknown"),
-                        "software": exif.get(305, "Unknown")
-                    }
-
-            return analysis
-        except Exception as e:
-            return {"error": str(e)}
-
-    async def find_similar_images(self, image_url: str) -> List[Dict]:
-        """Find similar images"""
-        # Simulate finding similar images
-        return [
-            {
-                "url": "https://example.com/similar1.jpg",
-                "similarity": 0.95,
-                "source": "Website A"
-            },
-            {
-                "url": "https://example.com/similar2.jpg",
-                "similarity": 0.85,
-                "source": "Website B"
-            }
-        ]
-
-    async def get_location_info(self, location: str) -> Dict:
-        """Get information about a location"""
-        # Simulate location information retrieval
-        return {
-            "name": location,
-            "coordinates": {"lat": 40.7128, "lng": -74.0060},
-            "country": "United States",
-            "timezone": "America/New_York",
-            "population": "8.4 million",
-            "weather": "Sunny, 72°F"
-        }
-
-    async def get_domain_info(self, domain: str) -> Dict:
-        """Get information about a domain"""
-        # Simulate domain information retrieval
-        return {
-            "domain": domain,
-            "registrar": "Example Registrar",
-            "creation_date": "2020-01-01",
-            "expiration_date": "2024-01-01",
-            "nameservers": ["ns1.example.com", "ns2.example.com"],
-            "ip_address": "192.0.2.1",
-            "location": "United States"
-        }
-
-# Helper function to create document from gathered information
-def create_report(data: Dict[str, Any], template: str = "default") -> str:
-    """Create a formatted report from gathered information"""
-    if template == "default":
-        report = "# OSINT Investigation Report\n\n"
-        report += f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-
-        for section, content in data.items():
-            report += f"## {section.title()}\n"
-            if isinstance(content, dict):
-                for key, value in content.items():
-                    report += f"* {key}: {value}\n"
-            elif isinstance(content, list):
-                for item in content:
-                    if isinstance(item, dict):
-                        for k, v in item.items():
-                            report += f"* {k}: {v}\n"
-                    else:
-                        report += f"* {item}\n"
-            else:
-                report += f"{content}\n"
-            report += "\n"
-
-        return report
-    else:
-        raise ValueError(f"Template '{template}' not found")
-
-async def create_report_from_data(data: Dict) -> Dict:
-    """Create a formatted report from the gathered data"""
-    engine = OSINTEngine()
-
-    try:
-        report = {}
-
-        if "username" in data:
-            report["platforms"] = await engine.check_username(data["username"], data.get("platform", "all"))
-
-        if "image_url" in data:
-            report["analysis"] = await engine.analyze_image(data["image_url"])
-            report["similar_images"] = await engine.find_similar_images(data["image_url"])
-
-        if "location" in data:
-            report["location"] = await engine.get_location_info(data["location"])
-
-        if "domain" in data:
-            report["domain"] = await engine.get_domain_info(data["domain"])
-
-        if "name" in data:
-            report["matches"] = await engine.search_person(data["name"], data.get("location"))
-
-        if "person_id" in data:
-            report["details"] = await engine.get_person_details(data["person_id"])
-
-        await engine.close()
-        return report
-
-    except Exception as e:
-        await engine.close()
-        return {"error": str(e)}
packages.txt DELETED
@@ -1,25 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1
-libglib2.0-0
-libnss3
-libnspr4
-libatk1.0-0
-libatk-bridge2.0-0
-libcups2
-libdrm2
-libdbus-1-3
-libxkbcommon0
-libxcomposite1
-libxdamage1
-libxfixes3
-libxrandr2
-libgbm1
-libpango-1.0-0
-libcairo2
-libasound2
requirements.txt CHANGED
@@ -1,42 +1,39 @@
-# Core dependencies
-langchain==0.0.335
-pydantic==1.10.13
-numpy>=1.23.5
-pandas>=2.0.2
-tqdm>=4.65.0
-
-# Web and Networking
-requests==2.31.0
-aiohttp==3.8.5
-httpx==0.24.1
-beautifulsoup4==4.12.2
-selenium==4.15.2
-webdriver-manager==4.0.1
-googlesearch-python==1.2.3
-duckduckgo-search==3.8.5
+# Core Dependencies
+python-dotenv>=1.0.0
+langchain>=0.0.200
+transformers>=4.30.2
+sentence-transformers>=2.2.2
+faiss-cpu>=1.7.4
+torch>=2.0.1 --index-url https://download.pytorch.org/whl/cpu
+accelerate>=0.21.0
 
-# ML and AI
---extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.0.1+cpu
-torchvision==0.15.2+cpu
-transformers==4.31.0
-sentence-transformers==2.2.2
+# Web Scraping & Search
+duckduckgo-search>=3.8.3
+beautifulsoup4>=4.12.2
+requests>=2.31.0
+google>=3.0.0
+tenacity>=8.2.2
+aiohttp>=3.8.5
+httpx>=0.24.1
 
-# UI
-gradio==3.40.1
+# Image Processing
+Pillow>=10.0.0
+face-recognition>=1.3.0
+opencv-python-headless>=4.8.0
 
 # OSINT Tools
-python-whois==0.8.0
-geopy==2.4.1
-socid-extractor==1.0.0
-holehe==1.61
-sherlock-project==0.14.3
+holehe>=1.61
+sherlock-project>=0.14.0
+python-whois>=0.8.0
+geopy>=2.3.0
 
-# Image Processing
-Pillow==10.0.0
-face-recognition==1.3.0
+# UI
+gradio>=3.40.1
+markdown>=3.4.3
 
 # Utilities
-python-dotenv==1.0.0
-tenacity==8.2.3
-retry==0.9.2
+python-dateutil>=2.8.2
+tqdm>=4.65.0
+validators>=0.20.0
+urllib3>=2.0.4
+certifi>=2023.7.22
search_engine.py DELETED
@@ -1,219 +0,0 @@
-from typing import Dict, List, Any
-import requests
-from bs4 import BeautifulSoup
-from duckduckgo_search import ddg
-from transformers import pipeline
-from langchain.embeddings import HuggingFaceEmbeddings
-import time
-import json
-import os
-from urllib.parse import urlparse
-import asyncio
-
-class ModelManager:
-    """Manages AI models for text processing"""
-    def __init__(self):
-        # Initialize with smaller, CPU-friendly models
-        self.summarizer = pipeline(
-            "summarization",
-            model="facebook/bart-base",
-            device=-1  # Use CPU
-        )
-        self.embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-
-    def generate_summary(self, text: str, max_length: int = 150) -> str:
-        """Generate a concise summary of the text"""
-        if not text or len(text.split()) < 50:
-            return text
-
-        try:
-            summary = self.summarizer(
-                text,
-                max_length=max_length,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-            return summary
-        except Exception as e:
-            print(f"Error in summarization: {e}")
-            return text[:500] + "..."
-
-class ContentProcessor:
-    """Processes and analyzes different types of content"""
-    def __init__(self):
-        self.model_manager = ModelManager()
-
-    def process_content(self, content: str) -> Dict[str, Any]:
-        """Process content and generate insights"""
-        if not content:
-            return {"summary": "", "insights": []}
-
-        try:
-            summary = self.model_manager.generate_summary(content)
-            return {
-                "summary": summary,
-                "insights": []  # Simplified for CPU deployment
-            }
-        except Exception as e:
-            print(f"Error processing content: {e}")
-            return {"summary": content[:500] + "...", "insights": []}
-
-class OSINTEngine:
-    """Main OSINT engine class"""
-    def __init__(self):
-        from osint_engine import OSINTEngine as ExternalOSINT
-        self.engine = ExternalOSINT()
-
-    async def search_username(self, query: str) -> Dict[str, Any]:
-        """Search for usernames"""
-        return await self.engine.search_username(query)
-
-    async def search_image(self, query: str) -> Dict[str, Any]:
-        """Search for images"""
-        return await self.engine.search_image(query)
-
-    async def search_social_media(self, query: str, platform: str) -> Dict[str, Any]:
-        """Search for social media profiles"""
-        results = await self.engine.search_username(query)
-        if platform:
-            return {platform: [r for r in results.get('platforms', []) if r['platform'].lower() == platform.lower()]}
-        return results
-
-    async def gather_personal_info(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Gather personal information"""
-        return await self.engine.gather_personal_info(kwargs)
-
-    async def search_historical_data(self, query: str) -> Dict[str, Any]:
-        """Search for historical data"""
-        return await self.engine.search_historical_data(query)
-
-class WebSearchEngine:
-    """Main search engine class"""
-    def __init__(self):
-        self.processor = ContentProcessor()
-        self.session = requests.Session()
-        self.request_delay = 1.0
-        self.last_request_time = 0
-        self.osint_engine = OSINTEngine()  # Add OSINT engine
-
-    def is_valid_url(self, url: str) -> bool:
-        """Check if URL is valid for crawling"""
-        try:
-            parsed = urlparse(url)
-            return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
-        except:
-            return False
-
-    def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract metadata from page"""
-        metadata = {}
-
-        # Get title
-        title = soup.find('title')
-        if title:
-            metadata['title'] = title.text.strip()
-
-        # Get meta description
-        desc = soup.find('meta', attrs={'name': 'description'})
-        if desc:
-            metadata['description'] = desc.get('content', '')
-
-        # Get publication date
-        date = soup.find('meta', attrs={'property': 'article:published_time'})
-        if date:
-            metadata['published_date'] = date.get('content', '').split('T')[0]
-
-        return metadata
-
-    def process_url(self, url: str) -> Dict[str, Any]:
-        """Process a single URL"""
-        if not self.is_valid_url(url):
-            return None
-
-        try:
-            # Rate limiting
-            current_time = time.time()
-            if current_time - self.last_request_time < self.request_delay:
-                time.sleep(self.request_delay)
-
-            response = self.session.get(url, timeout=10)
-            self.last_request_time = time.time()
-
-            if response.status_code != 200:
-                return None
-
-            soup = BeautifulSoup(response.text, 'lxml')
-            metadata = self.get_metadata(soup)
-
-            # Extract main content (simplified)
-            content = ' '.join([p.text for p in soup.find_all('p')])
-            processed = self.processor.process_content(content)
-
-            return {
-                'url': url,
-                'title': metadata.get('title', url),
-                'summary': processed['summary'],
-                'published_date': metadata.get('published_date', '')
-            }
-
-        except Exception as e:
-            print(f"Error processing URL {url}: {e}")
-            return None
-
-    def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-        """Perform search and process results"""
-        try:
-            # Perform DuckDuckGo search
-            search_results = ddg(query, max_results=max_results)
-
-            results = []
-            for result in search_results:
-                processed = self.process_url(result['link'])
-                if processed:
-                    results.append(processed)
-
-            return results[:max_results]
-
-        except Exception as e:
-            print(f"Error in search: {e}")
-            return []
-
-    async def advanced_search(self, query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-        """Perform advanced search based on type"""
-        results = {}
-
-        try:
-            if search_type == "web":
-                results["web"] = self.search(query, kwargs.get("max_results", 5))
-            elif search_type == "username":
-                results["osint"] = await self.osint_engine.search_username(query)
-            elif search_type == "image":
-                results["image"] = await self.osint_engine.search_image(query)
-            elif search_type == "social":
-                results["social"] = await self.osint_engine.search_social_media(
-                    query,
-                    kwargs.get("platform")
-                )
-            elif search_type == "personal":
-                results["personal"] = await self.osint_engine.gather_personal_info(kwargs)
-            elif search_type == "historical":
-                results["historical"] = await self.osint_engine.search_historical_data(query)
-
-        except Exception as e:
-            results["error"] = str(e)
-
-        return results
-
-# Main search function
-def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-    """Main search function"""
-    engine = WebSearchEngine()
-    return engine.search(query, max_results)
-
-# Main advanced search function
-async def advanced_search(query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-    """Main advanced search function"""
-    engine = WebSearchEngine()
-    return await engine.advanced_search(query, search_type, **kwargs)
space.yml DELETED
@@ -1,11 +0,0 @@
-title: Intelligent Search Engine
-emoji: 🔍
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.14.0
-python_version: "3.10"
-app_file: app.py
-app_port: 7860
-pinned: false
-license: apache-2.0