Spaces (status: Runtime error)

fikird committed
Commit: a3440c5
Parent(s): 48922fa

feat: Enhanced search engine with caching and metadata

- Added result caching with TTL (see the sketch after the file list)
- Improved content extraction
- Enhanced metadata collection
- Optimized dependencies
- Removed unnecessary files

Files changed:
- apt.txt +0 -8
- engines/search.py +159 -13
- osint_engine.py +0 -489
- packages.txt +0 -25
- requirements.txt +32 -35
- search_engine.py +0 -219
- space.yml +0 -11
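The headline change is the query-result cache with a 24-hour TTL that engines/search.py gains in the diff below. As a condensed, standalone sketch of that pattern (the ResultCache class name and the example query are illustrative and not part of the commit):

import hashlib
import json
from datetime import datetime, timedelta
from typing import Any, Dict, Optional, Tuple

class ResultCache:
    """Minimal in-memory result cache keyed by a hash of the query, with a TTL."""

    def __init__(self, ttl_hours: int = 24):
        self._store: Dict[str, Tuple[Dict[str, Any], datetime]] = {}
        self._ttl = timedelta(hours=ttl_hours)

    def key(self, query: str, **kwargs) -> str:
        # Deterministic key: sort the kwargs so equivalent queries hash identically.
        payload = json.dumps({"query": query, **kwargs}, sort_keys=True)
        return hashlib.md5(payload.encode()).hexdigest()

    def get(self, key: str) -> Optional[Dict[str, Any]]:
        if key in self._store:
            result, stamp = self._store[key]
            if datetime.now() - stamp < self._ttl:
                return result
            del self._store[key]  # drop the expired entry
        return None

    def set(self, key: str, result: Dict[str, Any]) -> None:
        self._store[key] = (result, datetime.now())

# Usage: check the cache before searching, store the result afterwards.
cache = ResultCache()
k = cache.key("example query", max_results=10)
if cache.get(k) is None:
    cache.set(k, {"answer": "...", "sources": []})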
apt.txt
DELETED
@@ -1,8 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1
engines/search.py
CHANGED
@@ -1,5 +1,5 @@
 """
-RAG-based search engine with
+Advanced RAG-based search engine with multi-source intelligence.
 """
 from typing import List, Dict, Any, Optional
 import asyncio
@@ -13,6 +13,12 @@ from googlesearch import search as gsearch
 import requests
 from bs4 import BeautifulSoup
 from tenacity import retry, stop_after_attempt, wait_exponential
+import json
+import time
+from datetime import datetime, timedelta
+import hashlib
+from urllib.parse import urlparse
+import re
 
 class SearchEngine:
     def __init__(self):
@@ -23,12 +29,42 @@ class SearchEngine:
             chunk_size=500,
             chunk_overlap=50
         )
+        self.cache = {}
+        self.cache_ttl = timedelta(hours=24)
+        self.search_delay = 2  # seconds between searches
+        self.last_search_time = datetime.min
+
+    def _get_cache_key(self, query: str, **kwargs) -> str:
+        """Generate cache key from query and kwargs."""
+        cache_data = {
+            "query": query,
+            **kwargs
+        }
+        return hashlib.md5(json.dumps(cache_data, sort_keys=True).encode()).hexdigest()
+
+    def _get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
+        """Get result from cache if valid."""
+        if cache_key in self.cache:
+            result, timestamp = self.cache[cache_key]
+            if datetime.now() - timestamp < self.cache_ttl:
+                return result
+            del self.cache[cache_key]
+        return None
+
+    def _set_cached_result(self, cache_key: str, result: Dict[str, Any]):
+        """Store result in cache."""
+        self.cache[cache_key] = (result, datetime.now())
 
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def search_web(self, query: str, max_results: int = 10) -> List[Dict[str, str]]:
         """Perform web search using multiple search engines."""
         results = []
 
+        # Respect rate limiting
+        time_since_last = datetime.now() - self.last_search_time
+        if time_since_last.total_seconds() < self.search_delay:
+            await asyncio.sleep(self.search_delay - time_since_last.total_seconds())
+
         # DuckDuckGo Search
         try:
             with DDGS() as ddgs:
@@ -44,8 +80,26 @@ class SearchEngine:
         except Exception as e:
             print(f"Google search error: {e}")
 
+        self.last_search_time = datetime.now()
         return results[:max_results]
 
+    def _clean_html(self, html: str) -> str:
+        """Clean HTML content."""
+        # Remove script and style elements
+        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)
+        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)
+
+        # Remove comments
+        html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
+
+        # Remove remaining tags
+        html = re.sub(r'<[^>]+>', ' ', html)
+
+        # Clean whitespace
+        html = re.sub(r'\s+', ' ', html).strip()
+
+        return html
+
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def fetch_content(self, url: str) -> Optional[str]:
         """Fetch and extract content from a webpage."""
@@ -56,25 +110,90 @@ class SearchEngine:
             response = requests.get(url, headers=headers, timeout=10)
             response.raise_for_status()
 
+            # Extract main content
             soup = BeautifulSoup(response.text, "html.parser")
 
             # Remove unwanted elements
-            for element in soup(["script", "style", "nav", "footer", "header"]):
+            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                 element.decompose()
 
-
+            # Try to find main content
+            main_content = None
+
+            # Look for article tag
+            if soup.find("article"):
+                main_content = soup.find("article")
+
+            # Look for main tag
+            elif soup.find("main"):
+                main_content = soup.find("main")
+
+            # Look for div with common content class names
+            elif soup.find("div", class_=re.compile(r"content|article|post|entry")):
+                main_content = soup.find("div", class_=re.compile(r"content|article|post|entry"))
+
+            # Use body if no main content found
+            if not main_content:
+                main_content = soup.body
+
+            # Extract text
+            if main_content:
+                text = self._clean_html(str(main_content))
+            else:
+                text = self._clean_html(response.text)
+
             return text
         except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None
 
+    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
+        """Extract metadata from webpage."""
+        metadata = {
+            "url": url,
+            "domain": urlparse(url).netloc,
+            "title": None,
+            "description": None,
+            "published_date": None,
+            "author": None,
+            "keywords": None
+        }
+
+        # Extract title
+        if soup.title:
+            metadata["title"] = soup.title.string
+
+        # Extract meta tags
+        for meta in soup.find_all("meta"):
+            name = meta.get("name", "").lower()
+            property = meta.get("property", "").lower()
+            content = meta.get("content")
+
+            if name == "description" or property == "og:description":
+                metadata["description"] = content
+            elif name == "author":
+                metadata["author"] = content
+            elif name == "keywords":
+                metadata["keywords"] = content
+            elif name in ["published_time", "article:published_time"]:
+                metadata["published_date"] = content
+
+        return metadata
+
     async def process_search_results(self, query: str) -> Dict[str, Any]:
         """Process search results and create a RAG-based answer."""
+        cache_key = self._get_cache_key(query)
+        cached_result = self._get_cached_result(cache_key)
+        if cached_result:
+            return cached_result
+
         # Perform web search
         search_results = await self.search_web(query)
 
         # Fetch content from search results
         documents = []
+        metadata_list = []
+
         for result in search_results:
             url = result.get("link")
             if not url:
@@ -84,17 +203,28 @@ class SearchEngine:
             if content:
                 # Split content into chunks
                 chunks = self.text_splitter.split_text(content)
+
+                # Store metadata
+                metadata = {
+                    "source": url,
+                    "title": result.get("title", url),
+                    **result
+                }
+                metadata_list.append(metadata)
+
+                # Create documents
                 for chunk in chunks:
                     doc = Document(
                         page_content=chunk,
-                        metadata=
+                        metadata=metadata
                     )
                     documents.append(doc)
 
         if not documents:
             return {
                 "answer": "I couldn't find any relevant information.",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }
 
         # Create vector store
@@ -109,18 +239,33 @@ class SearchEngine:
         # Get relevant documents
         relevant_docs = chain.retriever.get_relevant_documents(query)
 
-        #
+        # Extract unique sources and content
         sources = []
         content = []
+        used_metadata = []
 
-
-            if doc.metadata["source"] not in sources:
-                sources.append(doc.metadata["source"])
-                content.append(doc.page_content)
+        for doc in relevant_docs[:5]:  # Limit to top 5 most relevant docs
+            source = doc.metadata["source"]
+            if source not in sources:
+                sources.append(source)
+                content.append(doc.page_content)
+
+                # Find corresponding metadata
+                for meta in metadata_list:
+                    if meta["source"] == source:
+                        used_metadata.append(meta)
+                        break
 
-
+        result = {
             "answer": "\n\n".join(content),
-            "sources": sources
+            "sources": sources,
+            "metadata": used_metadata
         }
+
+        # Cache the result
+        self._set_cached_result(cache_key, result)
+
+        return result
 
     async def search(self, query: str) -> Dict[str, Any]:
         """Main search interface."""
@@ -129,5 +274,6 @@ class SearchEngine:
         except Exception as e:
             return {
                 "answer": f"An error occurred: {str(e)}",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }
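For orientation, the updated SearchEngine exposes a single async entry point. A hypothetical driver script (the file name and query string are illustrative, and the module's remaining dependencies such as DDGS, langchain and FAISS are assumed to be installed) would call it roughly like this:

# run_search.py - hypothetical driver for engines/search.py (illustrative only)
import asyncio

from engines.search import SearchEngine

async def main() -> None:
    engine = SearchEngine()
    result = await engine.search("open source RAG search engines")
    print(result["answer"][:500])
    print("sources:", result["sources"])
    print("metadata entries:", len(result.get("metadata", [])))

if __name__ == "__main__":
    asyncio.run(main())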
osint_engine.py
DELETED
@@ -1,489 +0,0 @@
-import os
-import re
-import json
-import time
-import asyncio
-import aiohttp
-import requests
-import httpx
-from PIL import Image
-from io import BytesIO
-from typing import Dict, List, Any, Union, Optional
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
-from geopy.geocoders import Nominatim
-from waybackpy import WaybackMachineCDXServerAPI
-import whois
-from datetime import datetime
-from googlesearch import search as google_search
-import base64
-import io
-
-class OSINTEngine:
-    """OSINT capabilities for advanced information gathering"""
-
-    def __init__(self):
-        self.chrome_options = Options()
-        self.chrome_options.add_argument('--headless')
-        self.chrome_options.add_argument('--no-sandbox')
-        self.chrome_options.add_argument('--disable-dev-shm-usage')
-        self.setup_apis()
-        self.session = None
-        self.platforms = {
-            "twitter": "https://twitter.com/{}",
-            "instagram": "https://instagram.com/{}",
-            "facebook": "https://facebook.com/{}",
-            "linkedin": "https://linkedin.com/in/{}",
-            "github": "https://github.com/{}",
-            "reddit": "https://reddit.com/user/{}",
-            "youtube": "https://youtube.com/@{}",
-            "tiktok": "https://tiktok.com/@{}",
-            "pinterest": "https://pinterest.com/{}",
-            "snapchat": "https://snapchat.com/add/{}",
-            "twitch": "https://twitch.tv/{}",
-            "medium": "https://medium.com/@{}",
-            "devto": "https://dev.to/{}",
-            "stackoverflow": "https://stackoverflow.com/users/{}"
-        }
-
-    def setup_apis(self):
-        """Initialize API clients"""
-        self.geolocator = Nominatim(user_agent="intelligent_search")
-        self.http_client = httpx.AsyncClient()
-
-    async def initialize(self):
-        if not self.session:
-            self.session = aiohttp.ClientSession()
-
-    async def close(self):
-        if self.session:
-            await self.session.close()
-            self.session = None
-
-    async def search_username(self, username: str) -> Dict[str, Any]:
-        """Search for username across multiple platforms"""
-        results = {
-            'platforms': [],
-            'social_media': {},
-            'websites': []
-        }
-
-        # Common social media platforms
-        platforms = [
-            {'name': 'GitHub', 'url': f'https://github.com/{username}'},
-            {'name': 'Twitter', 'url': f'https://twitter.com/{username}'},
-            {'name': 'Instagram', 'url': f'https://instagram.com/{username}'},
-            {'name': 'LinkedIn', 'url': f'https://linkedin.com/in/{username}'},
-            {'name': 'Facebook', 'url': f'https://facebook.com/{username}'},
-            {'name': 'YouTube', 'url': f'https://youtube.com/@{username}'},
-        ]
-
-        async with aiohttp.ClientSession() as session:
-            tasks = []
-            for platform in platforms:
-                task = self.check_profile(session, platform['url'], platform['name'])
-                tasks.append(task)
-
-            platform_results = await asyncio.gather(*tasks)
-            results['platforms'] = [r for r in platform_results if r is not None]
-
-        # Google search for additional mentions
-        try:
-            search_query = f'"{username}" OR "@{username}" -site:twitter.com -site:facebook.com -site:instagram.com'
-            web_results = list(google_search(search_query, num_results=5))
-            results['websites'] = web_results
-        except Exception as e:
-            results['websites'] = [str(e)]
-
-        return results
-
-    async def check_profile(self, session, url: str, platform: str) -> Dict[str, str]:
-        """Check if a profile exists on a platform"""
-        try:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    return {
-                        'platform': platform,
-                        'url': url,
-                        'exists': True
-                    }
-        except:
-            pass
-        return None
-
-    async def check_username(self, username: str, platform: str = "all") -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        platforms_to_check = [platform] if platform != "all" else self.platforms.keys()
-
-        for platform_name in platforms_to_check:
-            if platform_name in self.platforms:
-                url = self.platforms[platform_name].format(username)
-                try:
-                    async with self.session.get(url) as response:
-                        exists = response.status == 200
-                        results.append({
-                            "platform": platform_name,
-                            "url": url,
-                            "exists": exists
-                        })
-                except:
-                    results.append({
-                        "platform": platform_name,
-                        "url": url,
-                        "exists": False,
-                        "error": "Connection failed"
-                    })
-
-        return results
-
-    async def search_image(self, image_url: str) -> Dict[str, Any]:
-        """Image analysis and reverse search"""
-        results = {
-            'analysis': {},
-            'similar_images': [],
-            'error': None
-        }
-
-        try:
-            # Download and analyze image
-            response = requests.get(image_url)
-            img = Image.open(BytesIO(response.content))
-
-            # Basic image analysis
-            results['analysis'] = {
-                'format': img.format,
-                'size': img.size,
-                'mode': img.mode
-            }
-
-            # Perform reverse image search using Google Lens
-            search_url = f"https://lens.google.com/uploadbyurl?url={image_url}"
-            results['similar_images'].append({
-                'source': 'Google Lens',
-                'url': search_url
-            })
-
-        except Exception as e:
-            results['error'] = str(e)
-
-        return results
-
-    async def gather_personal_info(self, data: Dict[str, str]) -> Dict[str, Any]:
-        """Gather personal information from various sources"""
-        results = {}
-
-        if 'location' in data:
-            results['location'] = await self.analyze_location(data['location'])
-
-        if 'domain' in data:
-            results['domain'] = self.analyze_domain(data['domain'])
-
-        return results
-
-    async def analyze_location(self, location: str) -> Dict[str, Any]:
-        """Analyze location information"""
-        try:
-            location_data = self.geolocator.geocode(location)
-            if location_data:
-                return {
-                    'address': location_data.address,
-                    'latitude': location_data.latitude,
-                    'longitude': location_data.longitude,
-                    'raw': location_data.raw
-                }
-        except Exception as e:
-            return {'error': str(e)}
-        return None
-
-    def analyze_domain(self, domain: str) -> Dict[str, Any]:
-        """Analyze domain information"""
-        try:
-            domain_info = whois.whois(domain)
-            return {
-                'registrar': domain_info.registrar,
-                'creation_date': domain_info.creation_date,
-                'expiration_date': domain_info.expiration_date,
-                'last_updated': domain_info.updated_date,
-                'status': domain_info.status
-            }
-        except Exception as e:
-            return {'error': str(e)}
-
-    async def search_historical_data(self, url: str) -> List[Dict[str, Any]]:
-        """Search for historical data using Wayback Machine"""
-        results = []
-
-        try:
-            user_agent = "Mozilla/5.0"
-            cdx = WaybackMachineCDXServerAPI(url, user_agent)
-
-            for snapshot in cdx.snapshots():
-                results.append({
-                    'timestamp': snapshot.timestamp,
-                    'url': snapshot.archive_url,
-                    'status': snapshot.status_code,
-                    'mime_type': snapshot.mime_type
-                })
-
-        except Exception as e:
-            results.append({'error': str(e)})
-
-        return results
-
-    async def search_person(self, name: str, location: Optional[str] = None) -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        # Format search query
-        query = f"{name}"
-        if location:
-            query += f" {location}"
-
-        # Simulate searching various sources
-        sources = ["social_media", "news", "public_records", "professional"]
-
-        for source in sources:
-            # Simulate different data sources
-            if source == "social_media":
-                profile = {
-                    "name": name,
-                    "location": location,
-                    "source": "Social Media",
-                    "profile_image": "https://example.com/profile.jpg",
-                    "social_links": [
-                        {"platform": "LinkedIn", "url": f"https://linkedin.com/in/{name.lower().replace(' ', '-')}"},
-                        {"platform": "Twitter", "url": f"https://twitter.com/{name.lower().replace(' ', '')}"}
-                    ],
-                    "occupation": "Professional",
-                    "last_seen": datetime.now().strftime("%Y-%m-%d")
-                }
-                results.append(profile)
-
-            elif source == "news":
-                news = {
-                    "name": name,
-                    "source": "News Articles",
-                    "mentions": [
-                        {
-                            "title": f"Article about {name}",
-                            "url": "https://example.com/news",
-                            "date": "2023-01-01"
-                        }
-                    ]
-                }
-                results.append(news)
-
-            elif source == "public_records":
-                record = {
-                    "name": name,
-                    "source": "Public Records",
-                    "location": location,
-                    "age_range": "25-35",
-                    "possible_relatives": ["Jane Doe", "John Doe Sr."],
-                    "previous_locations": ["New York, NY", "Los Angeles, CA"]
-                }
-                results.append(record)
-
-            elif source == "professional":
-                prof = {
-                    "name": name,
-                    "source": "Professional Records",
-                    "education": ["University Example"],
-                    "work_history": ["Company A", "Company B"],
-                    "skills": ["Leadership", "Management"]
-                }
-                results.append(prof)
-
-        return results
-
-    async def get_person_details(self, person_id: str) -> Dict:
-        """Get detailed information about a specific person"""
-        await self.initialize()
-
-        # Simulate gathering detailed information
-        details = {
-            "personal": {
-                "name": person_id,
-                "age_range": "25-35",
-                "locations": ["Current City, Country", "Previous City, Country"],
-                "education": ["University Name", "High School Name"],
-                "occupation": "Current Occupation"
-            },
-            "social_media": {
-                "profiles": [
-                    {
-                        "platform": "LinkedIn",
-                        "url": f"https://linkedin.com/in/{person_id}",
-                        "last_active": "2023-01-01"
-                    },
-                    {
-                        "platform": "Twitter",
-                        "url": f"https://twitter.com/{person_id}",
-                        "last_active": "2023-01-01"
-                    }
-                ]
-            },
-            "contact": {
-                "email_pattern": "j***@example.com",
-                "phone_pattern": "+1 (***) ***-**89"
-            },
-            "images": [
-                {
-                    "url": "https://example.com/profile1.jpg",
-                    "source": "LinkedIn",
-                    "date": "2023-01-01"
-                }
-            ],
-            "activities": {
-                "recent_posts": [
-                    {
-                        "platform": "Twitter",
-                        "content": "Example post content",
-                        "date": "2023-01-01"
-                    }
-                ],
-                "mentions": [
-                    {
-                        "source": "News Article",
-                        "title": "Article Title",
-                        "url": "https://example.com/article",
-                        "date": "2023-01-01"
-                    }
-                ]
-            }
-        }
-
-        return details
-
-    async def analyze_image(self, image_path: str) -> Dict:
-        """Analyze an image and return information about it"""
-        try:
-            # Open and analyze the image
-            img = Image.open(image_path if os.path.exists(image_path) else io.BytesIO(requests.get(image_path).content))
-
-            analysis = {
-                "format": img.format,
-                "size": f"{img.size[0]}x{img.size[1]}",
-                "mode": img.mode,
-                "metadata": {},
-            }
-
-            # Extract EXIF data if available
-            if hasattr(img, '_getexif') and img._getexif():
-                exif = img._getexif()
-                if exif:
-                    analysis["metadata"] = {
-                        "datetime": exif.get(306, "Unknown"),
-                        "make": exif.get(271, "Unknown"),
-                        "model": exif.get(272, "Unknown"),
-                        "software": exif.get(305, "Unknown")
-                    }
-
-            return analysis
-        except Exception as e:
-            return {"error": str(e)}
-
-    async def find_similar_images(self, image_url: str) -> List[Dict]:
-        """Find similar images"""
-        # Simulate finding similar images
-        return [
-            {
-                "url": "https://example.com/similar1.jpg",
-                "similarity": 0.95,
-                "source": "Website A"
-            },
-            {
-                "url": "https://example.com/similar2.jpg",
-                "similarity": 0.85,
-                "source": "Website B"
-            }
-        ]
-
-    async def get_location_info(self, location: str) -> Dict:
-        """Get information about a location"""
-        # Simulate location information retrieval
-        return {
-            "name": location,
-            "coordinates": {"lat": 40.7128, "lng": -74.0060},
-            "country": "United States",
-            "timezone": "America/New_York",
-            "population": "8.4 million",
-            "weather": "Sunny, 72°F"
-        }
-
-    async def get_domain_info(self, domain: str) -> Dict:
-        """Get information about a domain"""
-        # Simulate domain information retrieval
-        return {
-            "domain": domain,
-            "registrar": "Example Registrar",
-            "creation_date": "2020-01-01",
-            "expiration_date": "2024-01-01",
-            "nameservers": ["ns1.example.com", "ns2.example.com"],
-            "ip_address": "192.0.2.1",
-            "location": "United States"
-        }
-
-# Helper function to create document from gathered information
-def create_report(data: Dict[str, Any], template: str = "default") -> str:
-    """Create a formatted report from gathered information"""
-    if template == "default":
-        report = "# OSINT Investigation Report\n\n"
-        report += f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-
-        for section, content in data.items():
-            report += f"## {section.title()}\n"
-            if isinstance(content, dict):
-                for key, value in content.items():
-                    report += f"* {key}: {value}\n"
-            elif isinstance(content, list):
-                for item in content:
-                    if isinstance(item, dict):
-                        for k, v in item.items():
-                            report += f"* {k}: {v}\n"
-                    else:
-                        report += f"* {item}\n"
-            else:
-                report += f"{content}\n"
-            report += "\n"
-
-        return report
-    else:
-        raise ValueError(f"Template '{template}' not found")
-
-async def create_report_from_data(data: Dict) -> Dict:
-    """Create a formatted report from the gathered data"""
-    engine = OSINTEngine()
-
-    try:
-        report = {}
-
-        if "username" in data:
-            report["platforms"] = await engine.check_username(data["username"], data.get("platform", "all"))
-
-        if "image_url" in data:
-            report["analysis"] = await engine.analyze_image(data["image_url"])
-            report["similar_images"] = await engine.find_similar_images(data["image_url"])
-
-        if "location" in data:
-            report["location"] = await engine.get_location_info(data["location"])
-
-        if "domain" in data:
-            report["domain"] = await engine.get_domain_info(data["domain"])
-
-        if "name" in data:
-            report["matches"] = await engine.search_person(data["name"], data.get("location"))
-
-        if "person_id" in data:
-            report["details"] = await engine.get_person_details(data["person_id"])
-
-        await engine.close()
-        return report
-
-    except Exception as e:
-        await engine.close()
-        return {"error": str(e)}
packages.txt
DELETED
@@ -1,25 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1
-libglib2.0-0
-libnss3
-libnspr4
-libatk1.0-0
-libatk-bridge2.0-0
-libcups2
-libdrm2
-libdbus-1-3
-libxkbcommon0
-libxcomposite1
-libxdamage1
-libxfixes3
-libxrandr2
-libgbm1
-libpango-1.0-0
-libcairo2
-libasound2
requirements.txt
CHANGED
@@ -1,42 +1,39 @@
-# Core
-
-
-
-
-
-
-
-requests==2.31.0
-aiohttp==3.8.5
-httpx==0.24.1
-beautifulsoup4==4.12.2
-selenium==4.15.2
-webdriver-manager==4.0.1
-googlesearch-python==1.2.3
-duckduckgo-search==3.8.5
+# Core Dependencies
+python-dotenv>=1.0.0
+langchain>=0.0.200
+transformers>=4.30.2
+sentence-transformers>=2.2.2
+faiss-cpu>=1.7.4
+torch>=2.0.1 --index-url https://download.pytorch.org/whl/cpu
+accelerate>=0.21.0
 
-#
-
-
-
-
-
+# Web Scraping & Search
+duckduckgo-search>=3.8.3
+beautifulsoup4>=4.12.2
+requests>=2.31.0
+google>=3.0.0
+tenacity>=8.2.2
+aiohttp>=3.8.5
+httpx>=0.24.1
 
-#
-
+# Image Processing
+Pillow>=10.0.0
+face-recognition>=1.3.0
+opencv-python-headless>=4.8.0
 
 # OSINT Tools
-
-
-
-
-sherlock-project==0.14.3
+holehe>=1.61
+sherlock-project>=0.14.0
+python-whois>=0.8.0
+geopy>=2.3.0
 
-#
-
-
+# UI
+gradio>=3.40.1
+markdown>=3.4.3
 
 # Utilities
-python-
-
-
+python-dateutil>=2.8.2
+tqdm>=4.65.0
+validators>=0.20.0
+urllib3>=2.0.4
+certifi>=2023.7.22
search_engine.py
DELETED
@@ -1,219 +0,0 @@
-from typing import Dict, List, Any
-import requests
-from bs4 import BeautifulSoup
-from duckduckgo_search import ddg
-from transformers import pipeline
-from langchain.embeddings import HuggingFaceEmbeddings
-import time
-import json
-import os
-from urllib.parse import urlparse
-import asyncio
-
-class ModelManager:
-    """Manages AI models for text processing"""
-    def __init__(self):
-        # Initialize with smaller, CPU-friendly models
-        self.summarizer = pipeline(
-            "summarization",
-            model="facebook/bart-base",
-            device=-1  # Use CPU
-        )
-        self.embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-
-    def generate_summary(self, text: str, max_length: int = 150) -> str:
-        """Generate a concise summary of the text"""
-        if not text or len(text.split()) < 50:
-            return text
-
-        try:
-            summary = self.summarizer(
-                text,
-                max_length=max_length,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-            return summary
-        except Exception as e:
-            print(f"Error in summarization: {e}")
-            return text[:500] + "..."
-
-class ContentProcessor:
-    """Processes and analyzes different types of content"""
-    def __init__(self):
-        self.model_manager = ModelManager()
-
-    def process_content(self, content: str) -> Dict[str, Any]:
-        """Process content and generate insights"""
-        if not content:
-            return {"summary": "", "insights": []}
-
-        try:
-            summary = self.model_manager.generate_summary(content)
-            return {
-                "summary": summary,
-                "insights": []  # Simplified for CPU deployment
-            }
-        except Exception as e:
-            print(f"Error processing content: {e}")
-            return {"summary": content[:500] + "...", "insights": []}
-
-class OSINTEngine:
-    """Main OSINT engine class"""
-    def __init__(self):
-        from osint_engine import OSINTEngine as ExternalOSINT
-        self.engine = ExternalOSINT()
-
-    async def search_username(self, query: str) -> Dict[str, Any]:
-        """Search for usernames"""
-        return await self.engine.search_username(query)
-
-    async def search_image(self, query: str) -> Dict[str, Any]:
-        """Search for images"""
-        return await self.engine.search_image(query)
-
-    async def search_social_media(self, query: str, platform: str) -> Dict[str, Any]:
-        """Search for social media profiles"""
-        results = await self.engine.search_username(query)
-        if platform:
-            return {platform: [r for r in results.get('platforms', []) if r['platform'].lower() == platform.lower()]}
-        return results
-
-    async def gather_personal_info(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Gather personal information"""
-        return await self.engine.gather_personal_info(kwargs)
-
-    async def search_historical_data(self, query: str) -> Dict[str, Any]:
-        """Search for historical data"""
-        return await self.engine.search_historical_data(query)
-
-class WebSearchEngine:
-    """Main search engine class"""
-    def __init__(self):
-        self.processor = ContentProcessor()
-        self.session = requests.Session()
-        self.request_delay = 1.0
-        self.last_request_time = 0
-        self.osint_engine = OSINTEngine()  # Add OSINT engine
-
-    def is_valid_url(self, url: str) -> bool:
-        """Check if URL is valid for crawling"""
-        try:
-            parsed = urlparse(url)
-            return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
-        except:
-            return False
-
-    def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract metadata from page"""
-        metadata = {}
-
-        # Get title
-        title = soup.find('title')
-        if title:
-            metadata['title'] = title.text.strip()
-
-        # Get meta description
-        desc = soup.find('meta', attrs={'name': 'description'})
-        if desc:
-            metadata['description'] = desc.get('content', '')
-
-        # Get publication date
-        date = soup.find('meta', attrs={'property': 'article:published_time'})
-        if date:
-            metadata['published_date'] = date.get('content', '').split('T')[0]
-
-        return metadata
-
-    def process_url(self, url: str) -> Dict[str, Any]:
-        """Process a single URL"""
-        if not self.is_valid_url(url):
-            return None
-
-        try:
-            # Rate limiting
-            current_time = time.time()
-            if current_time - self.last_request_time < self.request_delay:
-                time.sleep(self.request_delay)
-
-            response = self.session.get(url, timeout=10)
-            self.last_request_time = time.time()
-
-            if response.status_code != 200:
-                return None
-
-            soup = BeautifulSoup(response.text, 'lxml')
-            metadata = self.get_metadata(soup)
-
-            # Extract main content (simplified)
-            content = ' '.join([p.text for p in soup.find_all('p')])
-            processed = self.processor.process_content(content)
-
-            return {
-                'url': url,
-                'title': metadata.get('title', url),
-                'summary': processed['summary'],
-                'published_date': metadata.get('published_date', '')
-            }
-
-        except Exception as e:
-            print(f"Error processing URL {url}: {e}")
-            return None
-
-    def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-        """Perform search and process results"""
-        try:
-            # Perform DuckDuckGo search
-            search_results = ddg(query, max_results=max_results)
-
-            results = []
-            for result in search_results:
-                processed = self.process_url(result['link'])
-                if processed:
-                    results.append(processed)
-
-            return results[:max_results]
-
-        except Exception as e:
-            print(f"Error in search: {e}")
-            return []
-
-    async def advanced_search(self, query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-        """Perform advanced search based on type"""
-        results = {}
-
-        try:
-            if search_type == "web":
-                results["web"] = self.search(query, kwargs.get("max_results", 5))
-            elif search_type == "username":
-                results["osint"] = await self.osint_engine.search_username(query)
-            elif search_type == "image":
-                results["image"] = await self.osint_engine.search_image(query)
-            elif search_type == "social":
-                results["social"] = await self.osint_engine.search_social_media(
-                    query,
-                    kwargs.get("platform")
-                )
-            elif search_type == "personal":
-                results["personal"] = await self.osint_engine.gather_personal_info(kwargs)
-            elif search_type == "historical":
-                results["historical"] = await self.osint_engine.search_historical_data(query)
-
-        except Exception as e:
-            results["error"] = str(e)
-
-        return results
-
-# Main search function
-def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-    """Main search function"""
-    engine = WebSearchEngine()
-    return engine.search(query, max_results)
-
-# Main advanced search function
-async def advanced_search(query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-    """Main advanced search function"""
-    engine = WebSearchEngine()
-    return await engine.advanced_search(query, search_type, **kwargs)
space.yml
DELETED
@@ -1,11 +0,0 @@
-title: Intelligent Search Engine
-emoji: 🔍
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.14.0
-python_version: "3.10"
-app_file: app.py
-app_port: 7860
-pinned: false
-license: apache-2.0