Spaces:
Runtime error
Runtime error
fikird
committed on
Commit
·
1f9ba54
1
Parent(s):
1c4e216
fix: Update dependencies and imports
Browse files
- Update LangChain imports to use community packages
- Fix sherlock package dependency
- Simplify OSINT engine implementation
- engines/osint.py +116 -132
- engines/search.py +2 -3
- requirements.txt +12 -1
engines/osint.py
CHANGED
@@ -1,167 +1,151 @@
|
|
1 |
"""
|
2 |
-
OSINT engine for
|
3 |
"""
|
4 |
from typing import Dict, List, Any, Optional
|
5 |
import asyncio
|
6 |
import json
|
7 |
-
from
|
8 |
-
import
|
9 |
-
from
|
10 |
-
import
|
11 |
-
import numpy as np
|
12 |
-
from PIL import Image
|
13 |
-
import io
|
14 |
-
import requests
|
15 |
from geopy.geocoders import Nominatim
|
16 |
from geopy.exc import GeocoderTimedOut
|
17 |
-
import
|
18 |
-
from
|
19 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
20 |
|
21 |
-
@dataclass
|
22 |
-
class PersonInfo:
|
23 |
-
name: str
|
24 |
-
age: Optional[int] = None
|
25 |
-
location: Optional[str] = None
|
26 |
-
gender: Optional[str] = None
|
27 |
-
social_profiles: List[Dict[str, str]] = None
|
28 |
-
images: List[str] = None
|
29 |
-
|
30 |
-
def to_dict(self) -> Dict[str, Any]:
|
31 |
-
return {
|
32 |
-
"name": self.name,
|
33 |
-
"age": self.age,
|
34 |
-
"location": self.location,
|
35 |
-
"gender": self.gender,
|
36 |
-
"social_profiles": self.social_profiles or [],
|
37 |
-
"images": self.images or []
|
38 |
-
}
|
39 |
-
|
40 |
class OSINTEngine:
|
41 |
def __init__(self):
|
42 |
-
self.
|
43 |
-
self.
|
44 |
-
|
45 |
-
|
46 |
-
"Twitch", "Medium", "Dev.to", "Stack Overflow"
|
47 |
-
]
|
48 |
-
|
49 |
-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
50 |
async def search_username(self, username: str) -> Dict[str, Any]:
|
51 |
"""Search for username across multiple platforms."""
|
52 |
-
results =
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
holehe_results = await holehe.check_email(email)
|
57 |
-
|
58 |
-
# Use sherlock for username search
|
59 |
-
sherlock_results = sherlock.sherlock(username, self.known_platforms, verbose=False)
|
60 |
-
|
61 |
-
# Combine results
|
62 |
-
for platform, data in {**holehe_results, **sherlock_results}.items():
|
63 |
-
if data.get("exists", False):
|
64 |
-
results.append({
|
65 |
-
"platform": platform,
|
66 |
-
"url": data.get("url", ""),
|
67 |
-
"confidence": data.get("confidence", "high")
|
68 |
-
})
|
69 |
-
|
70 |
-
return {
|
71 |
-
"username": username,
|
72 |
-
"found_on": results
|
73 |
}
|
74 |
-
|
75 |
-
async def search_person(self, name: str, location: Optional[str] = None,
|
76 |
-
age: Optional[int] = None, gender: Optional[str] = None) -> PersonInfo:
|
77 |
-
"""Search for information about a person."""
|
78 |
-
person = PersonInfo(
|
79 |
-
name=name,
|
80 |
-
age=age,
|
81 |
-
location=location,
|
82 |
-
gender=gender
|
83 |
-
)
|
84 |
|
85 |
-
#
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
-
return
|
101 |
|
102 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
103 |
-
async def
|
104 |
-
"""
|
105 |
try:
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
"
|
115 |
-
"faces": []
|
116 |
}
|
117 |
-
|
118 |
-
# Analyze each face
|
119 |
-
for i, (face_encoding, face_location) in enumerate(zip(face_encodings, face_locations)):
|
120 |
-
face_data = {
|
121 |
-
"location": {
|
122 |
-
"top": face_location[0],
|
123 |
-
"right": face_location[1],
|
124 |
-
"bottom": face_location[2],
|
125 |
-
"left": face_location[3]
|
126 |
-
}
|
127 |
-
}
|
128 |
-
results["faces"].append(face_data)
|
129 |
-
|
130 |
-
return results
|
131 |
except Exception as e:
|
132 |
return {"error": str(e)}
|
133 |
|
|
|
134 |
async def search_location(self, location: str) -> Dict[str, Any]:
|
135 |
-
"""
|
136 |
try:
|
137 |
-
# Geocode the location
|
138 |
location_data = self.geolocator.geocode(location, timeout=10)
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
"raw": location_data.raw
|
148 |
-
}
|
149 |
except GeocoderTimedOut:
|
150 |
return {"error": "Geocoding service timed out"}
|
151 |
except Exception as e:
|
152 |
return {"error": str(e)}
|
153 |
|
154 |
-
async def
|
155 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
try:
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
166 |
except Exception as e:
|
167 |
return {"error": str(e)}
|
|
|
1 |
"""
|
2 |
+
OSINT engine for gathering intelligence from various sources.
|
3 |
"""
|
4 |
from typing import Dict, List, Any, Optional
|
5 |
import asyncio
|
6 |
import json
|
7 |
+
from datetime import datetime
|
8 |
+
import whois
|
9 |
+
from holehe.core import import_submodules
|
10 |
+
from holehe.core import get_functions
|
|
|
|
|
|
|
|
|
11 |
from geopy.geocoders import Nominatim
|
12 |
from geopy.exc import GeocoderTimedOut
|
13 |
+
import python_sherlock
|
14 |
+
from python_sherlock import sherlock_module
|
15 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
class OSINTEngine:
|
18 |
def __init__(self):
    """Prepare the holehe callables and the geocoder used by the searches."""
    # Load the holehe site modules once; search_username iterates the
    # callables extracted from them.
    loaded_modules = import_submodules("holehe.modules")
    self.holehe_modules = loaded_modules
    self.holehe_functions = get_functions(loaded_modules)

    # Geocoder consumed by search_location.
    self.geolocator = Nominatim(user_agent="my_osint_app")
|
22 |
+
|
|
|
|
|
|
|
|
|
23 |
async def search_username(self, username: str) -> Dict[str, Any]:
    """Search for username across multiple platforms.

    Sherlock is queried first, then every callable in
    ``self.holehe_functions`` is awaited concurrently. Both result sets are
    merged into one report. A failure of either back end as a whole is
    printed and does not prevent the other from running.

    Args:
        username: Identifier handed unchanged to both back ends.

    Returns:
        A dict with three lists:
        ``"found"`` - dicts with ``platform``, ``url`` and ``source``;
        ``"not_found"`` - platform names with no match;
        ``"errors"`` - platform names whose lookup failed, returned an
        unrecognised status, or returned something that is not a dict.
    """
    results = {
        "found": [],
        "not_found": [],
        "errors": []
    }

    # Sherlock search
    try:
        sherlock_results = sherlock_module.search_username(username)
        for site, data in sherlock_results.items():
            status = data.get("status")
            if status == "found":
                results["found"].append({
                    "platform": site,
                    "url": data.get("url", ""),
                    "source": "sherlock"
                })
            elif status == "not found":
                results["not_found"].append(site)
            else:
                results["errors"].append(site)
    except Exception as e:
        print(f"Sherlock error: {e}")

    # Holehe search
    # NOTE(review): holehe's site functions presumably expect an e-mail
    # address (and possibly extra arguments) rather than a bare username -
    # confirm against the holehe API before relying on these results.
    try:
        checkers = list(self.holehe_functions)
        holehe_results = await asyncio.gather(
            *(checker(username) for checker in checkers),
            return_exceptions=True,
        )

        # Pair each outcome with the callable that produced it so that a
        # failure can be reported by name instead of being dropped.
        for checker, result in zip(checkers, holehe_results):
            if not isinstance(result, dict):
                # Covers raised exceptions (returned by gather) as well as
                # checkers that return None or another non-dict value.
                results["errors"].append(getattr(checker, "__name__", "unknown"))
                continue

            if result.get("exists"):
                results["found"].append({
                    "platform": result.get("name", "unknown"),
                    "url": result.get("url", ""),
                    "source": "holehe"
                })
            else:
                results["not_found"].append(result.get("name", "unknown"))
    except Exception as e:
        print(f"Holehe error: {e}")

    return results
|
72 |
|
73 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def search_domain(self, domain: str) -> Dict[str, Any]:
    """Get information about a domain.

    Args:
        domain: Domain name passed to ``whois.whois``.

    Returns:
        On success, a dict with the WHOIS fields ``registrar``,
        ``creation_date``, ``expiration_date``, ``last_updated``,
        ``status``, ``name_servers`` and ``emails`` copied from the lookup
        result. On any failure, ``{"error": <message>}``.
    """
    try:
        # whois.whois is a plain synchronous call; run it in the default
        # executor so the event loop keeps servicing other coroutines.
        loop = asyncio.get_running_loop()
        w = await loop.run_in_executor(None, whois.whois, domain)
        return {
            "registrar": w.registrar,
            "creation_date": w.creation_date,
            "expiration_date": w.expiration_date,
            "last_updated": w.updated_date,
            "status": w.status,
            "name_servers": w.name_servers,
            "emails": w.emails
        }
    except Exception as e:
        # NOTE(review): every exception is converted into a return value
        # here, so the @retry decorator never observes a failure and never
        # re-attempts the lookup. Confirm whether retries are wanted.
        return {"error": str(e)}
|
89 |
|
90 |
+
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def search_location(self, location: str) -> Dict[str, Any]:
    """Get information about a location.

    Args:
        location: Free-form query passed to ``self.geolocator.geocode``.

    Returns:
        On a match, a dict with ``address``, ``latitude``, ``longitude``
        and ``raw`` taken from the geocoder result. Otherwise
        ``{"error": ...}`` describing why nothing was returned (no match,
        geocoder timeout, or any other exception message).
    """
    try:
        # geocode() is a plain synchronous call; run it in the default
        # executor so the event loop is not stalled for up to the 10 s
        # timeout.
        loop = asyncio.get_running_loop()
        location_data = await loop.run_in_executor(
            None, lambda: self.geolocator.geocode(location, timeout=10)
        )
        if location_data:
            return {
                "address": location_data.address,
                "latitude": location_data.latitude,
                "longitude": location_data.longitude,
                "raw": location_data.raw
            }
        return {"error": "Location not found"}
    except GeocoderTimedOut:
        # NOTE(review): timeouts (and all other exceptions below) are
        # turned into return values, so the @retry decorator never sees a
        # failure to retry. Confirm whether retries are wanted.
        return {"error": "Geocoding service timed out"}
    except Exception as e:
        return {"error": str(e)}
|
107 |
|
108 |
+
async def search_person(self, name: str, location: Optional[str] = None) -> Dict[str, Any]:
    """Search for information about a person.

    Args:
        name: Full name. The first and last whitespace-separated tokens are
            used (lower-cased) to derive candidate e-mail addresses.
        location: Optional place name. When given, it is resolved through
            ``self.search_location`` and stored under ``"location_info"``.

    Returns:
        A dict with ``name``, ``location``, ``social_profiles`` (always an
        empty list here), ``possible_emails`` and ``location_info``.
        ``possible_emails`` is empty when the name has fewer than two
        tokens. Otherwise it holds four address patterns for each of the
        common mail domains, grouped by domain.
    """
    results = {
        "name": name,
        "location": location,
        "social_profiles": [],
        "possible_emails": [],
        "location_info": None
    }

    # Get location information if provided
    if location:
        results["location_info"] = await self.search_location(location)

    # Generate possible email formats
    name_parts = name.lower().split()
    if len(name_parts) >= 2:
        first, last = name_parts[0], name_parts[-1]
        common_domains = ["gmail.com", "yahoo.com", "hotmail.com", "outlook.com"]
        email_formats = []
        # The patterns must be expanded per domain; building them outside
        # this loop leaves `domain` undefined and raises NameError.
        for domain in common_domains:
            email_formats.extend([
                f"{first}.{last}@{domain}",
                f"{first}{last}@{domain}",
                f"{first[0]}{last}@{domain}",
                f"{first}_{last}@{domain}",
            ])
        results["possible_emails"] = email_formats

    return results
|
136 |
+
|
137 |
+
async def search(self, query: str, search_type: str = "username") -> Dict[str, Any]:
    """Main search interface.

    Routes ``query`` to the engine method that matches ``search_type``
    (``"username"``, ``"domain"``, ``"location"`` or ``"person"``) and
    returns that method's result. An unrecognised type, or any exception
    raised while dispatching, yields ``{"error": <message>}``.
    """
    # (search_type value, name of the method that handles it)
    routes = (
        ("username", "search_username"),
        ("domain", "search_domain"),
        ("location", "search_location"),
        ("person", "search_person"),
    )
    try:
        for kind, method_name in routes:
            if search_type == kind:
                handler = getattr(self, method_name)
                return await handler(query)
        return {"error": f"Unknown search type: {search_type}"}
    except Exception as e:
        return {"error": str(e)}
|
engines/search.py
CHANGED
@@ -3,9 +3,8 @@ Advanced RAG-based search engine with multi-source intelligence.
|
|
3 |
"""
|
4 |
from typing import List, Dict, Any, Optional
|
5 |
import asyncio
|
6 |
-
from
|
7 |
-
from
|
8 |
-
from langchain.vectorstores import FAISS
|
9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
10 |
from langchain.docstore.document import Document
|
11 |
from duckduckgo_search import DDGS
|
|
|
3 |
"""
|
4 |
from typing import List, Dict, Any, Optional
|
5 |
import asyncio
|
6 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
7 |
+
from langchain_community.vectorstores import FAISS
|
|
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain.docstore.document import Document
|
10 |
from duckduckgo_search import DDGS
|
requirements.txt
CHANGED
@@ -1,8 +1,12 @@
|
|
1 |
# Core Dependencies
|
2 |
python-dotenv>=1.0.0
|
3 |
-
langchain>=0.0
|
|
|
|
|
4 |
sentence-transformers>=2.2.2
|
5 |
faiss-cpu>=1.7.4
|
|
|
|
|
6 |
|
7 |
# Web Scraping & Search
|
8 |
duckduckgo-search>=3.8.3
|
@@ -13,8 +17,15 @@ tenacity>=8.2.2
|
|
13 |
aiohttp>=3.8.5
|
14 |
httpx>=0.24.1
|
15 |
|
|
|
|
|
|
|
|
|
|
|
16 |
# OSINT Tools
|
17 |
holehe>=1.61
|
|
|
|
|
18 |
python-whois>=0.8.0
|
19 |
geopy>=2.3.0
|
20 |
|
|
|
1 |
# Core Dependencies
|
2 |
python-dotenv>=1.0.0
|
3 |
+
langchain>=0.1.0
|
4 |
+
langchain-community>=0.0.10
|
5 |
+
transformers>=4.30.2
|
6 |
sentence-transformers>=2.2.2
|
7 |
faiss-cpu>=1.7.4
|
8 |
+
torch>=2.0.1 --index-url https://download.pytorch.org/whl/cpu
|
9 |
+
accelerate>=0.21.0
|
10 |
|
11 |
# Web Scraping & Search
|
12 |
duckduckgo-search>=3.8.3
|
|
|
17 |
aiohttp>=3.8.5
|
18 |
httpx>=0.24.1
|
19 |
|
20 |
+
# Image Processing
|
21 |
+
Pillow>=10.0.0
|
22 |
+
face-recognition>=1.3.0
|
23 |
+
opencv-python-headless>=4.8.0
|
24 |
+
|
25 |
# OSINT Tools
|
26 |
holehe>=1.61
|
27 |
+
sherlock-project>=0.14.0
|
28 |
+
python-sherlock>=0.1.0
|
29 |
python-whois>=0.8.0
|
30 |
geopy>=2.3.0
|
31 |
|