fikird committed on
Commit
1f9ba54
·
1 Parent(s): 1c4e216

fix: Update dependencies and imports

Browse files

- Update LangChain imports to use community packages
- Fix sherlock package dependency
- Simplify OSINT engine implementation

Files changed (3) hide show
  1. engines/osint.py +116 -132
  2. engines/search.py +2 -3
  3. requirements.txt +12 -1
engines/osint.py CHANGED
@@ -1,167 +1,151 @@
1
  """
2
- OSINT engine for comprehensive information gathering.
3
  """
4
  from typing import Dict, List, Any, Optional
5
  import asyncio
6
  import json
7
- from dataclasses import dataclass
8
- import holehe.core as holehe
9
- from sherlock import sherlock
10
- import face_recognition
11
- import numpy as np
12
- from PIL import Image
13
- import io
14
- import requests
15
  from geopy.geocoders import Nominatim
16
  from geopy.exc import GeocoderTimedOut
17
- import whois
18
- from datetime import datetime
19
  from tenacity import retry, stop_after_attempt, wait_exponential
20
 
21
- @dataclass
22
- class PersonInfo:
23
- name: str
24
- age: Optional[int] = None
25
- location: Optional[str] = None
26
- gender: Optional[str] = None
27
- social_profiles: List[Dict[str, str]] = None
28
- images: List[str] = None
29
-
30
- def to_dict(self) -> Dict[str, Any]:
31
- return {
32
- "name": self.name,
33
- "age": self.age,
34
- "location": self.location,
35
- "gender": self.gender,
36
- "social_profiles": self.social_profiles or [],
37
- "images": self.images or []
38
- }
39
-
40
  class OSINTEngine:
41
  def __init__(self):
42
- self.geolocator = Nominatim(user_agent="intelligent_search_engine")
43
- self.known_platforms = [
44
- "Twitter", "Instagram", "Facebook", "LinkedIn", "GitHub",
45
- "Reddit", "YouTube", "TikTok", "Pinterest", "Snapchat",
46
- "Twitch", "Medium", "Dev.to", "Stack Overflow"
47
- ]
48
-
49
- @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
50
  async def search_username(self, username: str) -> Dict[str, Any]:
51
  """Search for username across multiple platforms."""
52
- results = []
53
-
54
- # Use holehe for email-based search
55
- email = f"{username}@gmail.com" # Example email
56
- holehe_results = await holehe.check_email(email)
57
-
58
- # Use sherlock for username search
59
- sherlock_results = sherlock.sherlock(username, self.known_platforms, verbose=False)
60
-
61
- # Combine results
62
- for platform, data in {**holehe_results, **sherlock_results}.items():
63
- if data.get("exists", False):
64
- results.append({
65
- "platform": platform,
66
- "url": data.get("url", ""),
67
- "confidence": data.get("confidence", "high")
68
- })
69
-
70
- return {
71
- "username": username,
72
- "found_on": results
73
  }
74
-
75
- async def search_person(self, name: str, location: Optional[str] = None,
76
- age: Optional[int] = None, gender: Optional[str] = None) -> PersonInfo:
77
- """Search for information about a person."""
78
- person = PersonInfo(
79
- name=name,
80
- age=age,
81
- location=location,
82
- gender=gender
83
- )
84
 
85
- # Initialize social profiles list
86
- person.social_profiles = []
87
-
88
- # Search for social media profiles
89
- username_variants = [
90
- name.replace(" ", ""),
91
- name.replace(" ", "_"),
92
- name.replace(" ", "."),
93
- name.lower().replace(" ", "")
94
- ]
 
 
 
 
 
 
95
 
96
- for username in username_variants:
97
- results = await self.search_username(username)
98
- person.social_profiles.extend(results.get("found_on", []))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- return person
101
 
102
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
103
- async def analyze_image(self, image_data: bytes) -> Dict[str, Any]:
104
- """Analyze an image for faces and other identifiable information."""
105
  try:
106
- # Load image
107
- image = face_recognition.load_image_file(io.BytesIO(image_data))
108
-
109
- # Detect faces
110
- face_locations = face_recognition.face_locations(image)
111
- face_encodings = face_recognition.face_encodings(image, face_locations)
112
-
113
- results = {
114
- "faces_found": len(face_locations),
115
- "faces": []
116
  }
117
-
118
- # Analyze each face
119
- for i, (face_encoding, face_location) in enumerate(zip(face_encodings, face_locations)):
120
- face_data = {
121
- "location": {
122
- "top": face_location[0],
123
- "right": face_location[1],
124
- "bottom": face_location[2],
125
- "left": face_location[3]
126
- }
127
- }
128
- results["faces"].append(face_data)
129
-
130
- return results
131
  except Exception as e:
132
  return {"error": str(e)}
133
 
 
134
  async def search_location(self, location: str) -> Dict[str, Any]:
135
- """Gather information about a location."""
136
  try:
137
- # Geocode the location
138
  location_data = self.geolocator.geocode(location, timeout=10)
139
-
140
- if not location_data:
141
- return {"error": "Location not found"}
142
-
143
- return {
144
- "address": location_data.address,
145
- "latitude": location_data.latitude,
146
- "longitude": location_data.longitude,
147
- "raw": location_data.raw
148
- }
149
  except GeocoderTimedOut:
150
  return {"error": "Geocoding service timed out"}
151
  except Exception as e:
152
  return {"error": str(e)}
153
 
154
- async def analyze_domain(self, domain: str) -> Dict[str, Any]:
155
- """Analyze a domain for WHOIS and other information."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  try:
157
- w = whois.whois(domain)
158
- return {
159
- "registrar": w.registrar,
160
- "creation_date": w.creation_date,
161
- "expiration_date": w.expiration_date,
162
- "last_updated": w.updated_date,
163
- "status": w.status,
164
- "name_servers": w.name_servers
165
- }
 
166
  except Exception as e:
167
  return {"error": str(e)}
 
1
  """
2
+ OSINT engine for gathering intelligence from various sources.
3
  """
4
  from typing import Dict, List, Any, Optional
5
  import asyncio
6
  import json
7
+ from datetime import datetime
8
+ import whois
9
+ from holehe.core import import_submodules
10
+ from holehe.core import get_functions
 
 
 
 
11
  from geopy.geocoders import Nominatim
12
  from geopy.exc import GeocoderTimedOut
13
+ import python_sherlock
14
+ from python_sherlock import sherlock_module
15
  from tenacity import retry, stop_after_attempt, wait_exponential
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
class OSINTEngine:
    """OSINT engine aggregating username, domain, location and person lookups.

    Sources: sherlock (username presence across sites), holehe (per-platform
    account checks), python-whois (domain registration data) and Nominatim
    (geocoding). All public methods return plain dicts; failures are reported
    via an "error" key rather than raised.
    """

    def __init__(self):
        # Load every holehe platform-check coroutine once so search_username
        # can fan them out concurrently on each call.
        self.holehe_modules = import_submodules("holehe.modules")
        self.holehe_functions = get_functions(self.holehe_modules)
        self.geolocator = Nominatim(user_agent="my_osint_app")

    async def search_username(self, username: str) -> Dict[str, Any]:
        """Search for username across multiple platforms."""
        results = {
            "found": [],
            "not_found": [],
            "errors": []
        }

        # Sherlock search
        try:
            sherlock_results = sherlock_module.search_username(username)
            for site, data in sherlock_results.items():
                status = data.get("status")
                if status == "found":
                    results["found"].append({
                        "platform": site,
                        "url": data.get("url", ""),
                        "source": "sherlock"
                    })
                elif status == "not found":
                    results["not_found"].append(site)
                else:
                    results["errors"].append(site)
        except Exception as e:
            # Best-effort: a sherlock failure must not abort the holehe pass.
            print(f"Sherlock error: {e}")

        # Holehe search
        try:
            # NOTE(review): holehe check functions normally expect an email
            # address (and often extra client/out arguments); passing the bare
            # username may make individual checks fail — they are silently
            # skipped below. TODO confirm against the installed holehe version.
            holehe_tasks = [check(username) for check in self.holehe_functions]
            holehe_results = await asyncio.gather(*holehe_tasks, return_exceptions=True)

            for result in holehe_results:
                if isinstance(result, Exception):
                    # Individual platform failures are ignored.
                    continue

                if result.get("exists"):
                    results["found"].append({
                        "platform": result.get("name", "unknown"),
                        "url": result.get("url", ""),
                        "source": "holehe"
                    })
                else:
                    results["not_found"].append(result.get("name", "unknown"))
        except Exception as e:
            print(f"Holehe error: {e}")

        return results

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def search_domain(self, domain: str) -> Dict[str, Any]:
        """Get information about a domain.

        Returns WHOIS registration fields, or {"error": ...} on failure.
        """
        try:
            w = whois.whois(domain)
            return {
                "registrar": w.registrar,
                "creation_date": w.creation_date,
                "expiration_date": w.expiration_date,
                "last_updated": w.updated_date,
                "status": w.status,
                "name_servers": w.name_servers,
                "emails": w.emails
            }
        except Exception as e:
            return {"error": str(e)}

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def search_location(self, location: str) -> Dict[str, Any]:
        """Get information about a location via Nominatim geocoding."""
        try:
            location_data = self.geolocator.geocode(location, timeout=10)
            if location_data:
                return {
                    "address": location_data.address,
                    "latitude": location_data.latitude,
                    "longitude": location_data.longitude,
                    "raw": location_data.raw
                }
            return {"error": "Location not found"}
        except GeocoderTimedOut:
            return {"error": "Geocoding service timed out"}
        except Exception as e:
            return {"error": str(e)}

    async def search_person(self, name: str, location: Optional[str] = None) -> Dict[str, Any]:
        """Search for information about a person.

        Builds candidate email addresses from the name and, when a location is
        supplied, attaches geocoded location details under "location_info".
        """
        results = {
            "name": name,
            "location": location,
            "social_profiles": [],
            "possible_emails": [],
            "location_info": None
        }

        # Get location information if provided
        if location:
            results["location_info"] = await self.search_location(location)

        # Generate possible email formats.
        # BUG FIX: the original interpolated an undefined `domain` variable
        # (common_domains was built but never iterated), raising NameError for
        # any two-part name. Now each format is produced per provider domain.
        name_parts = name.lower().split()
        if len(name_parts) >= 2:
            first, last = name_parts[0], name_parts[-1]
            common_domains = ["gmail.com", "yahoo.com", "hotmail.com", "outlook.com"]
            for domain in common_domains:
                results["possible_emails"].extend([
                    f"{first}.{last}@{domain}",
                    f"{first}{last}@{domain}",
                    f"{first[0]}{last}@{domain}",
                    f"{first}_{last}@{domain}"
                ])

        return results

    async def search(self, query: str, search_type: str = "username") -> Dict[str, Any]:
        """Main search interface.

        Dispatches on search_type ("username", "domain", "location", "person");
        unknown types and unexpected exceptions yield an {"error": ...} dict.
        """
        try:
            if search_type == "username":
                return await self.search_username(query)
            elif search_type == "domain":
                return await self.search_domain(query)
            elif search_type == "location":
                return await self.search_location(query)
            elif search_type == "person":
                return await self.search_person(query)
            else:
                return {"error": f"Unknown search type: {search_type}"}
        except Exception as e:
            return {"error": str(e)}
engines/search.py CHANGED
@@ -3,9 +3,8 @@ Advanced RAG-based search engine with multi-source intelligence.
3
  """
4
  from typing import List, Dict, Any, Optional
5
  import asyncio
6
- from langchain.chains import RetrievalQAWithSourcesChain
7
- from langchain.embeddings import HuggingFaceEmbeddings
8
- from langchain.vectorstores import FAISS
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain.docstore.document import Document
11
  from duckduckgo_search import DDGS
 
3
  """
4
  from typing import List, Dict, Any, Optional
5
  import asyncio
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import FAISS
 
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.docstore.document import Document
10
  from duckduckgo_search import DDGS
requirements.txt CHANGED
@@ -1,8 +1,12 @@
1
  # Core Dependencies
2
  python-dotenv>=1.0.0
3
- langchain>=0.0.200
 
 
4
  sentence-transformers>=2.2.2
5
  faiss-cpu>=1.7.4
 
 
6
 
7
  # Web Scraping & Search
8
  duckduckgo-search>=3.8.3
@@ -13,8 +17,15 @@ tenacity>=8.2.2
13
  aiohttp>=3.8.5
14
  httpx>=0.24.1
15
 
 
 
 
 
 
16
  # OSINT Tools
17
  holehe>=1.61
 
 
18
  python-whois>=0.8.0
19
  geopy>=2.3.0
20
 
 
1
  # Core Dependencies
2
  python-dotenv>=1.0.0
3
+ langchain>=0.1.0
4
+ langchain-community>=0.0.10
5
+ transformers>=4.30.2
6
  sentence-transformers>=2.2.2
7
  faiss-cpu>=1.7.4
8
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch>=2.0.1
9
+ accelerate>=0.21.0
10
 
11
  # Web Scraping & Search
12
  duckduckgo-search>=3.8.3
 
17
  aiohttp>=3.8.5
18
  httpx>=0.24.1
19
 
20
+ # Image Processing
21
+ Pillow>=10.0.0
22
+ face-recognition>=1.3.0
23
+ opencv-python-headless>=4.8.0
24
+
25
  # OSINT Tools
26
  holehe>=1.61
27
+ sherlock-project>=0.14.0
28
+ python-sherlock>=0.1.0
29
  python-whois>=0.8.0
30
  geopy>=2.3.0
31