"""
Common helper functions for the search engine.
"""
import hashlib
import json
import re
from datetime import datetime
from typing import Any, Dict, List, Optional

def clean_text(text: str) -> str:
"""Clean and normalize text content."""
# Remove extra whitespace
text = re.sub(r"\s+", " ", text)
    # Drop everything except word characters, whitespace and basic punctuation
    text = re.sub(r"[^\w\s.,!?-]", "", text)
return text.strip()
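
# Example: clean_text("  Hello,\n\tworld!  ") -> "Hello, world!"
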
def extract_entities(text: str) -> Dict[str, List[str]]:
"""Extract basic entities from text."""
entities = {
"emails": [],
"phones": [],
"urls": [],
"dates": []
}
# Extract emails
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
entities["emails"] = re.findall(email_pattern, text)
    # Extract phone numbers (deliberately loose pattern: it covers many
    # international formats but can also over-match runs of five or more digits)
    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"
    entities["phones"] = re.findall(phone_pattern, text)
    # Extract URLs (the character class admits path and query characters,
    # so full links such as https://example.com/path?q=1 are captured)
    url_pattern = r"https?://[^\s<>\"']+"
    entities["urls"] = re.findall(url_pattern, text)
# Extract dates
date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
entities["dates"] = re.findall(date_pattern, text)
return entities
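
# Example: extract_entities("Mail a@b.co, see https://b.co/x") finds one
# email ("a@b.co") and one URL ("https://b.co/x")
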
def generate_hash(data: Any) -> str:
    """Generate a stable hash for data deduplication (not for security)."""
    if isinstance(data, (dict, list)):
        # sort_keys makes the hash independent of dict insertion order
        data = json.dumps(data, sort_keys=True)
    elif not isinstance(data, str):
        data = str(data)
    # MD5 is acceptable here: the digest is only used as a deduplication key
    return hashlib.md5(data.encode()).hexdigest()
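
# Example: generate_hash({"b": 1, "a": 2}) == generate_hash({"a": 2, "b": 1})
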
def format_date(date_str: str) -> Optional[str]:
    """Normalize a date string to ISO format (YYYY-MM-DD); return None if unparseable."""
    # Formats are tried in order, so ambiguous dates such as "04/05/2023"
    # resolve day-first ("%d/%m/%Y" is listed before "%m/%d/%Y")
    date_formats = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y"
    ]
for fmt in date_formats:
try:
date_obj = datetime.strptime(date_str, fmt)
return date_obj.strftime("%Y-%m-%d")
except ValueError:
continue
return None
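
# Example: format_date("04/05/2023") -> "2023-05-04"; format_date("May 2023") -> None
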
def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Extract first, middle, and last names; missing parts are None."""
parts = full_name.strip().split()
if len(parts) == 1:
return {
"first_name": parts[0],
"middle_name": None,
"last_name": None
}
elif len(parts) == 2:
return {
"first_name": parts[0],
"middle_name": None,
"last_name": parts[1]
}
else:
return {
"first_name": parts[0],
"middle_name": " ".join(parts[1:-1]),
"last_name": parts[-1]
}
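
# Example: extract_name_parts("Ada King Lovelace")
# -> {"first_name": "Ada", "middle_name": "King", "last_name": "Lovelace"}
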
def generate_username_variants(name: str) -> List[str]:
"""Generate possible username variants from a name."""
name = name.lower()
parts = name.split()
variants = []
if len(parts) >= 2:
first, last = parts[0], parts[-1]
variants.extend([
first + last,
first + "_" + last,
first + "." + last,
first[0] + last,
first + last[0],
last + first,
last + "_" + first,
last + "." + first
])
if len(parts) == 1:
variants.extend([
parts[0],
parts[0] + "123",
"the" + parts[0],
"real" + parts[0]
])
    # dict.fromkeys deduplicates while keeping a deterministic insertion order
    return list(dict.fromkeys(variants))
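
# Example: generate_username_variants("Jane Doe") includes "janedoe",
# "jane_doe", "jane.doe", "jdoe" and "doe.jane"
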
def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate Jaccard similarity between the word sets of two texts."""
# Convert to sets of words
set1 = set(text1.lower().split())
set2 = set(text2.lower().split())
# Calculate Jaccard similarity
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
return intersection / union if union > 0 else 0.0
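
# Example: calculate_text_similarity("red green blue", "green blue yellow") -> 0.5
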
def extract_social_links(text: str) -> List[Dict[str, str]]:
"""Extract social media profile links from text."""
social_patterns = {
"twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
"facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
"instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
"linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
"github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)"
}
results = []
for platform, pattern in social_patterns.items():
matches = re.finditer(pattern, text)
for match in matches:
results.append({
"platform": platform,
"username": match.group(1),
"url": match.group(0)
})
return results
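
if __name__ == "__main__":
    # Minimal smoke test; the sample text below is illustrative only and
    # exercises each helper in turn.
    sample = "Reach Jane Doe at jane.doe@example.com or https://github.com/janedoe by 01/02/2023."
    print(clean_text(sample))
    print(extract_entities(sample))
    print(extract_social_links(sample))
    print(format_date("01/02/2023"))
    print(extract_name_parts("Jane Doe"))
    print(generate_username_variants("Jane Doe"))
    print(generate_hash({"name": "Jane Doe"}))
    print(calculate_text_similarity("open source intel", "open source search"))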