"""Common helper functions for the search engine."""

from typing import Any, Dict, List, Optional
import re
from datetime import datetime
import hashlib
import json


def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Collapses all whitespace runs (tabs/newlines included) to single
    spaces and strips every character that is not a word character,
    whitespace, or light punctuation (``.,!?-``).
    """
    # Collapse whitespace first so the second pass sees single spaces.
    text = re.sub(r"\s+", " ", text)
    # Remove special characters, keeping word chars and basic punctuation.
    text = re.sub(r"[^\w\s.,!?-]", "", text)
    return text.strip()


def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract basic entities (emails, phones, URLs, dates) from text.

    Returns a dict with keys ``emails``, ``phones``, ``urls`` and
    ``dates``, each mapping to a (possibly empty) list of raw matches.
    NOTE: the phone pattern is intentionally permissive and may also
    match other digit groups (e.g. date fragments).
    """
    entities: Dict[str, List[str]] = {
        "emails": [],
        "phones": [],
        "urls": [],
        "dates": [],
    }

    # Extract emails
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    entities["emails"] = re.findall(email_pattern, text)

    # Extract phone numbers (loose international-ish format)
    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"
    entities["phones"] = re.findall(phone_pattern, text)

    # Extract URLs (http/https only)
    url_pattern = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
    entities["urls"] = re.findall(url_pattern, text)

    # Extract numeric dates like 12/05/2023 or 1-1-99
    date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
    entities["dates"] = re.findall(date_pattern, text)

    return entities


def generate_hash(data: Any) -> str:
    """Generate a stable hash string for data deduplication.

    Dicts and lists are canonicalized via ``json.dumps(sort_keys=True)``
    so key order does not affect the result; other non-strings fall back
    to ``str()``. MD5 is used as a fast, non-cryptographic fingerprint —
    do NOT use this for anything security-sensitive.
    """
    if isinstance(data, (dict, list)):
        data = json.dumps(data, sort_keys=True)
    elif not isinstance(data, str):
        data = str(data)
    return hashlib.md5(data.encode()).hexdigest()


def format_date(date_str: str) -> Optional[str]:
    """Normalize a date string to ISO ``YYYY-MM-DD`` format.

    Tries a fixed list of common formats in order (day-first formats are
    tried before month-first, so ambiguous dates like ``05/01/2024``
    parse as day/month/year). Returns ``None`` if no format matches.
    """
    date_formats = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y",
    ]

    for fmt in date_formats:
        try:
            date_obj = datetime.strptime(date_str, fmt)
            return date_obj.strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None


def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Split a full name into first, middle, and last components.

    Missing components are ``None`` (hence the Optional values): a single
    token is treated as a first name only; with three or more tokens,
    everything between the first and last token becomes the middle name.
    """
    parts = full_name.strip().split()

    if len(parts) == 1:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": None,
        }
    elif len(parts) == 2:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": parts[1],
        }
    else:
        return {
            "first_name": parts[0],
            "middle_name": " ".join(parts[1:-1]),
            "last_name": parts[-1],
        }


def generate_username_variants(name: str) -> List[str]:
    """Generate possible username variants from a person's name.

    For multi-part names, combines the first and last tokens with common
    separators in both orders; single-token names get suffix/prefix
    variants instead. The result is deduplicated while preserving
    generation order, so output is deterministic across runs.
    """
    name = name.lower()
    parts = name.split()
    variants: List[str] = []

    if len(parts) >= 2:
        first, last = parts[0], parts[-1]
        variants.extend([
            first + last,
            first + "_" + last,
            first + "." + last,
            first[0] + last,
            first + last[0],
            last + first,
            last + "_" + first,
            last + "." + first,
        ])

    if len(parts) == 1:
        variants.extend([
            parts[0],
            parts[0] + "123",
            "the" + parts[0],
            "real" + parts[0],
        ])

    # dict.fromkeys dedupes while keeping insertion order (unlike set(),
    # whose iteration order varies with hash randomization).
    return list(dict.fromkeys(variants))


def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate a simple Jaccard similarity score between two texts.

    Texts are lowercased and split on whitespace into word sets; the
    score is |intersection| / |union|, in [0.0, 1.0]. Two empty texts
    score 0.0 (empty union).
    """
    # Convert to sets of words
    set1 = set(text1.lower().split())
    set2 = set(text2.lower().split())

    # Calculate Jaccard similarity
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    return intersection / union if union > 0 else 0.0


def extract_social_links(text: str) -> List[Dict[str, str]]:
    """Extract social media profile links from text.

    Scans for Twitter, Facebook, Instagram, LinkedIn, and GitHub profile
    URLs. Each match yields a dict with ``platform``, ``username`` (the
    path component), and ``url`` (the full matched link).
    """
    social_patterns = {
        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)",
    }

    results = []
    for platform, pattern in social_patterns.items():
        matches = re.finditer(pattern, text)
        for match in matches:
            results.append({
                "platform": platform,
                "username": match.group(1),
                "url": match.group(0),
            })
    return results