"""Common helper functions for the search engine."""

from typing import Any, Dict, List, Optional
import re
from datetime import datetime
import hashlib
import json


def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Collapses all whitespace runs (tabs/newlines included) to single
    spaces and strips every character that is not a word character,
    whitespace, or light punctuation (``.,!?-``).
    """
    # Collapse whitespace first so the second pass sees single spaces.
    text = re.sub(r"\s+", " ", text)
    # Remove special characters, keeping word chars and basic punctuation.
    text = re.sub(r"[^\w\s.,!?-]", "", text)
    return text.strip()


def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract basic entities (emails, phones, URLs, dates) from text.

    Returns a dict with keys ``emails``, ``phones``, ``urls`` and
    ``dates``, each mapping to a (possibly empty) list of raw matches.
    NOTE: the phone pattern is intentionally permissive and may also
    match other digit groups (e.g. date fragments).
    """
    entities: Dict[str, List[str]] = {
        "emails": [],
        "phones": [],
        "urls": [],
        "dates": [],
    }

    # Extract emails
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    entities["emails"] = re.findall(email_pattern, text)

    # Extract phone numbers (loose international-ish format)
    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"
    entities["phones"] = re.findall(phone_pattern, text)

    # Extract URLs (http/https only)
    url_pattern = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
    entities["urls"] = re.findall(url_pattern, text)

    # Extract numeric dates like 12/05/2023 or 1-1-99
    date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
    entities["dates"] = re.findall(date_pattern, text)

    return entities


def generate_hash(data: Any) -> str:
    """Generate a stable hash string for data deduplication.

    Dicts and lists are canonicalized via ``json.dumps(sort_keys=True)``
    so key order does not affect the result; other non-strings fall back
    to ``str()``. MD5 is used as a fast, non-cryptographic fingerprint —
    do NOT use this for anything security-sensitive.
    """
    if isinstance(data, (dict, list)):
        data = json.dumps(data, sort_keys=True)
    elif not isinstance(data, str):
        data = str(data)
    return hashlib.md5(data.encode()).hexdigest()


def format_date(date_str: str) -> Optional[str]:
    """Normalize a date string to ISO ``YYYY-MM-DD`` format.

    Tries a fixed list of common formats in order (day-first formats are
    tried before month-first, so ambiguous dates like ``05/01/2024``
    parse as day/month/year). Returns ``None`` if no format matches.
    """
    date_formats = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y",
    ]

    for fmt in date_formats:
        try:
            date_obj = datetime.strptime(date_str, fmt)
            return date_obj.strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None


def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Split a full name into first, middle, and last components.

    Missing components are ``None`` (hence the Optional values): a single
    token is treated as a first name only; with three or more tokens,
    everything between the first and last token becomes the middle name.
    """
    parts = full_name.strip().split()

    if len(parts) == 1:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": None,
        }
    elif len(parts) == 2:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": parts[1],
        }
    else:
        return {
            "first_name": parts[0],
            "middle_name": " ".join(parts[1:-1]),
            "last_name": parts[-1],
        }


def generate_username_variants(name: str) -> List[str]:
    """Generate possible username variants from a person's name.

    For multi-part names, combines the first and last tokens with common
    separators in both orders; single-token names get suffix/prefix
    variants instead. The result is deduplicated while preserving
    generation order, so output is deterministic across runs.
    """
    name = name.lower()
    parts = name.split()
    variants: List[str] = []

    if len(parts) >= 2:
        first, last = parts[0], parts[-1]
        variants.extend([
            first + last,
            first + "_" + last,
            first + "." + last,
            first[0] + last,
            first + last[0],
            last + first,
            last + "_" + first,
            last + "." + first,
        ])

    if len(parts) == 1:
        variants.extend([
            parts[0],
            parts[0] + "123",
            "the" + parts[0],
            "real" + parts[0],
        ])

    # dict.fromkeys dedupes while keeping insertion order (unlike set(),
    # whose iteration order varies with hash randomization).
    return list(dict.fromkeys(variants))


def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate a simple Jaccard similarity score between two texts.

    Texts are lowercased and split on whitespace into word sets; the
    score is |intersection| / |union|, in [0.0, 1.0]. Two empty texts
    score 0.0 (empty union).
    """
    # Convert to sets of words
    set1 = set(text1.lower().split())
    set2 = set(text2.lower().split())

    # Calculate Jaccard similarity
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    return intersection / union if union > 0 else 0.0


def extract_social_links(text: str) -> List[Dict[str, str]]:
    """Extract social media profile links from text.

    Scans for Twitter, Facebook, Instagram, LinkedIn, and GitHub profile
    URLs. Each match yields a dict with ``platform``, ``username`` (the
    path component), and ``url`` (the full matched link).
    """
    social_patterns = {
        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)",
    }

    results = []
    for platform, pattern in social_patterns.items():
        matches = re.finditer(pattern, text)
        for match in matches:
            results.append({
                "platform": platform,
                "username": match.group(1),
                "url": match.group(0),
            })
    return results