"""
Common helper functions for the search engine.
"""
from typing import Dict, Any, List, Optional
import re
from datetime import datetime
import hashlib
import json

def clean_text(text: str) -> str:
    """Clean and normalize text content."""
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    
    # Remove special characters
    text = re.sub(r"[^\w\s.,!?-]", "", text)
    
    return text.strip()

def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract basic entities from text."""
    entities = {
        "emails": [],
        "phones": [],
        "urls": [],
        "dates": []
    }
    
    # Extract emails
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    entities["emails"] = re.findall(email_pattern, text)
    
    # Extract phone numbers (loose heuristic; may also match unrelated digit runs)
    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"
    entities["phones"] = re.findall(phone_pattern, text)
    
    # Extract URLs
    url_pattern = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
    entities["urls"] = re.findall(url_pattern, text)
    
    # Extract dates
    date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
    entities["dates"] = re.findall(date_pattern, text)
    
    return entities

def generate_hash(data: Any) -> str:
    """Generate a hash for data deduplication."""
    if isinstance(data, (dict, list)):
        data = json.dumps(data, sort_keys=True)
    elif not isinstance(data, str):
        data = str(data)
    
    # MD5 is used only as a fast fingerprint for deduplication, not for security.
    return hashlib.md5(data.encode()).hexdigest()

def format_date(date_str: str) -> Optional[str]:
    """Format date string to consistent format."""
    date_formats = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y"
    ]
    
    for fmt in date_formats:
        try:
            date_obj = datetime.strptime(date_str, fmt)
            return date_obj.strftime("%Y-%m-%d")
        except ValueError:
            continue
    
    return None

def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Extract first, middle, and last names; missing parts are None."""
    parts = full_name.strip().split()
    
    if len(parts) == 1:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": None
        }
    elif len(parts) == 2:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": parts[1]
        }
    else:
        return {
            "first_name": parts[0],
            "middle_name": " ".join(parts[1:-1]),
            "last_name": parts[-1]
        }

def generate_username_variants(name: str) -> List[str]:
    """Generate possible username variants from a name."""
    name = name.lower()
    parts = name.split()
    variants = []
    
    if len(parts) >= 2:
        first, last = parts[0], parts[-1]
        variants.extend([
            first + last,
            first + "_" + last,
            first + "." + last,
            first[0] + last,
            first + last[0],
            last + first,
            last + "_" + first,
            last + "." + first
        ])
    
    if len(parts) == 1:
        variants.extend([
            parts[0],
            parts[0] + "123",
            "the" + parts[0],
            "real" + parts[0]
        ])
    
    # De-duplicate while preserving insertion order
    return list(dict.fromkeys(variants))

def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate simple text similarity score."""
    # Convert to sets of words
    set1 = set(text1.lower().split())
    set2 = set(text2.lower().split())
    
    # Calculate Jaccard similarity
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    
    return intersection / union if union > 0 else 0.0

def extract_social_links(text: str) -> List[Dict[str, str]]:
    """Extract social media profile links from text."""
    social_patterns = {
        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)"
    }
    
    results = []
    for platform, pattern in social_patterns.items():
        matches = re.finditer(pattern, text)
        for match in matches:
            results.append({
                "platform": platform,
                "username": match.group(1),
                "url": match.group(0)
            })
    
    return results
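
# --- Usage sketch (illustrative only) ---
# A minimal demonstration of the helpers above. The sample strings below are
# hypothetical inputs, not data from the original module.
if __name__ == "__main__":
    sample = "Contact Jane Doe at jane.doe@example.com or +1 555-123-4567 on 12/05/2023."
    print(clean_text("  Hello   world!  "))
    print(extract_entities(sample))
    print(generate_hash({"name": "Jane Doe"}))
    print(format_date("12/05/2023"))
    print(extract_name_parts("Jane Ann Doe"))
    print(generate_username_variants("Jane Doe"))
    print(calculate_text_similarity("open source search", "open search engine"))
    print(extract_social_links("Profile: https://github.com/janedoe"))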