"""
Common helper functions for the search engine.
"""
from typing import Dict, Any, List, Optional
import re
from datetime import datetime
import hashlib
import json

def clean_text(text: str) -> str:
    """Clean and normalize text content."""
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    
    # Remove special characters
    text = re.sub(r"[^\w\s.,!?-]", "", text)
    
    return text.strip()

def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract basic entities from text."""
    entities = {
        "emails": [],
        "phones": [],
        "urls": [],
        "dates": []
    }
    
    # Extract emails
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    entities["emails"] = re.findall(email_pattern, text)
    
    # Extract phone numbers (loose heuristic; may also match unrelated digit runs)
    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"
    entities["phones"] = re.findall(phone_pattern, text)
    
    # Extract URLs
    url_pattern = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
    entities["urls"] = re.findall(url_pattern, text)
    
    # Extract dates
    date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
    entities["dates"] = re.findall(date_pattern, text)
    
    return entities

def generate_hash(data: Any) -> str:
    """Generate a hash for data deduplication."""
    if isinstance(data, (dict, list)):
        data = json.dumps(data, sort_keys=True)
    elif not isinstance(data, str):
        data = str(data)
    
    # MD5 is used only as a fast fingerprint for deduplication, not for security.
    return hashlib.md5(data.encode()).hexdigest()

def format_date(date_str: str) -> Optional[str]:
    """Format date string to consistent format."""
    date_formats = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y"
    ]
    
    for fmt in date_formats:
        try:
            date_obj = datetime.strptime(date_str, fmt)
            return date_obj.strftime("%Y-%m-%d")
        except ValueError:
            continue
    
    return None

def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Extract first, middle, and last names; missing parts are None."""
    parts = full_name.strip().split()
    
    if len(parts) == 1:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": None
        }
    elif len(parts) == 2:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": parts[1]
        }
    else:
        return {
            "first_name": parts[0],
            "middle_name": " ".join(parts[1:-1]),
            "last_name": parts[-1]
        }

def generate_username_variants(name: str) -> List[str]:
    """Generate possible username variants from a name."""
    name = name.lower()
    parts = name.split()
    variants = []
    
    if len(parts) >= 2:
        first, last = parts[0], parts[-1]
        variants.extend([
            first + last,
            first + "_" + last,
            first + "." + last,
            first[0] + last,
            first + last[0],
            last + first,
            last + "_" + first,
            last + "." + first
        ])
    
    if len(parts) == 1:
        variants.extend([
            parts[0],
            parts[0] + "123",
            "the" + parts[0],
            "real" + parts[0]
        ])
    
    # De-duplicate while preserving insertion order
    return list(dict.fromkeys(variants))

def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate simple text similarity score."""
    # Convert to sets of words
    set1 = set(text1.lower().split())
    set2 = set(text2.lower().split())
    
    # Calculate Jaccard similarity
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    
    return intersection / union if union > 0 else 0.0

def extract_social_links(text: str) -> List[Dict[str, str]]:
    """Extract social media profile links from text."""
    social_patterns = {
        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)"
    }
    
    results = []
    for platform, pattern in social_patterns.items():
        matches = re.finditer(pattern, text)
        for match in matches:
            results.append({
                "platform": platform,
                "username": match.group(1),
                "url": match.group(0)
            })
    
    return results
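
# --- Usage sketch (illustrative only) ---
# A minimal demonstration of the helpers above. The sample strings below are
# hypothetical inputs, not data from the original module.
if __name__ == "__main__":
    sample = "Contact Jane Doe at jane.doe@example.com or +1 555-123-4567 on 12/05/2023."
    print(clean_text("  Hello   world!  "))
    print(extract_entities(sample))
    print(generate_hash({"name": "Jane Doe"}))
    print(format_date("12/05/2023"))
    print(extract_name_parts("Jane Ann Doe"))
    print(generate_username_variants("Jane Doe"))
    print(calculate_text_similarity("open source search", "open search engine"))
    print(extract_social_links("Profile: https://github.com/janedoe"))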