""" | |
Common helper functions for the search engine. | |
""" | |
from typing import Dict, Any, List, Optional | |
import re | |
from datetime import datetime | |
import hashlib | |
import json | |


def clean_text(text: str) -> str:
    """Clean and normalize text content."""
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    # Remove special characters
    text = re.sub(r"[^\w\s.,!?-]", "", text)
    return text.strip()
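
# Illustrative example (assumed input, shown as a comment rather than executed):
#   clean_text("  Hello,\tworld! ©2024 ")  ->  "Hello, world! 2024"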


def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract basic entities (emails, phone numbers, URLs, dates) from text."""
    entities = {
        "emails": [],
        "phones": [],
        "urls": [],
        "dates": []
    }
    # Extract email addresses
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    entities["emails"] = re.findall(email_pattern, text)
    # Extract phone numbers (loose pattern; may over-match long digit runs)
    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"
    entities["phones"] = re.findall(phone_pattern, text)
    # Extract URLs (scheme and host only; path segments are not captured)
    url_pattern = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
    entities["urls"] = re.findall(url_pattern, text)
    # Extract numeric dates such as 31/12/2023 or 12-31-23
    date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
    entities["dates"] = re.findall(date_pattern, text)
    return entities
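
# Illustrative example (assumed input): extract_entities("Mail a@b.co on 01/02/2023")
# would return {"emails": ["a@b.co"], "phones": [], "urls": [], "dates": ["01/02/2023"]}.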


def generate_hash(data: Any) -> str:
    """Generate a stable hash for data deduplication."""
    if isinstance(data, (dict, list)):
        # Serialize with sorted keys so equal dicts hash identically
        data = json.dumps(data, sort_keys=True)
    elif not isinstance(data, str):
        data = str(data)
    # MD5 is used only as a content fingerprint here, not for security
    return hashlib.md5(data.encode()).hexdigest()
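
# Illustrative property: key order does not affect the fingerprint, e.g.
#   generate_hash({"b": 1, "a": 2}) == generate_hash({"a": 2, "b": 1})  # True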


def format_date(date_str: str) -> Optional[str]:
    """Normalize a date string to ISO format (YYYY-MM-DD), or None if unparseable."""
    date_formats = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y"
    ]
    # Formats are tried in order, so ambiguous dates (e.g. 05/04/2023) are
    # interpreted day-first before month-first.
    for fmt in date_formats:
        try:
            date_obj = datetime.strptime(date_str, fmt)
            return date_obj.strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None
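
# Illustrative examples (assumed inputs):
#   format_date("31/12/2023") -> "2023-12-31"   (matches %d/%m/%Y)
#   format_date("12/31/2023") -> "2023-12-31"   (falls through to %m/%d/%Y)
#   format_date("yesterday")  -> None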


def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Extract first, middle, and last names."""
    parts = full_name.strip().split()
    if not parts:
        # Guard against empty or whitespace-only input
        return {
            "first_name": None,
            "middle_name": None,
            "last_name": None
        }
    if len(parts) == 1:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": None
        }
    elif len(parts) == 2:
        return {
            "first_name": parts[0],
            "middle_name": None,
            "last_name": parts[1]
        }
    else:
        return {
            "first_name": parts[0],
            "middle_name": " ".join(parts[1:-1]),
            "last_name": parts[-1]
        }
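
# Illustrative example (assumed input):
#   extract_name_parts("Jane Ann Marie Doe")
#   -> {"first_name": "Jane", "middle_name": "Ann Marie", "last_name": "Doe"}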


def generate_username_variants(name: str) -> List[str]:
    """Generate possible username variants from a name."""
    name = name.lower()
    parts = name.split()
    variants = []
    if len(parts) >= 2:
        first, last = parts[0], parts[-1]
        variants.extend([
            first + last,
            first + "_" + last,
            first + "." + last,
            first[0] + last,
            first + last[0],
            last + first,
            last + "_" + first,
            last + "." + first
        ])
    if len(parts) == 1:
        variants.extend([
            parts[0],
            parts[0] + "123",
            "the" + parts[0],
            "real" + parts[0]
        ])
    # Deduplicate; note that set() does not preserve insertion order
    return list(set(variants))
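
# Illustrative example (assumed input): generate_username_variants("Ada Lovelace")
# yields variants such as "adalovelace", "ada_lovelace", "ada.lovelace", and "alovelace".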


def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate a simple Jaccard similarity score between two texts."""
    # Convert each text to a set of lowercase words
    set1 = set(text1.lower().split())
    set2 = set(text2.lower().split())
    # Jaccard similarity: |intersection| / |union|
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union > 0 else 0.0
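
# Worked example (assumed inputs): "open source search" vs. "open search tools"
#   intersection = {"open", "search"} (2 words), union = 4 words -> score 0.5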


def extract_social_links(text: str) -> List[Dict[str, str]]:
    """Extract social media profile links from text."""
    social_patterns = {
        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)"
    }
    results = []
    for platform, pattern in social_patterns.items():
        matches = re.finditer(pattern, text)
        for match in matches:
            results.append({
                "platform": platform,
                "username": match.group(1),  # captured profile/username segment
                "url": match.group(0)        # full matched URL
            })
    return results
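

# --- Illustrative usage (hedged sketch, not part of the original module) ---
# A minimal demo of the helpers above on made-up sample text; run this file
# directly to print the results. The sample values below are assumptions for
# demonstration only.
if __name__ == "__main__":
    sample = (
        "Contact Jane Doe at jane.doe@example.com, "
        "see https://github.com/janedoe, last updated 31/12/2023."
    )
    print(clean_text(sample))
    print(extract_entities(sample))
    print(extract_social_links(sample))
    print(generate_hash({"name": "Jane Doe"}))
    print(format_date("31/12/2023"))
    print(extract_name_parts("Jane Ann Doe"))
    print(generate_username_variants("Jane Doe"))
    print(calculate_text_similarity("open source search", "open search tools"))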