Spaces:
Runtime error
Runtime error
File size: 4,641 Bytes
48922fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
"""
Common helper functions for the search engine.
"""
from typing import Dict, Any, List, Optional
import re
from datetime import datetime
import hashlib
import json
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Characters other than word characters, whitespace and the punctuation
    marks ``. , ! ? -`` are removed, every run of whitespace is collapsed to
    a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw text to normalize.

    Returns:
        The cleaned text.
    """
    # Strip disallowed characters *before* collapsing whitespace: removing a
    # symbol that sits between two spaces ("a @ b") would otherwise leave a
    # doubled space behind in the result.
    text = re.sub(r"[^\w\s.,!?-]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
def extract_entities(text: str) -> Dict[str, List[str]]:
    """Pull simple regex-detectable entities out of a piece of text.

    Args:
        text: The text to scan.

    Returns:
        A dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
        ``"dates"``, each mapped to the list of matches in order of
        appearance (an empty list when nothing matched).
    """
    # One pattern per entity kind; none of them contains a capturing group,
    # so re.findall yields the full matched substrings.
    patterns_by_kind = {
        "emails": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "phones": r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}",
        # Matches the scheme plus host-style characters only, so a match
        # stops at the first "/" after the host.
        "urls": r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+",
        # Numeric dates separated by "-" or "/", e.g. 12/05/2021.
        "dates": r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}",
    }
    return {
        kind: re.findall(regex, text)
        for kind, regex in patterns_by_kind.items()
    }
def generate_hash(data: Any) -> str:
    """Generate a hash for data deduplication.

    Dicts and lists are serialized to JSON with sorted keys so that two
    dicts with the same content hash identically regardless of key order.
    Any other non-string value is converted with ``str()``.

    Args:
        data: The value to fingerprint.

    Returns:
        The hexadecimal MD5 digest of the UTF-8 encoded representation.
    """
    if isinstance(data, (dict, list)):
        # default=str lets containers holding values JSON cannot encode
        # (datetime, set, Decimal, ...) be hashed instead of raising
        # TypeError; JSON-encodable input serializes exactly as before.
        data = json.dumps(data, sort_keys=True, default=str)
    elif not isinstance(data, str):
        data = str(data)
    # MD5 serves here as a deduplication fingerprint, not a security control.
    return hashlib.md5(data.encode("utf-8")).hexdigest()
def format_date(date_str: str) -> Optional[str]:
    """Format a date string to the consistent ``YYYY-MM-DD`` form.

    The formats are tried in a fixed order and the first one that parses
    wins, so an ambiguous value such as ``03/04/2021`` is read day-first.

    Args:
        date_str: The date text to parse; surrounding whitespace is ignored.

    Returns:
        The date as ``YYYY-MM-DD``, or ``None`` when the input is not a
        string or matches none of the supported formats.
    """
    # strptime raises TypeError (not ValueError) for non-strings; report them
    # as "unparseable" like any other bad input instead of crashing.
    if not isinstance(date_str, str):
        return None

    # Surrounding whitespace would make every format below fail to match.
    candidate = date_str.strip()

    date_formats = (
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y",
    )
    for fmt in date_formats:
        try:
            parsed = datetime.strptime(candidate, fmt)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d")
    return None
def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Extract first, middle, and last names.

    The name is split on whitespace. The first token is the first name, the
    last token (when there are at least two) is the last name, and anything
    in between is joined with single spaces as the middle name.

    Args:
        full_name: The full name to split.

    Returns:
        A dict with the keys ``"first_name"``, ``"middle_name"`` and
        ``"last_name"``. Parts that are not present are ``None``; an empty
        or whitespace-only name yields ``None`` for all three.
    """
    parts = (full_name or "").split()

    # An empty name has no tokens at all; indexing parts[0] would raise
    # IndexError, so report every part as missing instead.
    if not parts:
        return {
            "first_name": None,
            "middle_name": None,
            "last_name": None,
        }

    return {
        "first_name": parts[0],
        "middle_name": " ".join(parts[1:-1]) if len(parts) > 2 else None,
        "last_name": parts[-1] if len(parts) > 1 else None,
    }
def generate_username_variants(name: str) -> List[str]:
    """Generate possible username variants from a name.

    The name is lower-cased and split on whitespace. With two or more
    tokens, variants combine the first and last token; with a single token,
    variants decorate that token.

    Args:
        name: The person's name.

    Returns:
        The unique variants in a stable, deterministic order (the order in
        which they are generated). An empty or whitespace-only name yields
        an empty list.
    """
    parts = name.lower().split()
    if not parts:
        return []

    if len(parts) == 1:
        handle = parts[0]
        candidates = [
            handle,
            handle + "123",
            "the" + handle,
            "real" + handle,
        ]
    else:
        first, last = parts[0], parts[-1]
        candidates = [
            first + last,
            first + "_" + last,
            first + "." + last,
            first[0] + last,
            first + last[0],
            last + first,
            last + "_" + first,
            last + "." + first,
        ]

    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through set() would make the result order vary between runs because
    # string hashing is randomized per process.
    return list(dict.fromkeys(candidates))
def calculate_text_similarity(text1: str, text2: str) -> float:
    """Score how similar two texts are using word-level Jaccard similarity.

    Both texts are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    overall.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        A value between 0.0 and 1.0; 0.0 when neither text contains a word.
    """
    words_a = set(text1.lower().split())
    words_b = set(text2.lower().split())

    all_words = words_a | words_b
    # No words on either side: avoid dividing by zero.
    if not all_words:
        return 0.0

    shared_words = words_a & words_b
    return len(shared_words) / len(all_words)
def extract_social_links(text: str) -> List[Dict[str, str]]:
    """Find social media profile links in a piece of text.

    Args:
        text: The text to scan.

    Returns:
        One dict per match with the keys ``"platform"``, ``"username"``
        (the first path segment captured by the pattern) and ``"url"`` (the
        full matched text). Matches are grouped by platform in the order the
        patterns are declared, then by position in the text.
    """
    # Each pattern captures the profile handle as group 1.
    profile_patterns = {
        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)",
    }
    return [
        {
            "platform": site,
            "username": hit.group(1),
            "url": hit.group(0),
        }
        for site, regex in profile_patterns.items()
        for hit in re.finditer(regex, text)
    ]
|