# ISE/utils/web.py
"""
Web scraping and processing utilities.
"""
from typing import Dict, Any, List, Optional
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin
from tenacity import retry, stop_after_attempt, wait_exponential


class WebUtils:
    """Utilities for fetching web pages and extracting text, metadata, and links."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry_error_callback=lambda retry_state: None,  # return None once all retries fail
    )
    async def fetch_url(self, url: str, timeout: int = 10) -> Optional[str]:
        """Fetch content from a URL, retrying on failure and returning None if all attempts fail."""
        try:
            # Note: requests is blocking, so this coroutine does not yield to the event loop.
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            raise  # re-raise so the retry decorator can schedule another attempt

    def extract_text(self, html: str) -> str:
        """Extract clean text from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        # Get text and clean it
        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank lines into a single blank line
        text = re.sub(r"\n\s*\n", "\n\n", text)
        return text.strip()

    def extract_metadata(self, html: str, url: str) -> Dict[str, Any]:
        """Extract metadata from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        metadata = {
            "url": url,
            "title": None,
            "description": None,
            "keywords": None,
            "author": None,
            "published_date": None
        }
        # Extract title
        metadata["title"] = soup.title.string if soup.title else None
        # Extract meta tags
        for tag in soup.find_all("meta"):
            name = tag.get("name", "").lower()
            if name == "description":
                metadata["description"] = tag.get("content")
            elif name == "keywords":
                metadata["keywords"] = tag.get("content")
            elif name == "author":
                metadata["author"] = tag.get("content")
            elif name in ("published_time", "publication_date"):
                metadata["published_date"] = tag.get("content")
        return metadata

    def extract_links(self, html: str, base_url: str) -> List[str]:
        """Extract all links from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        links = []
        for link in soup.find_all("a"):
            href = link.get("href")
            if href:
                # Convert relative URLs to absolute
                absolute_url = urljoin(base_url, href)
                # Only include http(s) URLs
                if absolute_url.startswith(("http://", "https://")):
                    links.append(absolute_url)
        return list(set(links))  # Remove duplicates

    def is_valid_url(self, url: str) -> bool:
        """Check if a URL is valid."""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def clean_url(self, url: str) -> str:
        """Clean and normalize a URL by stripping common tracking parameters."""
        parsed = urlparse(url)
        # Keep only query parameters that do not look like tracking parameters
        query_params = []
        if parsed.query:
            for param in parsed.query.split("&"):
                if "=" in param:
                    key = param.split("=")[0].lower()
                    if not any(track in key for track in ["utm_", "ref_", "source", "campaign"]):
                        query_params.append(param)
        # Rebuild the URL without the removed parameters
        cleaned = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        if query_params:
            cleaned += "?" + "&".join(query_params)
        return cleaned
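

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how WebUtils might be wired together: clean and validate a
# URL, fetch it, then extract metadata, text, and links. The URL below is a
# placeholder assumption. fetch_url is a coroutine, so asyncio.run drives it here.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        utils = WebUtils()
        url = utils.clean_url("https://example.com/article?utm_source=feed&id=42")
        if not utils.is_valid_url(url):
            print(f"Invalid URL: {url}")
            return
        html = await utils.fetch_url(url)
        if html is None:
            print(f"Could not fetch {url}")
            return
        print(utils.extract_metadata(html, url))
        print(utils.extract_text(html)[:500])
        print(utils.extract_links(html, url)[:10])

    asyncio.run(_demo())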