import logging
from datetime import datetime
from typing import List, Optional, Dict, Any

from fastapi import APIRouter, Depends, HTTPException, status, Body, BackgroundTasks
from sqlalchemy.ext.asyncio import AsyncSession

from src.api.database import get_db
from src.api.auth import get_current_user
from src.api.schemas import User, CrawlRequest, CrawlResult
from src.services.scraper import WebScraper, ScraperError
from src.services.tor_proxy import TorProxyService, TorProxyError

# Configure logger
logger = logging.getLogger(__name__)

router = APIRouter(
    prefix="/scraping",
    tags=["scraping"],
    responses={404: {"description": "Not found"}},
)

# Initialize services
scraper = WebScraper()
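
# The route decorator is missing from the extracted source; the HTTP method and
# path below are assumed for illustration only.
@router.get("/tor/status")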
async def test_tor_connection(
    current_user: User = Depends(get_current_user)
):
    """
    Test Tor connection.

    Args:
        current_user: Current authenticated user

    Returns:
        Dict[str, Any]: Connection status
    """
    try:
        tor_proxy = TorProxyService()
        is_connected = await tor_proxy.check_connection()
        return {
            "status": "success",
            "is_connected": is_connected,
            "timestamp": datetime.utcnow().isoformat()
        }
    except TorProxyError as e:
        logger.error(f"Tor proxy error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Tor proxy error: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Error testing Tor connection: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"An error occurred: {str(e)}"
        )
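
# The route decorator is missing from the extracted source; the method and path
# below are assumed for illustration only.
@router.post("/scrape")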
async def scrape_page(
    url: str,
    use_tor: bool = Body(False),
    current_user: User = Depends(get_current_user)
):
    """
    Scrape a single page.

    Args:
        url: URL to scrape
        use_tor: Whether to use Tor proxy
        current_user: Current authenticated user

    Returns:
        Dict[str, Any]: Scraped content
    """
    try:
        result = await scraper.extract_content(url, use_tor=use_tor)
        return {
            "status": "success",
            "data": result,
            "timestamp": datetime.utcnow().isoformat()
        }
    except ScraperError as e:
        logger.error(f"Scraper error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Scraper error: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Error scraping page: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"An error occurred: {str(e)}"
        )
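
# The route decorator is missing from the extracted source; the method and path
# below are assumed for illustration only.
@router.post("/crawl")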
async def crawl_site(
    crawl_request: CrawlRequest,
    background_tasks: BackgroundTasks,
    current_user: User = Depends(get_current_user)
):
    """
    Crawl a site.

    Args:
        crawl_request: Crawl request data
        background_tasks: Background tasks
        current_user: Current authenticated user

    Returns:
        Dict[str, Any]: Crawl status
    """
    # For longer crawls, we add them as background tasks.
    # This prevents timeouts on the API request.
    if crawl_request.max_depth > 1 or '.onion' in crawl_request.url:
        # Start crawl in background
        background_tasks.add_task(
            scraper.crawl,
            crawl_request.url,
            max_depth=crawl_request.max_depth,
            max_pages=50,
            keyword_filter=crawl_request.keywords
        )
        return {
            "status": "started",
            "message": "Crawl started in background",
            "timestamp": datetime.utcnow().isoformat()
        }
    else:
        # For simple crawls, we perform them synchronously
        try:
            results = await scraper.crawl(
                crawl_request.url,
                max_depth=crawl_request.max_depth,
                max_pages=10,
                keyword_filter=crawl_request.keywords
            )
            return {
                "status": "completed",
                "results": results,
                "count": len(results),
                "timestamp": datetime.utcnow().isoformat()
            }
        except ScraperError as e:
            logger.error(f"Scraper error: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Scraper error: {str(e)}"
            )
        except Exception as e:
            logger.error(f"Error crawling site: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"An error occurred: {str(e)}"
            )
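
# Illustrative wiring (an assumption, not part of this module): the router above is
# expected to be mounted on the FastAPI application elsewhere, for example:
#
#     from fastapi import FastAPI
#     from src.api.routes.scraping import router as scraping_router  # hypothetical import path
#
#     app = FastAPI()
#     app.include_router(scraping_router)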