# NOTE: removed non-code scrape artifact ("Spaces: / Sleeping / Sleeping" —
# HuggingFace Spaces page header captured with the source).
from fastapi import FastAPI, Request, HTTPException | |
from fastapi.responses import PlainTextResponse | |
from urllib.parse import unquote | |
import uvicorn | |
from scrapling import AsyncFetcher, StealthyFetcher | |
from markitdown._markitdown import HtmlConverter | |
import tempfile | |
import os | |
import asyncio | |
import re | |
app = FastAPI()

# Plain async HTTP fetcher; auto_match keeps Scrapling's selector auto-matching on.
async_fetcher = AsyncFetcher(auto_match=True)

# Stealth fetcher drives a camoufox browser and manages its own async internals.
stealthy_fetcher = StealthyFetcher()

# Single shared HTML-to-Markdown converter instance.
md = HtmlConverter()
async def stealthy_scraper(url):
    """Fetch a page's raw HTML via the stealth (camoufox-backed) fetcher."""
    page = await stealthy_fetcher.async_fetch(url)
    return page.html_content
async def scraper(url):
    """Fetch a page's raw HTML via the plain AsyncFetcher."""
    page = await async_fetcher.get(url)
    return page.html_content
async def text_scraper(url):
    """Fetch a page with AsyncFetcher and return its visible text.

    All runs of whitespace (newlines included) are collapsed to single
    spaces and the result is stripped.
    """
    page = await async_fetcher.get(url)
    # \s already matches \n, so a single pass collapses all whitespace;
    # the original extra re.sub(r'\n+', ' ', ...) pass was redundant.
    return re.sub(r'\s+', ' ', page.get_all_text()).strip()
async def convert_html_to_md(html):
    """Convert an HTML string to Markdown without blocking the event loop.

    The synchronous MarkItDown conversion is pushed onto a worker thread
    with asyncio.to_thread. NOTE(review): ``md._convert`` is a private
    MarkItDown API — confirm it remains available across versions.
    """
    result = await asyncio.to_thread(md._convert, html)
    return result.text_content
@app.get("/read/{url:path}")
async def get_markdown_get(request: Request, url: str):
    """GET /read/{url}: fetch the target page and return it as Markdown.

    The target URL is recovered from the raw request URL (everything after
    the first "/read/") so query strings survive intact; a scheme of
    http:// is assumed when none is given.

    Raises HTTPException(500) on any fetch/convert failure.
    """
    try:
        # maxsplit=1 keeps targets that themselves contain "/read/" intact
        # (the original unbounded split truncated them). Note the file
        # previously defined this handler without registering a route.
        full_url = str(request.url).split("/read/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
@app.get("/read_text/{url:path}")
async def get_text_get(request: Request, url: str):
    """GET /read_text/{url}: fetch the target page and return its plain text.

    The target URL is recovered from the raw request URL (everything after
    the first "/read_text/"); a scheme of http:// is assumed when none is
    given. (The original docstring wrongly said /read/.)

    Raises HTTPException(500) on any fetch failure.
    """
    try:
        # maxsplit=1 keeps targets containing "/read_text/" intact.
        full_url = str(request.url).split("/read_text/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        text_output = await text_scraper(full_url)
        return PlainTextResponse(text_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
@app.get("/reader/{url:path}")
async def get_markdown_get_stealthy(request: Request, url: str):
    """GET /reader/{url}: like /read/ but uses the stealth (camoufox) fetcher.

    The target URL is recovered from the raw request URL (everything after
    the first "/reader/"); a scheme of http:// is assumed when none is given.

    Raises HTTPException(500) on any fetch/convert failure.
    """
    try:
        # maxsplit=1 keeps targets that themselves contain "/reader/" intact.
        full_url = str(request.url).split("/reader/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
if __name__ == "__main__":
    async def run_app():
        # StealthyFetcher needs the camoufox browser; running
        # `camoufox fetch` first ensures it is downloaded/installed.
        try:
            proc = await asyncio.create_subprocess_exec('camoufox', 'fetch')
            await proc.wait()
            print("Camoufox initialized successfully!")
        except Exception as e:
            print(f"An unexpected error occurred starting camoufox: {e}")

        # Serve the app from inside the running event loop (uvicorn.run()
        # would try to start its own loop).
        server = uvicorn.Server(uvicorn.Config(app, host="0.0.0.0", port=7860))
        await server.serve()

    asyncio.run(run_app())