from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
import uvicorn
from scrapling import AsyncFetcher, StealthyFetcher
from markitdown._markitdown import HtmlConverter
import asyncio
import re

app = FastAPI()
async_fetcher = AsyncFetcher(auto_match=True)  # AsyncFetcher for plain async HTTP fetches
stealthy_fetcher = StealthyFetcher()  # StealthyFetcher handles its own async/browser internally
md = HtmlConverter()


async def stealthy_scraper(url):
    """Fetches HTML content using StealthyFetcher's async API."""
    html = await stealthy_fetcher.async_fetch(url)
    return html.html_content


async def scraper(url):
    """Fetches HTML content using AsyncFetcher."""
    html = await async_fetcher.get(url)
    return html.html_content


async def text_scraper(url):
    """Fetches HTML content using AsyncFetcher and then extracts whitespace-normalized text."""
    html = await async_fetcher.get(url)
    return re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', html.get_all_text())).strip()


async def convert_html_to_md(html):
    """Converts HTML to Markdown using MarkItDown (its conversion is synchronous, so run it in a thread)."""
    md_text = await asyncio.to_thread(md._convert, html)
    return md_text.text_content


@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Handles GET requests to /read/{url}, returning Markdown content."""
    try:
        # Take everything after /read/ from the raw request URL so query strings are preserved.
        full_url = str(request.url).split("/read/")[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")


@app.get("/read_text/{url:path}", response_class=PlainTextResponse)
async def get_text_get(request: Request, url: str):
    """Handles GET requests to /read_text/{url}, returning plain-text content."""
    try:
        full_url = str(request.url).split("/read_text/")[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        text_output = await text_scraper(full_url)
        return PlainTextResponse(text_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")


@app.get("/reader/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get_stealthy(request: Request, url: str):
    """Handles GET requests to /reader/{url}, using StealthyFetcher."""
    try:
        full_url = str(request.url).split("/reader/")[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")


if __name__ == "__main__":
    async def run_app():
        # Fetching the Camoufox browser is only needed for StealthyFetcher to work.
        try:
            process = await asyncio.create_subprocess_exec('camoufox', 'fetch')
            await process.wait()  # Wait for camoufox to finish downloading/setting up its browser
            print("Camoufox initialized successfully!")
        except Exception as e:
            print(f"An unexpected error occurred starting camoufox: {e}")
        config = uvicorn.Config(app, host="0.0.0.0", port=7860)
        server = uvicorn.Server(config)
        await server.serve()

    asyncio.run(run_app())
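
# Example usage (a sketch, assuming the server is running locally with the
# host/port configured above; adjust the hostname/port if you change them):
#
#   curl http://localhost:7860/read/example.com        # page as Markdown (AsyncFetcher)
#   curl http://localhost:7860/read_text/example.com   # page as whitespace-normalized text
#   curl http://localhost:7860/reader/example.com      # page as Markdown (StealthyFetcher/Camoufox)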