# reader-api / app.py
# bcci's picture
# Update app.py
# dd86a22 verified
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import AsyncFetcher, StealthyFetcher
from markitdown._markitdown import HtmlConverter
import tempfile
import os
import asyncio
import re
# Module-level singletons shared by the helpers and route handlers below.

# ASGI application that the @app.get(...) routes register on.
app = FastAPI()
# Fetcher awaited by scraper() and text_scraper().
# NOTE(review): the effect of auto_match=True is defined by scrapling — confirm it is wanted here.
async_fetcher = AsyncFetcher(auto_match=True)
# Fetcher awaited by stealthy_scraper() through its async_fetch() method.
stealthy_fetcher = StealthyFetcher()
# HTML converter whose _convert() method is called by convert_html_to_md().
md = HtmlConverter()
async def stealthy_scraper(url):
    """Return the HTML of *url* as fetched by the module-level StealthyFetcher.

    Awaits ``stealthy_fetcher.async_fetch(url)`` and returns the
    ``html_content`` attribute of the page object it yields.
    """
    page = await stealthy_fetcher.async_fetch(url)
    return page.html_content
async def scraper(url):
    """Return the HTML of *url* as fetched by the module-level AsyncFetcher.

    Awaits ``async_fetcher.get(url)`` and returns the ``html_content``
    attribute of the response object.
    """
    return (await async_fetcher.get(url)).html_content
async def text_scraper(url):
    """Fetch *url* with AsyncFetcher and return its text on a single line.

    The text comes from ``get_all_text()`` on the fetched page. Every run of
    whitespace (spaces, tabs, newlines, ...) is collapsed to one space and
    the result is stripped of leading/trailing whitespace.
    """
    page = await async_fetcher.get(url)
    # ``\s`` already matches newlines, so a single substitution gives the same
    # result as first replacing ``\n+`` and then ``\s+``.
    return re.sub(r'\s+', ' ', page.get_all_text()).strip()
async def convert_html_to_md(html):
    """Convert *html* to Markdown text without blocking the event loop.

    ``md._convert`` is called as an ordinary (non-awaited) function inside a
    worker thread via ``asyncio.to_thread``; the ``text_content`` attribute
    of its result is returned.
    """
    conversion = await asyncio.to_thread(md._convert, html)
    markdown_text = conversion.text_content
    return markdown_text
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Handle GET /read/{url}: fetch the target page and return it as Markdown.

    The target is taken from the raw request URL, not from the ``url`` path
    parameter (presumably so the target's own query string is preserved —
    verify against the routing framework). A target that does not start with
    ``http://`` or ``https://`` is fetched over ``http://``.

    Raises:
        HTTPException: status 500 wrapping any error raised while extracting
            the target, fetching it, or converting it.
    """
    # NOTE(review): this fetches arbitrary caller-supplied URLs (SSRF surface);
    # confirm the deployment restricts who can reach this endpoint.
    try:
        # Split only at the FIRST "/read/": the target URL may itself contain
        # that segment (e.g. https://example.com/read/post) and must stay whole.
        full_url = str(request.url).split("/read/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Top-level request boundary: report every failure as a 500, keeping
        # the original exception as the cause for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
@app.get("/read_text/{url:path}", response_class=PlainTextResponse)
async def get_text_get(request: Request, url: str):
    """Handle GET /read_text/{url}: fetch the target page and return its plain text.

    The target is taken from the raw request URL, not from the ``url`` path
    parameter (presumably so the target's own query string is preserved —
    verify against the routing framework). A target that does not start with
    ``http://`` or ``https://`` is fetched over ``http://``. The body of the
    response is whatever ``text_scraper`` returns for that target.

    Raises:
        HTTPException: status 500 wrapping any error raised while extracting
            the target or fetching its text.
    """
    # NOTE(review): this fetches arbitrary caller-supplied URLs (SSRF surface);
    # confirm the deployment restricts who can reach this endpoint.
    try:
        # Split only at the FIRST "/read_text/": the target URL may itself
        # contain that segment and must stay whole.
        full_url = str(request.url).split("/read_text/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        text_output = await text_scraper(full_url)
        return PlainTextResponse(text_output)
    except Exception as e:
        # Top-level request boundary: report every failure as a 500, keeping
        # the original exception as the cause for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get_stealthy(request: Request, url: str):
    """Handle GET /reader/{url}: fetch the target via StealthyFetcher, return Markdown.

    The target is taken from the raw request URL, not from the ``url`` path
    parameter (presumably so the target's own query string is preserved —
    verify against the routing framework). A target that does not start with
    ``http://`` or ``https://`` is fetched over ``http://``.

    Raises:
        HTTPException: status 500 wrapping any error raised while extracting
            the target, fetching it, or converting it.
    """
    # NOTE(review): this fetches arbitrary caller-supplied URLs (SSRF surface);
    # confirm the deployment restricts who can reach this endpoint.
    try:
        # Split only at the FIRST "/reader/": the target URL may itself
        # contain that segment and must stay whole.
        full_url = str(request.url).split("/reader/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Top-level request boundary: report every failure as a 500, keeping
        # the original exception as the cause for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
if __name__ == "__main__":
    async def run_app():
        """Run ``camoufox fetch`` once, then serve the app with uvicorn.

        The camoufox step is best-effort: a failure to start the command or a
        non-zero exit status is reported on stdout and the server is started
        anyway. The server listens on 0.0.0.0:7860.
        """
        # This part is only needed for StealthyFetcher to work.
        try:
            process = await asyncio.create_subprocess_exec('camoufox', 'fetch')
            # wait() returns the exit status; only claim success when it is 0.
            return_code = await process.wait()
            if return_code == 0:
                print("Camoufox initialized successfully!")
            else:
                print(f"camoufox fetch exited with status {return_code}")
        except Exception as e:
            # e.g. the camoufox executable is missing; keep going so the
            # routes that do not need it still work.
            print(f"An unexpected error occurred starting camoufox: {e}")
        config = uvicorn.Config(app, host="0.0.0.0", port=7860)
        server = uvicorn.Server(config)
        await server.serve()

    asyncio.run(run_app())