Spaces:
Sleeping
Sleeping
File size: 3,931 Bytes
be31cd7 169e42a 7b5f305 dd86a22 169e42a e550b21 2150fcd 169e42a e550b21 dd86a22 169e42a e550b21 90e2778 e550b21 9af1ada 0b3c590 e550b21 90e2778 e550b21 9af1ada 5dc46f9 169e42a e550b21 dd86a22 169e42a e550b21 169e42a dd5e687 e550b21 169e42a 51bbabe e550b21 51bbabe e550b21 48f0eb9 2150fcd 9af1ada 2150fcd 863101c 9af1ada 2150fcd e4bb89b 2150fcd 06eae38 169e42a e550b21 0b3c590 e550b21 0b3c590 e550b21 0b3c590 e550b21 48f0eb9 0b3c590 169e42a e550b21 fe8c37e e550b21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import AsyncFetcher, StealthyFetcher
from markitdown._markitdown import HtmlConverter
import tempfile
import os
import asyncio
import re
# FastAPI application object; the route decorators further down register their handlers on it.
app = FastAPI()
# Shared fetcher instance awaited by scraper() and text_scraper().
async_fetcher = AsyncFetcher(auto_match=True)
# Shared fetcher instance awaited by stealthy_scraper().
# NOTE(review): the original comment says StealthyFetcher handles its own async internally — confirm against the scrapling docs.
stealthy_fetcher = StealthyFetcher()
# HTML converter imported from markitdown's private `_markitdown` module; used by convert_html_to_md().
md = HtmlConverter()
async def stealthy_scraper(url):
    """Return the HTML of ``url`` as fetched through the shared StealthyFetcher.

    Awaits ``stealthy_fetcher.async_fetch(url)`` and hands back the
    ``html_content`` attribute of the response it produces.
    """
    response = await stealthy_fetcher.async_fetch(url)
    page_source = response.html_content
    return page_source
async def scraper(url):
    """Return the HTML of ``url`` as fetched through the shared AsyncFetcher.

    Awaits ``async_fetcher.get(url)`` and hands back the ``html_content``
    attribute of the response it produces.
    """
    response = await async_fetcher.get(url)
    page_source = response.html_content
    return page_source
async def text_scraper(url):
    """Fetch ``url`` with the shared AsyncFetcher and return its text on one line.

    The text comes from the response's ``get_all_text()``; every run of
    whitespace (spaces, tabs, newlines, ...) is collapsed to a single space and
    leading/trailing whitespace is removed.
    """
    html = await async_fetcher.get(url)
    # `\s` already matches newlines, so one pass is enough; a separate
    # `\n+` substitution beforehand would not change the result.
    return re.sub(r'\s+', ' ', html.get_all_text()).strip()
async def convert_html_to_md(html):
    """Convert ``html`` with the shared HtmlConverter and return its text content.

    The converter's ``_convert`` method is run via ``asyncio.to_thread`` so the
    call happens in a worker thread instead of on the event loop (the original
    author assumed ``_convert`` is synchronous).
    """
    conversion = await asyncio.to_thread(md._convert, html)
    markdown_text = conversion.text_content
    return markdown_text
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Handle GET /read/{url}: fetch the target page and return it as Markdown.

    The ``url`` path parameter is not used directly; the target is recovered
    from ``str(request.url)`` (presumably so that a query string on the request
    stays attached to the target — confirm). ``http://`` is prepended when the
    target has no ``http://`` or ``https://`` scheme.

    Returns:
        PlainTextResponse containing the converted Markdown.

    Raises:
        HTTPException: status 500 with the underlying error message if
            extracting, fetching or converting fails.
    """
    # NOTE(review): the target URL is caller-controlled and fetched server-side
    # without restriction; consider limiting allowed hosts if publicly exposed.
    try:
        # Split on the first "/read/" only, so a target whose own path
        # contains "/read/" is not truncated.
        full_url = str(request.url).split("/read/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
@app.get("/read_text/{url:path}", response_class=PlainTextResponse)
async def get_text_get(request: Request, url: str):
    """Handle GET /read_text/{url}: fetch the target page and return its plain text.

    The ``url`` path parameter is not used directly; the target is recovered
    from ``str(request.url)`` (presumably so that a query string on the request
    stays attached to the target — confirm). ``http://`` is prepended when the
    target has no ``http://`` or ``https://`` scheme.

    Returns:
        PlainTextResponse containing the whitespace-collapsed page text
        produced by ``text_scraper``.

    Raises:
        HTTPException: status 500 with the underlying error message if
            extracting or fetching fails.
    """
    # NOTE(review): the target URL is caller-controlled and fetched server-side
    # without restriction; consider limiting allowed hosts if publicly exposed.
    try:
        # Split on the first "/read_text/" only, so a target whose own path
        # contains "/read_text/" is not truncated.
        full_url = str(request.url).split("/read_text/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        text_output = await text_scraper(full_url)
        return PlainTextResponse(text_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get_stealthy(request: Request, url: str):
    """Handle GET /reader/{url}: fetch the target with StealthyFetcher and return Markdown.

    The ``url`` path parameter is not used directly; the target is recovered
    from ``str(request.url)`` (presumably so that a query string on the request
    stays attached to the target — confirm). ``http://`` is prepended when the
    target has no ``http://`` or ``https://`` scheme.

    Returns:
        PlainTextResponse containing the converted Markdown.

    Raises:
        HTTPException: status 500 with the underlying error message if
            extracting, fetching or converting fails.
    """
    # NOTE(review): the target URL is caller-controlled and fetched server-side
    # without restriction; consider limiting allowed hosts if publicly exposed.
    try:
        # Split on the first "/reader/" only, so a target whose own path
        # contains "/reader/" is not truncated.
        full_url = str(request.url).split("/reader/", 1)[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
if __name__ == "__main__":
    async def run_app():
        """Run ``camoufox fetch`` once, then serve ``app`` with uvicorn on 0.0.0.0:7860.

        The camoufox step is best-effort: a failure is reported on stdout and
        the server is started regardless.
        """
        # This part is only needed for StealthyFetcher to work.
        try:
            process = await asyncio.create_subprocess_exec('camoufox', 'fetch')
            # wait() returns the exit status; only report success when it is 0.
            return_code = await process.wait()
            if return_code == 0:
                print("Camoufox initialized successfully!")
            else:
                print(f"camoufox fetch exited with status {return_code}")
        except Exception as e:
            # Deliberately broad: a missing or broken camoufox must not keep
            # the HTTP server from starting.
            print(f"An unexpected error occurred starting camoufox: {e}")
        config = uvicorn.Config(app, host="0.0.0.0", port=7860)
        server = uvicorn.Server(config)
        await server.serve()

    asyncio.run(run_app())