Spaces:

bcci
/

reader-api

Sleeping

App Files Files Community

reader-api / app.py

bcci

Update app.py

5dc46f9 verified 3 months ago

raw

history blame

4.4 kB

	from fastapi import FastAPI, Request, HTTPException
	from fastapi.responses import PlainTextResponse
	from urllib.parse import unquote
	import uvicorn
	from scrapling import AsyncFetcher, StealthyFetcher
	from markitdown import MarkItDown
	import tempfile
	import os
	import asyncio
	import re

	# FastAPI application object; the route decorators below register on it.
	app = FastAPI()

	# Shared fetcher whose .get() is awaited by scraper() and text_scraper().
	async_fetcher = AsyncFetcher(auto_match=True) # Use AsyncFetcher
	# Shared fetcher whose .async_fetch() is awaited by stealthy_scraper().
	stealthy_fetcher = StealthyFetcher() # Keep StealthyFetcher (it handles its own async internally)
	# Shared converter; convert_html_to_md() calls md.convert() on a temp file path.
	md = MarkItDown()


	async def stealthy_scraper(url):
	    """Return the HTML content of ``url`` fetched through the shared StealthyFetcher."""
	    # Await the fetcher's coroutine API and hand back only the page's HTML.
	    return (await stealthy_fetcher.async_fetch(url)).html_content


	async def scraper(url):
	    """Return the HTML content of ``url`` fetched through the shared AsyncFetcher."""
	    response = await async_fetcher.get(url)
	    page_html = response.html_content
	    return page_html


	async def text_scraper(url):
	    """Fetch ``url`` with the shared AsyncFetcher and return its text on one line.

	    The text comes from the fetched page's ``get_all_text()``. Every run of
	    whitespace (newlines included) is collapsed to a single space, and
	    leading/trailing whitespace is stripped.
	    """
	    page = await async_fetcher.get(url)
	    # r'\s+' already matches newlines, so a single substitution yields the
	    # same result as the former two-pass (r'\n+' then r'\s+') version.
	    return re.sub(r'\s+', ' ', page.get_all_text()).strip()


	async def convert_html_to_md(html):
	    """Convert an HTML string to Markdown text using the shared MarkItDown instance.

	    The HTML is written (UTF-8 encoded) to a temporary ``.html`` file, the
	    file's path is passed to ``md.convert``, and the ``text_content`` of the
	    result is returned. Blocking file and conversion calls run in worker
	    threads via ``asyncio.to_thread`` so the event loop is not blocked.

	    Args:
	        html: HTML document as a ``str``.

	    Returns:
	        The ``text_content`` attribute of the MarkItDown conversion result.

	    Raises:
	        Whatever the file operations or ``md.convert`` raise; the temporary
	        file is removed in either case.
	    """
	    temp_file_path = None
	    try:
	        # delete=False: the file must outlive the ``with`` block so it can be
	        # reopened by path; leaving the block closes (and flushes) the handle
	        # before the converter reads it.
	        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
	            temp_file_path = temp_file.name
	            await asyncio.to_thread(temp_file.write, html.encode('utf-8'))

	        conversion = await asyncio.to_thread(md.convert, temp_file_path)
	        return conversion.text_content
	    finally:
	        # Always clean up, including when writing or converting failed;
	        # previously a failure left the temporary file behind.
	        if temp_file_path is not None:
	            await asyncio.to_thread(os.remove, temp_file_path)


	@app.get("/read/{url:path}", response_class=PlainTextResponse)
	async def get_markdown_get(request: Request, url: str):
	    """Handle GET /read/{url}: fetch the target page and return it as Markdown.

	    The target is taken from the full request URL (everything after the first
	    "/read/") instead of the ``url`` path parameter -- presumably so that a
	    query string on the request stays part of the target; confirm before
	    relying on it. A target without an ``http://``/``https://`` scheme is
	    given ``http://``.

	    Args:
	        request: Incoming request; only ``request.url`` is read.
	        url: Path parameter captured by the route (unused in the body).

	    Returns:
	        PlainTextResponse containing the Markdown conversion of the page.

	    Raises:
	        HTTPException: status 500 wrapping any error raised while extracting
	            the target, fetching it, or converting it.
	    """
	    try:
	        # maxsplit=1 cuts only at the route prefix; without it a target that
	        # itself contains "/read/" was truncated at its own "/read/" segment.
	        full_url = str(request.url).split("/read/", 1)[1]

	        if not full_url.startswith(('http://', 'https://')):
	            full_url = f"http://{full_url}"

	        markdown_output = await convert_html_to_md(await scraper(full_url))
	        return PlainTextResponse(markdown_output)
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


	@app.get("/read_text/{url:path}", response_class=PlainTextResponse)
	async def get_text_get(request: Request, url: str):
	    """Handle GET /read_text/{url}: fetch the target page and return its plain text.

	    The target is taken from the full request URL (everything after the first
	    "/read_text/") instead of the ``url`` path parameter -- presumably so that
	    a query string on the request stays part of the target; confirm before
	    relying on it. A target without an ``http://``/``https://`` scheme is
	    given ``http://``.

	    Args:
	        request: Incoming request; only ``request.url`` is read.
	        url: Path parameter captured by the route (unused in the body).

	    Returns:
	        PlainTextResponse containing the text produced by ``text_scraper``.

	    Raises:
	        HTTPException: status 500 wrapping any error raised while extracting
	            the target or fetching it.
	    """
	    try:
	        # maxsplit=1 cuts only at the route prefix; without it a target that
	        # itself contains "/read_text/" was truncated at that segment.
	        full_url = str(request.url).split("/read_text/", 1)[1]

	        if not full_url.startswith(('http://', 'https://')):
	            full_url = f"http://{full_url}"

	        text_output = await text_scraper(full_url)
	        return PlainTextResponse(text_output)
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


	@app.get("/reader/{url:path}", response_class=PlainTextResponse)
	async def get_markdown_get_stealthy(request: Request, url: str):
	    """Handle GET /reader/{url}: fetch the target via StealthyFetcher, return Markdown.

	    The target is taken from the full request URL (everything after the first
	    "/reader/") instead of the ``url`` path parameter -- presumably so that a
	    query string on the request stays part of the target; confirm before
	    relying on it. A target without an ``http://``/``https://`` scheme is
	    given ``http://``.

	    Args:
	        request: Incoming request; only ``request.url`` is read.
	        url: Path parameter captured by the route (unused in the body).

	    Returns:
	        PlainTextResponse containing the Markdown conversion of the page.

	    Raises:
	        HTTPException: status 500 wrapping any error raised while extracting
	            the target, fetching it, or converting it.
	    """
	    try:
	        # maxsplit=1 cuts only at the route prefix; without it a target that
	        # itself contains "/reader/" was truncated at that segment.
	        full_url = str(request.url).split("/reader/", 1)[1]

	        if not full_url.startswith(('http://', 'https://')):
	            full_url = f"http://{full_url}"

	        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
	        return PlainTextResponse(markdown_output)
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


	if __name__ == "__main__":
	    async def run_app():
	        """Run ``camoufox fetch`` once, then serve ``app`` with uvicorn on 0.0.0.0:7860.

	        A failure of the camoufox step is reported on stdout but does not
	        prevent the server from starting.
	        """
	        # This part is only needed for StealthyFetcher to work.
	        try:
	            process = await asyncio.create_subprocess_exec('camoufox', 'fetch')
	            # wait() returns the exit status; check it so that a failed run
	            # is no longer reported as a success.
	            return_code = await process.wait()
	            if return_code == 0:
	                print("Camoufox initialized successfully!")
	            else:
	                print(f"camoufox fetch exited with code {return_code}; StealthyFetcher may not work.")
	        except Exception as e:
	            # Best effort: e.g. the executable is missing. Keep serving the
	            # routes that do not depend on StealthyFetcher.
	            print(f"An unexpected error occurred starting camoufox: {e}")

	        config = uvicorn.Config(app, host="0.0.0.0", port=7860)
	        server = uvicorn.Server(config)
	        await server.serve()

	    asyncio.run(run_app())