bcci committed on
Commit
e550b21
·
verified ·
1 Parent(s): 90e2778

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -38
app.py CHANGED
@@ -2,82 +2,93 @@ from fastapi import FastAPI, Request, HTTPException
2
  from fastapi.responses import PlainTextResponse
3
  from urllib.parse import unquote
4
  import uvicorn
5
- from scrapling import AsyncFetcher, StealthyFetcher
6
  from markitdown import MarkItDown
7
  import tempfile
8
  import os
 
9
 
10
  app = FastAPI()
11
 
12
- fetcher = AsyncFetcher(auto_match=True)
13
- stealthy_fetcher = StealthyFetcher()
14
  md = MarkItDown()
15
 
 
16
  async def stealthy_scraper(url):
17
- html = await stealthy_fetcher.async_fetch(url)
 
 
18
  return html.html_content
19
 
 
20
  async def scraper(url):
21
- html = await fetcher.get(url)
 
22
  return html.html_content
23
 
24
- def convert_html_to_md(html):
25
- with tempfile.NamedTemporaryFile(suffix=".html" ,delete=False) as temp_file:
26
- temp_file.write(html.encode('utf-8'))
27
- temp_file.flush()
 
 
 
28
  temp_file_path = temp_file.name
29
- print(temp_file_path)
30
- x = md.convert(temp_file_path).text_content
31
- os.remove(temp_file_path)
 
 
32
  return x
33
 
34
- # GET endpoint to /read/{url:path} expecting URL in path
35
  @app.get("/read/{url:path}", response_class=PlainTextResponse)
36
  async def get_markdown_get(request: Request, url: str):
 
37
  try:
38
- # Retrieve the full path from the request
39
  full_url = str(request.url)
40
-
41
- # Extract the part of the URL after `/read/`
42
  full_url = full_url.split("/read/")[1]
43
-
44
- # Additional optional URL validation if needed
45
  if not full_url.startswith(('http://', 'https://')):
46
  full_url = f"http://{full_url}"
47
-
48
- markdown_output = convert_html_to_md(scraper(full_url))
49
  return PlainTextResponse(markdown_output)
50
  except Exception as e:
51
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
52
 
53
- # GET endpoint to /read/{url:path} expecting URL in path
54
  @app.get("/reader/{url:path}", response_class=PlainTextResponse)
55
- async def get_markdown_get(request: Request, url: str):
 
56
  try:
57
- # Retrieve the full path from the request
58
  full_url = str(request.url)
59
-
60
- # Extract the part of the URL after `/read/`
61
  full_url = full_url.split("/reader/")[1]
62
-
63
- # Additional optional URL validation if needed
64
  if not full_url.startswith(('http://', 'https://')):
65
  full_url = f"http://{full_url}"
66
-
67
- markdown_output = convert_html_to_md(stealthy_scraper(full_url))
68
  return PlainTextResponse(markdown_output)
69
  except Exception as e:
70
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
71
 
72
 
73
  if __name__ == "__main__":
74
- import subprocess
75
 
76
- try:
77
- subprocess.run(['camoufox', 'fetch'], check=True)
78
- print("Command executed successfully!")
79
- except Exception as e:
80
- print(f"An unexpected error occurred: {e}")
81
-
82
- import uvicorn
83
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
2
  from fastapi.responses import PlainTextResponse
3
  from urllib.parse import unquote
4
  import uvicorn
5
+ from scrapling import AsyncFetcher, StealthyFetcher # Import AsyncFetcher
6
  from markitdown import MarkItDown
7
  import tempfile
8
  import os
9
+ import asyncio
10
 
11
# Module-level singletons shared by every request handler.
app = FastAPI()

async_fetcher = AsyncFetcher(auto_match=True)  # plain async HTTP fetcher
stealthy_fetcher = StealthyFetcher()  # browser-backed fetcher; manages its own async internals
md = MarkItDown()  # HTML -> Markdown converter
16
 
17
+
18
async def stealthy_scraper(url):
    """Fetch *url* with the browser-backed ``stealthy_fetcher`` and return its HTML.

    Used by the /reader endpoint for pages that block plain HTTP clients.
    """
    page = await stealthy_fetcher.async_fetch(url)
    return page.html_content
23
 
24
+
25
async def scraper(url):
    """Fetch *url* with the plain ``async_fetcher`` and return its HTML.

    Used by the /read endpoint; no browser emulation involved.
    """
    response = await async_fetcher.get(url)
    return response.html_content
29
 
30
+
31
async def convert_html_to_md(html):
    """Convert an HTML string to Markdown text via the module-level MarkItDown.

    MarkItDown converts from a file path, so the HTML is written to a
    temporary file first. Blocking work (file I/O, the conversion itself)
    is pushed to a thread with ``asyncio.to_thread`` so the event loop
    stays responsive.

    Args:
        html: The HTML document as a string.

    Returns:
        The Markdown rendering (``text_content`` of the MarkItDown result).
    """
    # delete=False so the file survives the ``with`` block; md.convert needs
    # a closed, fully-flushed file on disk (closing the file flushes it).
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        temp_file_path = temp_file.name
        await asyncio.to_thread(temp_file.write, html.encode('utf-8'))
    try:
        md_text = await asyncio.to_thread(md.convert, temp_file_path)
        return md_text.text_content
    finally:
        # Always remove the temp file, even when md.convert raises —
        # otherwise every failed conversion leaks a file in the temp dir.
        await asyncio.to_thread(os.remove, temp_file_path)
44
 
45
+
46
  @app.get("/read/{url:path}", response_class=PlainTextResponse)
47
  async def get_markdown_get(request: Request, url: str):
48
+ """Handles GET requests to /read/{url}, returning Markdown content."""
49
  try:
 
50
  full_url = str(request.url)
 
 
51
  full_url = full_url.split("/read/")[1]
52
+
 
53
  if not full_url.startswith(('http://', 'https://')):
54
  full_url = f"http://{full_url}"
55
+
56
+ markdown_output = await convert_html_to_md(await scraper(full_url))
57
  return PlainTextResponse(markdown_output)
58
  except Exception as e:
59
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
60
 
61
+
62
  @app.get("/reader/{url:path}", response_class=PlainTextResponse)
63
+ async def get_markdown_get_stealthy(request: Request, url: str): # Renamed for clarity
64
+ """Handles GET requests to /reader/{url}, using StealthyFetcher."""
65
  try:
 
66
  full_url = str(request.url)
 
 
67
  full_url = full_url.split("/reader/")[1]
68
+
 
69
  if not full_url.startswith(('http://', 'https://')):
70
  full_url = f"http://{full_url}"
71
+
72
+ markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
73
  return PlainTextResponse(markdown_output)
74
  except Exception as e:
75
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
76
 
77
 
78
  if __name__ == "__main__":
79
+ async def run_app():
80
 
81
+ #This part is only needed for StealthyFetcher to work.
82
+ try:
83
+ process = await asyncio.create_subprocess_exec('camoufox', 'fetch')
84
+ await process.wait() #Wait for camoufox to initialize
85
+ print("Camoufox initialized successfully!")
86
+ except Exception as e:
87
+ print(f"An unexpected error occurred starting camoufox: {e}")
88
+
89
+
90
+ config = uvicorn.Config(app, host="0.0.0.0", port=7860)
91
+ server = uvicorn.Server(config)
92
+ await server.serve()
93
+
94
+ asyncio.run(run_app())