Spaces:

bcci
/

reader-api

Sleeping

bcci committed on Feb 11

Commit

0b3c590

verified ·

1 Parent(s): dd5e687

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ from fastapi import FastAPI, Request, HTTPException
 from fastapi.responses import PlainTextResponse
 from urllib.parse import unquote
 import uvicorn
-from scrapling import Fetcher
 from markitdown import MarkItDown
 import tempfile
 import os
@@ -10,8 +10,13 @@ import os
 app = FastAPI()
 fetcher = Fetcher(auto_match=True)
 md = MarkItDown()
 def scraper(url):
     html = fetcher.get(url)
     return html.prettify()
@@ -45,6 +50,28 @@ async def get_markdown_get(request: Request, url: str):
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

 from fastapi.responses import PlainTextResponse
 from urllib.parse import unquote
 import uvicorn
+from scrapling import Fetcher, StealthyFetcher
 from markitdown import MarkItDown
 import tempfile
 import os
 app = FastAPI()
 fetcher = Fetcher(auto_match=True)
+stealthy_fetcher = StealthyFetcher()
 md = MarkItDown()
+def stealthy_scraper(url):
+    html = stealthy_fetcher.fetch(url)
+    return html
 def scraper(url):
     html = fetcher.get(url)
     return html.prettify()
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
+# GET endpoint at /reader/{url:path}; the target URL is passed in the path
+@app.get("/reader/{url:path}", response_class=PlainTextResponse)
+async def get_markdown_get(request: Request, url: str):
+    try:
+        # Retrieve the full path from the request
+        full_url = str(request.url)
+        # Extract the part of the URL after `/reader/`
+        full_url = full_url.split("/reader/")[1]
+        # Default to http:// when the extracted URL has no scheme
+        if not full_url.startswith(('http://', 'https://')):
+            full_url = f"http://{full_url}"
+        markdown_output = convert_html_to_md(stealthy_scraper(full_url))
+        return PlainTextResponse(markdown_output)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
 if __name__ == "__main__":
+    import subprocess
+    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)