Spaces:

bcci
/

reader-api

Sleeping

App Files Files Community

bcci committed on Feb 11

Commit

169e42a

verified ·

1 Parent(s): 00d44b1

Create app.py

Browse files

Files changed (1) hide show

app.py +56 -0

app.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+import tempfile
+from urllib.parse import unquote
+
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+# PlainTextResponse is not exported by the top-level ``fastapi`` package;
+# importing it from there raises ImportError, so take it from fastapi.responses.
+from fastapi.responses import PlainTextResponse
+from markitdown import MarkItDown
+from scrapling import Fetcher
+
+# ASGI application object; the route decorators below register on it.
+app = FastAPI()
+# Shared Scrapling fetcher used by scraper().
+fetcher = Fetcher(auto_match=True)
+# Shared MarkItDown converter used by convert_html_to_md().
+md = MarkItDown()
+def scraper(url):
+    """Fetch *url* with the module-level ``fetcher`` and return its prettified markup.
+
+    Args:
+        url: Address of the page to download.
+
+    Returns:
+        Whatever ``prettify()`` yields for the fetched page (presumably an
+        indented HTML string -- confirm against the Scrapling API).
+    """
+    return fetcher.get(url).prettify()
+def convert_html_to_md(html):
+    """Convert an HTML string to Markdown using the module-level ``md`` converter.
+
+    The HTML is written to a temporary ``.html`` file and the converter is
+    given that file's path (the suffix presumably lets MarkItDown recognise
+    the content as HTML -- TODO confirm).
+
+    Args:
+        html: HTML document as a ``str``; it is encoded as UTF-8 on disk.
+
+    Returns:
+        The ``text_content`` attribute of the conversion result.
+
+    Raises:
+        Any exception raised while writing the file or by ``md.convert``.
+        The temporary file is removed in every case.
+    """
+    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
+    temp_file_path = temp_file.name
+    try:
+        # Close the file (which also flushes it) before conversion so the
+        # converter can reopen it by path regardless of platform.
+        with temp_file:
+            temp_file.write(html.encode('utf-8'))
+        print(temp_file_path)
+        return md.convert(temp_file_path).text_content
+    finally:
+        # delete=False means nothing else cleans this up; doing it here keeps
+        # failed conversions from leaking files into the temp directory.
+        os.remove(temp_file_path)
+# POST /reader: the target URL is supplied in a JSON body of the form {"url": "..."}.
+@app.post("/reader", response_class=PlainTextResponse)
+async def get_markdown_post(request: Request):
+    """Fetch the URL named in the JSON request body and return the page as Markdown.
+
+    Args:
+        request: Incoming request; its body must be a JSON object with a
+            non-empty string under the ``"url"`` key.
+
+    Returns:
+        A ``PlainTextResponse`` whose body is the Markdown conversion of the page.
+
+    Raises:
+        HTTPException: 400 when the body is not valid JSON, is not an object,
+            or has no usable ``"url"`` string; 500 when fetching or
+            converting the page fails.
+    """
+    usage = "Please provide a URL in the request body as JSON: {'url': 'your_url'}"
+    try:
+        try:
+            request_data = await request.json()
+        except ValueError as exc:
+            # Malformed JSON is the client's mistake, not a server error.
+            raise HTTPException(status_code=400, detail=usage) from exc
+        # A JSON array/number/string body has no .get(); treat it as a missing URL.
+        url = request_data.get("url") if isinstance(request_data, dict) else None
+        if not url or not isinstance(url, str):
+            raise HTTPException(status_code=400, detail=usage)
+        decoded_url = unquote(url)
+        # NOTE(review): the URL is fetched server-side with no scheme/host
+        # restriction; consider an allow-list to limit SSRF exposure.
+        # NOTE(review): scraper() is called synchronously inside an async
+        # handler; if it does blocking I/O it stalls the event loop.
+        markdown_output = convert_html_to_md(scraper(decoded_url))
+        return PlainTextResponse(markdown_output)
+    except HTTPException:
+        # Let deliberate HTTP errors through instead of rewrapping them as 500.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
+# GET /read/{url}: the target URL is taken from the remainder of the request path.
+@app.get("/read/{url:path}", response_class=PlainTextResponse)
+async def get_markdown_get(url: str):
+    """Fetch the URL given in the request path and return the page as Markdown.
+
+    Args:
+        url: Everything after ``/read/`` in the request path; it is
+            percent-decoded with ``unquote`` before being fetched.
+
+    Returns:
+        A ``PlainTextResponse`` whose body is the Markdown conversion of the page.
+
+    Raises:
+        HTTPException: 400 when the path carries no URL; 500 when fetching
+            or converting the page fails.
+    """
+    # The ``path`` converter also matches an empty remainder ("/read/");
+    # reject it as a client error, matching the POST endpoint's 400.
+    if not url:
+        raise HTTPException(status_code=400, detail="Please provide a URL in the path: /read/{url}")
+    try:
+        decoded_url = unquote(url)  # URL in path needs unquoting as well
+        # NOTE(review): the URL is fetched server-side with no scheme/host
+        # restriction; consider an allow-list to limit SSRF exposure.
+        markdown_output = convert_html_to_md(scraper(decoded_url))
+        return PlainTextResponse(markdown_output)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
+# Script entry point: serve the app with uvicorn when this file is run directly.
+if __name__ == "__main__":
+    uvicorn.run(
+        app,
+        host="0.0.0.0",  # bind on all interfaces
+        port=7860,  # presumably the port the hosting Space expects -- confirm
+    )