Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import tempfile
from urllib.parse import unquote

import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import PlainTextResponse
from markitdown import MarkItDown
from scrapling import Fetcher
|
8 |
+
|
9 |
+
# ASGI application object; the route decorators below register on it.
app = FastAPI()

# Module-level helpers shared by every request: the converter used to turn
# HTML into Markdown, and the scrapling fetcher used to download pages.
md = MarkItDown()
fetcher = Fetcher(auto_match=True)
|
13 |
+
|
14 |
+
def scraper(url):
    """Fetch ``url`` with the shared ``fetcher`` and return the page markup.

    The return value is whatever ``prettify()`` yields for the fetched page.
    """
    return fetcher.get(url).prettify()
|
17 |
+
|
18 |
+
def convert_html_to_md(html):
    """Convert an HTML string to Markdown text.

    The HTML is written, UTF-8 encoded, to a temporary ``.html`` file whose
    path is then handed to the shared ``md`` converter.

    Args:
        html: The HTML document as a ``str``.

    Returns:
        The ``text_content`` of the conversion result.

    Raises:
        Any exception raised by ``md.convert``; the temporary file is removed
        before the exception propagates.
    """
    # delete=False keeps the file on disk after the ``with`` block closes it,
    # so the converter can open it by path (a still-open NamedTemporaryFile
    # cannot be reopened by name on every platform).
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        temp_file.write(html.encode("utf-8"))
        temp_file_path = temp_file.name

    # Trace the temp path on stdout.
    print(temp_file_path)
    try:
        return md.convert(temp_file_path).text_content
    finally:
        # Clean up even when the conversion fails, so failed requests do not
        # leave stray files behind in the temp directory.
        os.remove(temp_file_path)
|
27 |
+
|
28 |
+
# POST endpoint to /reader expecting URL in JSON body
|
29 |
+
@app.post("/reader", response_class=PlainTextResponse)
async def get_markdown_post(request: Request):
    """Return the Markdown rendering of the page named in the JSON body.

    Expects a request body of the form ``{"url": "<page url>"}``.

    Args:
        request: The incoming request; its body is parsed as JSON.

    Returns:
        A ``PlainTextResponse`` carrying the Markdown text.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no non-empty string ``url``; 500 when fetching or
            converting the page fails.
    """
    bad_request_detail = "Please provide a URL in the request body as JSON: {'url': 'your_url'}"

    # A malformed body is the client's mistake: answer 400 rather than 500.
    # JSON decode errors and byte-decoding errors are both ValueError subclasses.
    try:
        request_data = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=bad_request_detail) from exc

    # Valid JSON may still be a list/number/string, and "url" may be a
    # non-string value; neither can be fetched.
    url = request_data.get("url") if isinstance(request_data, dict) else None
    if not url or not isinstance(url, str):
        raise HTTPException(status_code=400, detail=bad_request_detail)

    # NOTE(review): unquote() also decodes percent-escapes that may be a
    # legitimate part of the target URL — confirm this is intended.
    decoded_url = unquote(url)

    # NOTE(review): scraper() and convert_html_to_md() are plain synchronous
    # calls made from an async handler — confirm that blocking here is acceptable.
    try:
        markdown_output = convert_html_to_md(scraper(decoded_url))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
    return PlainTextResponse(markdown_output)
|
43 |
+
|
44 |
+
# GET endpoint to /read/{url:path} expecting URL in path
|
45 |
+
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(url: str):
    """Return the Markdown rendering of the page whose URL follows ``/read/``.

    Args:
        url: Everything after ``/read/`` in the request path.

    Returns:
        A ``PlainTextResponse`` carrying the Markdown text.

    Raises:
        HTTPException: 400 when nothing follows ``/read/``; 500 when fetching
            or converting the page fails.
    """
    # An empty path segment cannot be fetched; report it as a client error
    # (matching the POST endpoint) instead of letting the fetch fail with 500.
    if not url:
        raise HTTPException(status_code=400, detail="Please provide a URL in the path: /read/{url}")

    try:
        decoded_url = unquote(url)  # URL in path needs unquoting as well
        markdown_output = convert_html_to_md(scraper(decoded_url))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
    return PlainTextResponse(markdown_output)
|
53 |
+
|
54 |
+
|
55 |
+
# Script entry point: when the file is executed directly, serve ``app``
# with uvicorn.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",  # listen on all interfaces
        port=7860,
    )
|