bcci committed on
Commit
169e42a
·
verified ·
1 Parent(s): 00d44b1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile
from urllib.parse import unquote

import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import PlainTextResponse
from markitdown import MarkItDown
from scrapling import Fetcher
8
+
9
# ASGI application object; the route decorators in this file register on it.
app = FastAPI()

# Module-level fetcher and converter, created once and reused by every request.
fetcher = Fetcher(auto_match=True)
md = MarkItDown()
13
+
14
def scraper(url):
    """Fetch *url* with the module-level ``fetcher`` and return its prettified markup."""
    return fetcher.get(url).prettify()
17
+
18
def convert_html_to_md(html):
    """Convert an HTML string to Markdown text.

    The converter is given a file path, so the markup is first written to a
    temporary ``.html`` file. That file is removed again whether or not the
    conversion succeeds.

    Args:
        html: The HTML markup to convert, as a ``str``.

    Returns:
        The ``text_content`` of the conversion result.

    Raises:
        Exception: Whatever writing the file or ``md.convert`` raises; the
            temporary file is still cleaned up in that case.
    """
    # delete=False: the file has to outlive the ``with`` block so it can be
    # reopened by path (an open NamedTemporaryFile cannot be reopened on
    # every platform).
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    try:
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        # Leaving the ``with`` block closed (and thereby flushed) the file.
        return md.convert(temp_file.name).text_content
    finally:
        os.remove(temp_file.name)
27
+
28
# POST endpoint to /reader expecting URL in JSON body
@app.post("/reader", response_class=PlainTextResponse)
async def get_markdown_post(request: Request):
    """Return the Markdown rendering of the page named in the JSON request body.

    The body must be a JSON object of the form ``{"url": "<address>"}``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no non-empty string ``url``; 500 when fetching or
            converting the page fails.
    """
    usage = "Please provide a URL in the request body as JSON: {'url': 'your_url'}"

    try:
        request_data = await request.json()
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
        # an unparsable body is the client's mistake, not a server failure.
        raise HTTPException(status_code=400, detail=usage) from exc

    # Valid JSON need not be an object (it may be a list, string, number...).
    url = request_data.get("url") if isinstance(request_data, dict) else None
    if not url or not isinstance(url, str):
        raise HTTPException(status_code=400, detail=usage)

    try:
        markdown_output = convert_html_to_md(scraper(unquote(url)))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
    return PlainTextResponse(markdown_output)
43
+
44
# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(url: str):
    """Return the Markdown rendering of the page whose address follows ``/read/``.

    Any failure while fetching or converting is reported as an HTTP 500 whose
    detail carries the underlying error message.
    """
    try:
        target = unquote(url)  # URL in path needs unquoting as well
        page_markup = scraper(target)
        return PlainTextResponse(convert_html_to_md(page_markup))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {exc}")
53
+
54
+
55
# When executed as a script, serve the app with uvicorn on all network
# interfaces (0.0.0.0), port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)