bcci committed on
Commit
0b3c590
·
verified ·
1 Parent(s): dd5e687

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -1
app.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, Request, HTTPException
2
  from fastapi.responses import PlainTextResponse
3
  from urllib.parse import unquote
4
  import uvicorn
5
- from scrapling import Fetcher
6
  from markitdown import MarkItDown
7
  import tempfile
8
  import os
@@ -10,8 +10,13 @@ import os
10
  app = FastAPI()
11
 
12
  fetcher = Fetcher(auto_match=True)
 
13
  md = MarkItDown()
14
 
 
 
 
 
15
  def scraper(url):
16
  html = fetcher.get(url)
17
  return html.prettify()
@@ -45,6 +50,28 @@ async def get_markdown_get(request: Request, url: str):
45
  except Exception as e:
46
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  if __name__ == "__main__":
 
 
 
50
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
2
  from fastapi.responses import PlainTextResponse
3
  from urllib.parse import unquote
4
  import uvicorn
5
+ from scrapling import Fetcher, StealthyFetcher
6
  from markitdown import MarkItDown
7
  import tempfile
8
  import os
 
# Shared module-level objects, each created once at import time.
app = FastAPI()

fetcher = Fetcher(auto_match=True)    # used by scraper()
stealthy_fetcher = StealthyFetcher()  # used by stealthy_scraper()
md = MarkItDown()
15
 
def stealthy_scraper(url):
    """Fetch ``url`` through the module-level ``stealthy_fetcher``.

    Returns whatever ``stealthy_fetcher.fetch`` returns, unmodified.

    NOTE(review): ``scraper`` returns ``.prettify()`` output rather than the
    raw fetch result -- confirm that callers of this function accept the
    object returned here.
    """
    return stealthy_fetcher.fetch(url)
19
+
def scraper(url):
    """Fetch ``url`` with the module-level ``fetcher``.

    Returns the result of calling ``prettify()`` on the fetched page.
    """
    return fetcher.get(url).prettify()
 
50
  except Exception as e:
51
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
52
 
# GET endpoint at /reader/{url:path}; the target URL is carried in the path.
# NOTE(review): an earlier handler in this file appears to use the same
# function name (get_markdown_get) -- consider giving this one a distinct name.
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the page addressed after ``/reader/`` and return it as Markdown.

    The target is taken from the full request URL rather than from the
    ``url`` path parameter (which is not used here) -- presumably so that the
    target's own query string is kept; confirm against the intended clients.

    Args:
        request: Incoming request; ``str(request.url)`` supplies the target.
        url: Path parameter captured by the route; unused.

    Returns:
        A ``PlainTextResponse`` whose body is the output of
        ``convert_html_to_md`` applied to the stealth-fetched page.

    Raises:
        HTTPException: Status 500 for any error raised while extracting,
            fetching or converting; the original exception is chained.
    """
    try:
        # Retrieve the full URL of the request.
        full_url = str(request.url)

        # Keep everything after the FIRST "/reader/". Splitting only once
        # matters: a target URL that itself contains "/reader/" would
        # otherwise be cut off at that point.
        full_url = full_url.split("/reader/", 1)[1]

        # Default to http:// when the caller omitted the scheme.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = convert_html_to_md(stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
71
+
72
 
if __name__ == "__main__":
    # When run as a script, serve the app on all interfaces, port 7860.
    # uvicorn is already imported at module level, so no local import is needed.
    uvicorn.run(app, host="0.0.0.0", port=7860)