Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,82 +2,93 @@ from fastapi import FastAPI, Request, HTTPException
|
|
2 |
from fastapi.responses import PlainTextResponse
|
3 |
from urllib.parse import unquote
|
4 |
import uvicorn
|
5 |
-
from scrapling import AsyncFetcher, StealthyFetcher
|
6 |
from markitdown import MarkItDown
|
7 |
import tempfile
|
8 |
import os
|
|
|
9 |
|
10 |
app = FastAPI()
|
11 |
|
12 |
-
|
13 |
-
stealthy_fetcher = StealthyFetcher()
|
14 |
md = MarkItDown()
|
15 |
|
|
|
16 |
async def stealthy_scraper(url):
|
17 |
-
|
|
|
|
|
18 |
return html.html_content
|
19 |
|
|
|
20 |
async def scraper(url):
|
21 |
-
|
|
|
22 |
return html.html_content
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
28 |
temp_file_path = temp_file.name
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
32 |
return x
|
33 |
|
34 |
-
|
35 |
@app.get("/read/{url:path}", response_class=PlainTextResponse)
|
36 |
async def get_markdown_get(request: Request, url: str):
|
|
|
37 |
try:
|
38 |
-
# Retrieve the full path from the request
|
39 |
full_url = str(request.url)
|
40 |
-
|
41 |
-
# Extract the part of the URL after `/read/`
|
42 |
full_url = full_url.split("/read/")[1]
|
43 |
-
|
44 |
-
# Additional optional URL validation if needed
|
45 |
if not full_url.startswith(('http://', 'https://')):
|
46 |
full_url = f"http://{full_url}"
|
47 |
-
|
48 |
-
markdown_output = convert_html_to_md(scraper(full_url))
|
49 |
return PlainTextResponse(markdown_output)
|
50 |
except Exception as e:
|
51 |
raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
|
52 |
|
53 |
-
|
54 |
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
|
55 |
-
async def
|
|
|
56 |
try:
|
57 |
-
# Retrieve the full path from the request
|
58 |
full_url = str(request.url)
|
59 |
-
|
60 |
-
# Extract the part of the URL after `/read/`
|
61 |
full_url = full_url.split("/reader/")[1]
|
62 |
-
|
63 |
-
# Additional optional URL validation if needed
|
64 |
if not full_url.startswith(('http://', 'https://')):
|
65 |
full_url = f"http://{full_url}"
|
66 |
-
|
67 |
-
markdown_output = convert_html_to_md(stealthy_scraper(full_url))
|
68 |
return PlainTextResponse(markdown_output)
|
69 |
except Exception as e:
|
70 |
raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
|
71 |
|
72 |
|
73 |
if __name__ == "__main__":
|
74 |
-
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from fastapi.responses import PlainTextResponse
|
3 |
from urllib.parse import unquote
|
4 |
import uvicorn
|
5 |
+
from scrapling import AsyncFetcher, StealthyFetcher # Import AsyncFetcher
|
6 |
from markitdown import MarkItDown
|
7 |
import tempfile
|
8 |
import os
|
9 |
+
import asyncio
|
10 |
|
11 |
app = FastAPI()
|
12 |
|
13 |
+
async_fetcher = AsyncFetcher(auto_match=True) # Use AsyncFetcher
|
14 |
+
stealthy_fetcher = StealthyFetcher() # Keep StealthyFetcher (it handles its own async internally)
|
15 |
md = MarkItDown()
|
16 |
|
17 |
+
|
18 |
async def stealthy_scraper(url):
    """Fetch a page with the stealth (camoufox-backed) fetcher and return its HTML.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The raw HTML content of the fetched page.
    """
    # StealthyFetcher exposes an async entry point; await it directly.
    response = await stealthy_fetcher.async_fetch(url)
    return response.html_content
|
23 |
|
24 |
+
|
25 |
async def scraper(url):
    """Fetch a page with the plain async fetcher and return its HTML.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The raw HTML content of the fetched page.
    """
    response = await async_fetcher.get(url)
    return response.html_content
|
29 |
|
30 |
+
|
31 |
+
async def convert_html_to_md(html):
    """Convert an HTML string to Markdown text via MarkItDown.

    MarkItDown converts from a file path, so the HTML is first written to a
    temporary file. All blocking work (file I/O, the synchronous ``md.convert``
    call, file removal) is pushed onto worker threads with ``asyncio.to_thread``
    so the event loop is never blocked.

    Args:
        html: HTML document as a string.

    Returns:
        The Markdown text content produced by MarkItDown.

    Raises:
        Whatever ``md.convert`` raises on malformed input; the temporary file
        is removed even in that case.
    """
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        # Blocking writes go to a worker thread to keep the loop responsive.
        await asyncio.to_thread(temp_file.write, html.encode('utf-8'))
        await asyncio.to_thread(temp_file.flush)
        temp_file_path = temp_file.name
    # The handle is closed here; converting after close also works on Windows,
    # where a still-open NamedTemporaryFile cannot be reopened by name.
    try:
        md_text = await asyncio.to_thread(md.convert, temp_file_path)
        return md_text.text_content
    finally:
        # Guarantee temp-file cleanup even when conversion raises.
        await asyncio.to_thread(os.remove, temp_file_path)
|
44 |
|
45 |
+
|
46 |
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Handles GET requests to /read/{url}, returning Markdown content."""
    try:
        # Recover the target from everything after the /read/ prefix of the
        # full request URL (preserves query strings the path param would lose).
        target = str(request.url).split("/read/")[1]
        # Default to plain HTTP when the caller omitted a scheme.
        if not target.startswith(('http://', 'https://')):
            target = f"http://{target}"
        page_html = await scraper(target)
        markdown_output = await convert_html_to_md(page_html)
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Any failure (fetch, split, conversion) surfaces as a 500.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
|
60 |
|
61 |
+
|
62 |
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get_stealthy(request: Request, url: str):
    """Handles GET requests to /reader/{url}, using StealthyFetcher."""
    try:
        # Recover the target from everything after the /reader/ prefix of the
        # full request URL (preserves query strings the path param would lose).
        target = str(request.url).split("/reader/")[1]
        # Default to plain HTTP when the caller omitted a scheme.
        if not target.startswith(('http://', 'https://')):
            target = f"http://{target}"
        page_html = await stealthy_scraper(target)
        markdown_output = await convert_html_to_md(page_html)
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Any failure (fetch, split, conversion) surfaces as a 500.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
|
76 |
|
77 |
|
78 |
if __name__ == "__main__":
    async def run_app():
        """Warm up camoufox (used by StealthyFetcher) and then serve the app."""
        # Best-effort initialization: the plain /read endpoint works without
        # camoufox, so a failure here is reported but not fatal.
        try:
            proc = await asyncio.create_subprocess_exec('camoufox', 'fetch')
            # Block until the fetch/initialization subprocess finishes.
            await proc.wait()
            print("Camoufox initialized successfully!")
        except Exception as e:
            print(f"An unexpected error occurred starting camoufox: {e}")

        # Run uvicorn programmatically so it shares this event loop.
        server = uvicorn.Server(uvicorn.Config(app, host="0.0.0.0", port=7860))
        await server.serve()

    asyncio.run(run_app())
|