Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ from fastapi.responses import PlainTextResponse
|
|
3 |
from urllib.parse import unquote
|
4 |
import uvicorn
|
5 |
from scrapling import AsyncFetcher, StealthyFetcher
|
6 |
-
from markitdown import MarkItDown
|
7 |
import tempfile
|
8 |
import os
|
9 |
import asyncio
|
@@ -13,7 +13,7 @@ app = FastAPI()
|
|
13 |
|
14 |
async_fetcher = AsyncFetcher(auto_match=True) # Use AsyncFetcher
|
15 |
stealthy_fetcher = StealthyFetcher() # Keep StealthyFetcher (it handles its own async internally)
|
16 |
-
md = MarkItDown()
|
17 |
|
18 |
|
19 |
async def stealthy_scraper(url):
|
@@ -37,17 +37,8 @@ async def text_scraper(url):
|
|
37 |
|
38 |
async def convert_html_to_md(html):
|
39 |
"""Converts HTML to Markdown using MarkItDown (assuming it's synchronous)."""
|
40 |
-
|
41 |
-
|
42 |
-
await asyncio.to_thread(temp_file.write, html.encode('utf-8'))
|
43 |
-
await asyncio.to_thread(temp_file.flush)
|
44 |
-
temp_file_path = temp_file.name
|
45 |
-
# Use asyncio.to_thread for synchronous operations within async functions
|
46 |
-
md_text = await asyncio.to_thread(md.convert, temp_file_path)
|
47 |
-
x = md_text.text_content
|
48 |
-
# Asynchronously remove file
|
49 |
-
await asyncio.to_thread(os.remove, temp_file_path)
|
50 |
-
return x
|
51 |
|
52 |
|
53 |
@app.get("/read/{url:path}", response_class=PlainTextResponse)
|
|
|
3 |
from urllib.parse import unquote
|
4 |
import uvicorn
|
5 |
from scrapling import AsyncFetcher, StealthyFetcher
|
6 |
+
from markitdown._markitdown import HtmlConverter
|
7 |
import tempfile
|
8 |
import os
|
9 |
import asyncio
|
|
|
13 |
|
14 |
async_fetcher = AsyncFetcher(auto_match=True) # Use AsyncFetcher
|
15 |
stealthy_fetcher = StealthyFetcher() # Keep StealthyFetcher (it handles its own async internally)
|
16 |
+
md = HtmlConverter()
|
17 |
|
18 |
|
19 |
async def stealthy_scraper(url):
|
|
|
37 |
|
38 |
async def convert_html_to_md(html):
|
39 |
"""Converts HTML to Markdown using MarkItDown (assuming it's synchronous)."""
|
40 |
+
md_text = await asyncio.to_thread(md._convert, html)
|
41 |
+
return md_text.text_content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
|
44 |
@app.get("/read/{url:path}", response_class=PlainTextResponse)
|