bcci committed on
Commit
dd86a22
·
verified ·
1 Parent(s): 5dc46f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -13
app.py CHANGED
@@ -3,7 +3,7 @@ from fastapi.responses import PlainTextResponse
3
  from urllib.parse import unquote
4
  import uvicorn
5
  from scrapling import AsyncFetcher, StealthyFetcher
6
- from markitdown import MarkItDown
7
  import tempfile
8
  import os
9
  import asyncio
@@ -13,7 +13,7 @@ app = FastAPI()
13
 
14
  async_fetcher = AsyncFetcher(auto_match=True) # Use AsyncFetcher
15
  stealthy_fetcher = StealthyFetcher() # Keep StealthyFetcher (it handles its own async internally)
16
- md = MarkItDown()
17
 
18
 
19
  async def stealthy_scraper(url):
@@ -37,17 +37,8 @@ async def text_scraper(url):
37
 
38
  async def convert_html_to_md(html):
39
  """Converts HTML to Markdown using MarkItDown (assuming it's synchronous)."""
40
- with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
41
- # Asynchronously write to the temporary file
42
- await asyncio.to_thread(temp_file.write, html.encode('utf-8'))
43
- await asyncio.to_thread(temp_file.flush)
44
- temp_file_path = temp_file.name
45
- # Use asyncio.to_thread for synchronous operations within async functions
46
- md_text = await asyncio.to_thread(md.convert, temp_file_path)
47
- x = md_text.text_content
48
- # Asynchronously remove file
49
- await asyncio.to_thread(os.remove, temp_file_path)
50
- return x
51
 
52
 
53
  @app.get("/read/{url:path}", response_class=PlainTextResponse)
 
3
  from urllib.parse import unquote
4
  import uvicorn
5
  from scrapling import AsyncFetcher, StealthyFetcher
6
+ from markitdown._markitdown import HtmlConverter
7
  import tempfile
8
  import os
9
  import asyncio
 
13
 
14
  async_fetcher = AsyncFetcher(auto_match=True) # Use AsyncFetcher
15
  stealthy_fetcher = StealthyFetcher() # Keep StealthyFetcher (it handles its own async internally)
16
+ md = HtmlConverter()
17
 
18
 
19
  async def stealthy_scraper(url):
 
37
 
38
  async def convert_html_to_md(html):
39
  """Converts HTML to Markdown using MarkItDown (assuming it's synchronous)."""
40
+ md_text = await asyncio.to_thread(md._convert, html)
41
+ return md_text.text_content
 
 
 
 
 
 
 
 
 
42
 
43
 
44
  @app.get("/read/{url:path}", response_class=PlainTextResponse)