bcci committed
Commit 2150fcd · verified · 1 Parent(s): e550b21

Update app.py

Files changed (1)
1. app.py +26 -4
app.py CHANGED
@@ -7,6 +7,7 @@ from markitdown import MarkItDown
 import tempfile
 import os
 import asyncio
+import re
 
 app = FastAPI()
 
@@ -19,13 +20,13 @@ async def stealthy_scraper(url):
     """Fetches HTML content using StealthyFetcher (already async internally)."""
     # Use await if async_fetch is available, otherwise keep .fetch
     html = await stealthy_fetcher.async_fetch(url)  # Corrected to async_fetch
-    return html.html_content
+    return html
 
 
 async def scraper(url):
     """Fetches HTML content using AsyncFetcher."""
     html = await async_fetcher.get(url)  # Use await for async operations
-    return html.html_content
+    return html
 
 
 async def convert_html_to_md(html):
@@ -42,6 +43,12 @@ async def convert_html_to_md(html):
     await asyncio.to_thread(os.remove, temp_file_path)
     return x
 
+async def convert_html_to_text(html):
+    """Converts HTML to Text using re."""
+    html = re.sub(r'\n+', ' ', html)
+    html = re.sub(r'\s+', ' ', html)
+    return html
+
 
 @app.get("/read/{url:path}", response_class=PlainTextResponse)
 async def get_markdown_get(request: Request, url: str):
@@ -53,7 +60,22 @@ async def get_markdown_get(request: Request, url: str):
         if not full_url.startswith(('http://', 'https://')):
             full_url = f"http://{full_url}"
 
-        markdown_output = await convert_html_to_md(await scraper(full_url))
+        markdown_output = await convert_html_to_md(await scraper(full_url).html_content)
+        return PlainTextResponse(markdown_output)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
+
+@app.get("/read/text/{url:path}", response_class=PlainTextResponse)
+async def get_markdown_get(request: Request, url: str):
+    """Handles GET requests to /read/{url}, returning Markdown content."""
+    try:
+        full_url = str(request.url)
+        full_url = full_url.split("/read/")[1]
+
+        if not full_url.startswith(('http://', 'https://')):
+            full_url = f"http://{full_url}"
+
+        markdown_output = await convert_html_to_md(await scraper(full_url).get_all_text())
         return PlainTextResponse(markdown_output)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
@@ -69,7 +91,7 @@ async def get_markdown_get_stealthy(request: Request, url: str):  # Renamed for
         if not full_url.startswith(('http://', 'https://')):
             full_url = f"http://{full_url}"
 
-        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
+        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url).html_content)
         return PlainTextResponse(markdown_output)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
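
Note on the new call pattern "await scraper(full_url).html_content": attribute access binds tighter than await, so this expression reads html_content from the un-awaited coroutine object and would raise AttributeError at runtime; the coroutine has to be awaited first, e.g. "(await scraper(full_url)).html_content". A minimal, self-contained sketch of the distinction; the Page class, the stand-in scraper and the example URL are hypothetical fixtures, not part of this commit:

import asyncio

class Page:
    """Hypothetical stand-in for the object returned by scraper() in app.py."""
    html_content = "<html><body>hello</body></html>"

async def scraper(url):
    # Stand-in fetcher: returns a Page object, mirroring scraper() in the diff above.
    return Page()

async def main():
    page = await scraper("http://example.com")   # await first...
    print(page.html_content)                     # ...then read the attribute
    # await scraper("http://example.com").html_content
    # ^ would fail: 'coroutine' object has no attribute 'html_content'

asyncio.run(main())

The same applies to "await stealthy_scraper(full_url).html_content" and "await scraper(full_url).get_all_text()"; awaiting into a local variable as above keeps the behaviour equivalent to the old "return html.html_content".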
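
A second note on the new /read/text/{url:path} route: as committed it reuses the handler name get_markdown_get, keeps the Markdown docstring, calls convert_html_to_md rather than the convert_html_to_text helper added in this commit, and splits the URL on "/read/", so a request to /read/text/example.com would be fetched as http://text/example.com. Below is a hedged sketch only, not the committed code, of one way the text route could be wired to the new helper. It assumes the imports, app object, scraper() and convert_html_to_text() already defined in app.py above; the split on "/read/text/" and the handler name get_text_get are invented for illustration:

@app.get("/read/text/{url:path}", response_class=PlainTextResponse)
async def get_text_get(request: Request, url: str):
    """Handles GET requests to /read/text/{url}, returning plain text."""
    try:
        # Split on the full prefix so the scheme check sees only the target URL.
        full_url = str(request.url).split("/read/text/")[1]
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        page = await scraper(full_url)                 # await before attribute access
        text_output = await convert_html_to_text(page.html_content)
        return PlainTextResponse(text_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")

Reusing the name get_markdown_get does not break routing (FastAPI registers each route when its decorator runs), but distinct handler names keep tracebacks and the generated OpenAPI docs readable.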