bcci committed
Commit 9af1ada · verified · 1 Parent(s): 2150fcd

Update app.py

Files changed (1)
  1. app.py +15 -11
app.py CHANGED
@@ -20,13 +20,22 @@ async def stealthy_scraper(url):
     """Fetches HTML content using StealthyFetcher (already async internally)."""
     # Use await if async_fetch is available, otherwise keep .fetch
     html = await stealthy_fetcher.async_fetch(url)  # Corrected to async_fetch
-    return html
+    return html.html_content
 
 
 async def scraper(url):
     """Fetches HTML content using AsyncFetcher."""
     html = await async_fetcher.get(url)  # Use await for async operations
-    return html
+    return html.html_content
+
+
+async def text_scraper(url):
+    """Fetches HTML content using AsyncFetcher, then extracts plain text."""
+    html = await async_fetcher.get(url)  # Use await for async operations
+    text = html.get_all_text()
+    text = re.sub(r'\n+', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
+    return text
 
 
 async def convert_html_to_md(html):
@@ -43,12 +52,6 @@ async def convert_html_to_md(html):
     await asyncio.to_thread(os.remove, temp_file_path)
     return x
 
-async def convert_html_to_text(html):
-    """Converts HTML to Text using re."""
-    html = re.sub(r'\n+', ' ', html)
-    html = re.sub(r'\s+', ' ', html)
-    return html
-
 
 @app.get("/read/{url:path}", response_class=PlainTextResponse)
 async def get_markdown_get(request: Request, url: str):
@@ -60,13 +63,14 @@ async def get_markdown_get(request: Request, url: str):
         if not full_url.startswith(('http://', 'https://')):
             full_url = f"http://{full_url}"
 
-        markdown_output = await convert_html_to_md(await scraper(full_url).html_content)
+        markdown_output = await convert_html_to_md(await scraper(full_url))
         return PlainTextResponse(markdown_output)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
+
 
 @app.get("/read/text/{url:path}", response_class=PlainTextResponse)
-async def get_markdown_get(request: Request, url: str):
+async def get_text_get(request: Request, url: str):
     """Handles GET requests to /read/{url}, returning Markdown content."""
     try:
         full_url = str(request.url)
@@ -75,7 +79,7 @@ async def get_markdown_get(request: Request, url: str):
         if not full_url.startswith(('http://', 'https://')):
             full_url = f"http://{full_url}"
 
-        markdown_output = await convert_html_to_md(await scraper(full_url).get_all_text())
+        markdown_output = await text_scraper(url)
         return PlainTextResponse(markdown_output)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
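
For context, a minimal sketch of how the two routes touched by this commit could be exercised once the app is running. Only the /read/{url:path} and /read/text/{url:path} paths come from app.py; the base URL, port, and target site below are assumptions for illustration, not part of the commit.

# Hypothetical smoke test for the two endpoints changed above.
# Assumes the FastAPI app is served locally (e.g. `uvicorn app:app --port 7860`);
# the port and target site are illustrative only.
import httpx

BASE = "http://localhost:7860"
TARGET = "example.com"  # passed through the {url:path} segment

with httpx.Client(timeout=30) as client:
    # /read/{url:path}: Markdown built by convert_html_to_md(await scraper(...))
    md = client.get(f"{BASE}/read/{TARGET}")
    print(md.status_code, md.text[:200])

    # /read/text/{url:path}: whitespace-collapsed text from text_scraper(...)
    txt = client.get(f"{BASE}/read/text/{TARGET}")
    print(txt.status_code, txt.text[:200])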