bcci committed on
Commit
e550b21
·
verified ·
1 Parent(s): 90e2778

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -38
app.py CHANGED
@@ -2,82 +2,93 @@ from fastapi import FastAPI, Request, HTTPException
2
  from fastapi.responses import PlainTextResponse
3
  from urllib.parse import unquote
4
  import uvicorn
5
- from scrapling import AsyncFetcher, StealthyFetcher
6
  from markitdown import MarkItDown
7
  import tempfile
8
  import os
 
9
 
10
  app = FastAPI()
11
 
12
- fetcher = AsyncFetcher(auto_match=True)
13
- stealthy_fetcher = StealthyFetcher()
14
  md = MarkItDown()
15
 
 
16
  async def stealthy_scraper(url):
17
- html = await stealthy_fetcher.async_fetch(url)
 
 
18
  return html.html_content
19
 
 
20
  async def scraper(url):
21
- html = await fetcher.get(url)
 
22
  return html.html_content
23
 
24
- def convert_html_to_md(html):
25
- with tempfile.NamedTemporaryFile(suffix=".html" ,delete=False) as temp_file:
26
- temp_file.write(html.encode('utf-8'))
27
- temp_file.flush()
 
 
 
28
  temp_file_path = temp_file.name
29
- print(temp_file_path)
30
- x = md.convert(temp_file_path).text_content
31
- os.remove(temp_file_path)
 
 
32
  return x
33
 
34
- # GET endpoint to /read/{url:path} expecting URL in path
35
  @app.get("/read/{url:path}", response_class=PlainTextResponse)
36
  async def get_markdown_get(request: Request, url: str):
 
37
  try:
38
- # Retrieve the full path from the request
39
  full_url = str(request.url)
40
-
41
- # Extract the part of the URL after `/read/`
42
  full_url = full_url.split("/read/")[1]
43
-
44
- # Additional optional URL validation if needed
45
  if not full_url.startswith(('http://', 'https://')):
46
  full_url = f"http://{full_url}"
47
-
48
- markdown_output = convert_html_to_md(scraper(full_url))
49
  return PlainTextResponse(markdown_output)
50
  except Exception as e:
51
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
52
 
53
- # GET endpoint to /read/{url:path} expecting URL in path
54
  @app.get("/reader/{url:path}", response_class=PlainTextResponse)
55
- async def get_markdown_get(request: Request, url: str):
 
56
  try:
57
- # Retrieve the full path from the request
58
  full_url = str(request.url)
59
-
60
- # Extract the part of the URL after `/read/`
61
  full_url = full_url.split("/reader/")[1]
62
-
63
- # Additional optional URL validation if needed
64
  if not full_url.startswith(('http://', 'https://')):
65
  full_url = f"http://{full_url}"
66
-
67
- markdown_output = convert_html_to_md(stealthy_scraper(full_url))
68
  return PlainTextResponse(markdown_output)
69
  except Exception as e:
70
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
71
 
72
 
73
  if __name__ == "__main__":
74
- import subprocess
75
 
76
- try:
77
- subprocess.run(['camoufox', 'fetch'], check=True)
78
- print("Command executed successfully!")
79
- except Exception as e:
80
- print(f"An unexpected error occurred: {e}")
81
-
82
- import uvicorn
83
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
2
  from fastapi.responses import PlainTextResponse
3
  from urllib.parse import unquote
4
  import uvicorn
5
+ from scrapling import AsyncFetcher, StealthyFetcher # Import AsyncFetcher
6
  from markitdown import MarkItDown
7
  import tempfile
8
  import os
9
+ import asyncio
10
 
11
# Module-level singletons shared by every request handler.
app = FastAPI()

async_fetcher = AsyncFetcher(auto_match=True)  # plain async HTTP fetcher
stealthy_fetcher = StealthyFetcher()  # browser-backed fetcher; manages its own async internals
md = MarkItDown()  # HTML -> Markdown converter
16
 
17
+
18
async def stealthy_scraper(url):
    """Fetch *url* with the browser-backed ``stealthy_fetcher`` and return its HTML.

    Used by the /reader endpoint for pages that block plain HTTP clients.
    """
    page = await stealthy_fetcher.async_fetch(url)
    return page.html_content
23
 
24
+
25
async def scraper(url):
    """Fetch *url* with the plain ``async_fetcher`` and return its HTML.

    Used by the /read endpoint; no browser emulation involved.
    """
    response = await async_fetcher.get(url)
    return response.html_content
29
 
30
+
31
async def convert_html_to_md(html):
    """Convert an HTML string to Markdown text via the module-level MarkItDown.

    MarkItDown converts from a file path, so the HTML is written to a
    temporary file first. Blocking work (file I/O, the conversion itself)
    is pushed to a thread with ``asyncio.to_thread`` so the event loop
    stays responsive.

    Args:
        html: The HTML document as a string.

    Returns:
        The Markdown rendering (``text_content`` of the MarkItDown result).
    """
    # delete=False so the file survives the ``with`` block; md.convert needs
    # a closed, fully-flushed file on disk (closing the file flushes it).
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        temp_file_path = temp_file.name
        await asyncio.to_thread(temp_file.write, html.encode('utf-8'))
    try:
        md_text = await asyncio.to_thread(md.convert, temp_file_path)
        return md_text.text_content
    finally:
        # Always remove the temp file, even when md.convert raises —
        # otherwise every failed conversion leaks a file in the temp dir.
        await asyncio.to_thread(os.remove, temp_file_path)
44
 
45
+
46
  @app.get("/read/{url:path}", response_class=PlainTextResponse)
47
  async def get_markdown_get(request: Request, url: str):
48
+ """Handles GET requests to /read/{url}, returning Markdown content."""
49
  try:
 
50
  full_url = str(request.url)
 
 
51
  full_url = full_url.split("/read/")[1]
52
+
 
53
  if not full_url.startswith(('http://', 'https://')):
54
  full_url = f"http://{full_url}"
55
+
56
+ markdown_output = await convert_html_to_md(await scraper(full_url))
57
  return PlainTextResponse(markdown_output)
58
  except Exception as e:
59
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
60
 
61
+
62
  @app.get("/reader/{url:path}", response_class=PlainTextResponse)
63
+ async def get_markdown_get_stealthy(request: Request, url: str): # Renamed for clarity
64
+ """Handles GET requests to /reader/{url}, using StealthyFetcher."""
65
  try:
 
66
  full_url = str(request.url)
 
 
67
  full_url = full_url.split("/reader/")[1]
68
+
 
69
  if not full_url.startswith(('http://', 'https://')):
70
  full_url = f"http://{full_url}"
71
+
72
+ markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
73
  return PlainTextResponse(markdown_output)
74
  except Exception as e:
75
  raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
76
 
77
 
78
  if __name__ == "__main__":
79
+ async def run_app():
80
 
81
+ #This part is only needed for StealthyFetcher to work.
82
+ try:
83
+ process = await asyncio.create_subprocess_exec('camoufox', 'fetch')
84
+ await process.wait() #Wait for camoufox to initialize
85
+ print("Camoufox initialized successfully!")
86
+ except Exception as e:
87
+ print(f"An unexpected error occurred starting camoufox: {e}")
88
+
89
+
90
+ config = uvicorn.Config(app, host="0.0.0.0", port=7860)
91
+ server = uvicorn.Server(config)
92
+ await server.serve()
93
+
94
+ asyncio.run(run_app())