File size: 3,931 Bytes
be31cd7
 
169e42a
 
7b5f305
dd86a22
169e42a
 
e550b21
2150fcd
169e42a
 
 
e550b21
 
dd86a22
169e42a
e550b21
90e2778
e550b21
 
 
9af1ada
0b3c590
e550b21
90e2778
e550b21
 
9af1ada
 
 
 
 
5dc46f9
 
169e42a
e550b21
 
 
dd86a22
 
169e42a
e550b21
169e42a
dd5e687
e550b21
169e42a
51bbabe
 
e550b21
51bbabe
 
e550b21
48f0eb9
2150fcd
 
 
9af1ada
2150fcd
863101c
9af1ada
2150fcd
 
 
e4bb89b
2150fcd
 
 
 
06eae38
 
169e42a
 
 
e550b21
0b3c590
e550b21
 
0b3c590
 
 
e550b21
0b3c590
 
e550b21
48f0eb9
0b3c590
 
 
 
169e42a
 
e550b21
fe8c37e
e550b21
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import AsyncFetcher, StealthyFetcher 
from markitdown._markitdown import HtmlConverter
import tempfile
import os
import asyncio
import re

# ASGI application object; the route handlers in this module register on it.
app = FastAPI()

# Shared instances created once at import time and reused by every request.
async_fetcher = AsyncFetcher(auto_match=True)  # awaited through .get() by the plain scrapers
stealthy_fetcher = StealthyFetcher()  # awaited through .async_fetch() by the stealthy scraper
md = HtmlConverter()  # HTML-to-Markdown converter used by convert_html_to_md


async def stealthy_scraper(url):
    """Return the HTML of ``url`` as fetched by the shared StealthyFetcher.

    The page is requested through the fetcher's ``async_fetch`` coroutine and
    the ``html_content`` of the result is handed back to the caller.
    """
    return (await stealthy_fetcher.async_fetch(url)).html_content


async def scraper(url):
    """Return the HTML of ``url`` as fetched by the shared AsyncFetcher."""
    response = await async_fetcher.get(url)
    page_html = response.html_content
    return page_html
    

async def text_scraper(url):
    """Fetch ``url`` with the shared AsyncFetcher and return its text on one line.

    The text comes from the response's ``get_all_text()``. Every run of
    whitespace (spaces, tabs, newlines, ...) is collapsed to a single space
    and leading/trailing whitespace is stripped.
    """
    page = await async_fetcher.get(url)
    # ``\s`` already matches newlines, so a single substitution is enough;
    # a separate ``\n+`` pass beforehand would not change the result.
    return re.sub(r'\s+', ' ', page.get_all_text()).strip()


async def convert_html_to_md(html):
    """Convert an HTML string to Markdown and return the Markdown text.

    The converter's ``_convert`` is dispatched with ``asyncio.to_thread`` and
    the ``text_content`` of its result is returned.
    """
    conversion = asyncio.to_thread(md._convert, html)
    result = await conversion
    return result.text_content


@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Handle GET /read/{url}: fetch the target page and return it as Markdown.

    The target is cut out of the full request URL (everything after the
    route prefix) instead of being read from the ``url`` path parameter.
    NOTE(review): presumably this is done to keep the target's query string;
    confirm before switching to ``url``.

    Args:
        request: Incoming request; only ``request.url`` is read.
        url: Path parameter captured by the route; not used directly.

    Returns:
        PlainTextResponse containing the Markdown rendering of the page.

    Raises:
        HTTPException: Status 500 for any failure while extracting the
            target, fetching it, or converting it.
    """
    try:
        # Split only on the FIRST "/read/" (the route prefix). An unbounded
        # split would truncate targets that contain "/read/" themselves,
        # e.g. https://example.com/read/post -> https://example.com.
        full_url = str(request.url).split("/read/", 1)[1]

        # Targets given without a scheme default to plain HTTP.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = await convert_html_to_md(await scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
        

@app.get("/read_text/{url:path}", response_class=PlainTextResponse)
async def get_text_get(request: Request, url: str):
    """Handle GET /read_text/{url}: fetch the target page and return its plain text.

    The target is cut out of the full request URL (everything after the
    route prefix) instead of being read from the ``url`` path parameter.
    NOTE(review): presumably this is done to keep the target's query string;
    confirm before switching to ``url``.

    Args:
        request: Incoming request; only ``request.url`` is read.
        url: Path parameter captured by the route; not used directly.

    Returns:
        PlainTextResponse containing the whitespace-collapsed page text.

    Raises:
        HTTPException: Status 500 for any failure while extracting the
            target or fetching it.
    """
    try:
        # Split only on the FIRST "/read_text/" (the route prefix) so that a
        # target containing the same segment is not truncated.
        full_url = str(request.url).split("/read_text/", 1)[1]

        # Targets given without a scheme default to plain HTTP.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        text_output = await text_scraper(full_url)
        return PlainTextResponse(text_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


@app.get("/reader/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get_stealthy(request: Request, url: str):
    """Handle GET /reader/{url}: fetch the target with StealthyFetcher, return Markdown.

    The target is cut out of the full request URL (everything after the
    route prefix) instead of being read from the ``url`` path parameter.
    NOTE(review): presumably this is done to keep the target's query string;
    confirm before switching to ``url``.

    Args:
        request: Incoming request; only ``request.url`` is read.
        url: Path parameter captured by the route; not used directly.

    Returns:
        PlainTextResponse containing the Markdown rendering of the page.

    Raises:
        HTTPException: Status 500 for any failure while extracting the
            target, fetching it, or converting it.
    """
    try:
        # Split only on the FIRST "/reader/" (the route prefix) so that a
        # target containing the same segment is not truncated.
        full_url = str(request.url).split("/reader/", 1)[1]

        # Targets given without a scheme default to plain HTTP.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = await convert_html_to_md(await stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


if __name__ == "__main__":
    async def run_app():
        """Run ``camoufox fetch`` once, then serve the app with uvicorn on 0.0.0.0:7860."""

        #This part is only needed for StealthyFetcher to work.
        try:
            process = await asyncio.create_subprocess_exec('camoufox', 'fetch')
            # wait() returns the exit status; only report success when the
            # command actually succeeded.
            exit_code = await process.wait()
            if exit_code == 0:
                print("Camoufox initialized successfully!")
            else:
                print(f"camoufox fetch exited with status {exit_code}")
        except Exception as e:
            # Best effort: the server is still started when camoufox cannot
            # be launched (e.g. the executable is missing).
            print(f"An unexpected error occurred starting camoufox: {e}")

        config = uvicorn.Config(app, host="0.0.0.0", port=7860)
        server = uvicorn.Server(config)
        await server.serve()

    asyncio.run(run_app())