import gradio as gr
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Custom HTTP Session and Response Classes
class CustomSession:
    def __init__(self):
        self.pool_manager = urllib3.PoolManager()
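        # Note (a sketch, not part of the original tool): timeout and retry
        # defaults could be configured on the pool, e.g.
        #   urllib3.PoolManager(timeout=urllib3.Timeout(connect=5.0, read=10.0),
        #                       retries=urllib3.Retry(total=3))
        # The values above are illustrative assumptions.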

    def get(self, url):
        response = self.pool_manager.request('GET', url)
        return CustomResponse(response)

class CustomResponse:
    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def soup(self):
        return BeautifulSoup(self.content, 'lxml')

def get(url):
    session = CustomSession()
    return session.get(url)

# Utility Functions
def extract_texts(soup, title):
    """Extracts all text content from the soup, excluding the title."""
    texts = list(soup.stripped_strings)
    # Remove the title from the texts if it exists
    if title in texts:
        texts.remove(title)
    return texts

def extract_links(soup, base_url):
    """Extracts all valid links from the soup."""
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
        link_text = link.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links

def extract_images(soup, base_url):
    """Extracts all valid image URLs and their alt text from the soup."""
    images = []
    for img in soup.find_all('img', src=True):
        img_url = img['src']
        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images

def format_detailed_output(structured_data, title):
    """Formats the structured data into a Markdown string."""
    result = f"### Title\n\n{title}\n\n"
    result += "### Texts\n\n"
    result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
    result += "\n\n### Links\n\n"
    if structured_data["Links"]:
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"])
    else:
        result += "No links found."
    result += "\n\n### Images\n\n"
    if structured_data["Images"]:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"])
    else:
        result += "No images found."
    return result

# Web Page Processing Function
def download_and_process_web_page(url):
    """Downloads a web page and returns the structured content."""
    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # Prepend "http://" if no scheme is present

    try:
        response = get(url)
        # urllib3 does not raise on 4xx/5xx responses, so check the status explicitly.
        if response.status_code >= 400:
            return f"Error: HTTP status {response.status_code} for {url}"
        soup = response.soup()
        
        # Extract title
        title = soup.title.string.strip() if soup.title and soup.title.string else "No Title Found"

        structured_data = {
            "Texts": extract_texts(soup, title),
            "Links": extract_links(soup, url),
            "Images": extract_images(soup, url)
        }
        return format_detailed_output(structured_data, title)

    except urllib3.exceptions.HTTPError as e:
        return f"Error: {e}"
    except Exception as e:
        return f"Error processing web page: {e}"

# Gradio Interface
iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
    outputs=gr.Markdown(label="Web Page Content"),
    title="Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools."
)
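
# Optional smoke test (a sketch, not part of the original tool): flip the flag
# below to print the extracted Markdown for an arbitrary placeholder URL
# ("https://example.com") before the UI launches.
RUN_SMOKE_TEST = False
if RUN_SMOKE_TEST:
    print(download_and_process_web_page("https://example.com"))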

# Launch the interface without sharing
iface.launch()