import gzip
import sys

import httpx
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

from src.crawler.utils.regEx import PATTERNS
from src.crawler.utils.keywords import KEYWORDS
# the star import provides shared helpers such as get_disallowed_urls and check_regex
from src.crawler.crawler_service import *

URL_KEYWORDS = [
    "veranstaltung", "event", "kalender", "kunst", "kultur",
    "freizeit", "termine",
    "happenings", "ausgehen", "aktivitäten", "aktivitaeten", "programm",
    "wochenendtipps", "party", "festivals", "konzerte", "musik",
    "shows", "theater", "veranstaltungskalender", "ausstellungen", "feste", "spielplan", "veranstaltungsplan"
]


class Crawler:
    sys.path.append("..")
    # filter variables
    keywords = KEYWORDS
    url_patterns = PATTERNS

    def __init__(self, url: str, url_type: str, depth: int):
        self.visited_urls = set()
        self.excluded_urls = set()
        self.excluded_urls.update(set(get_disallowed_urls(url)))
        self.url_type = url_type
        # seed the queue with a (url, max crawl depth) tuple
        self.queue = [(url, depth)]
        self.domain = urlparse(url).netloc
        self.sitemaps_urls = get_sitemaps(url)
        print("URL type:", self.url_type)
        print(f"Crawler started for {url}")


    def crawl(self):
        # Prefer sitemap URLs when a sitemap exists, otherwise crawl the link graph
        if self.sitemaps_urls:
            print("Sitemap crawler started...")
            for url in self.sitemaps_urls:
                if self.include_url(url):
                    print("Include URL:", url)
                    self.visited_urls.add(url)
        else:
            print("Crawler started...")
            # Loop through the URLs in the queue until it is empty
            while self.queue:
                # get the next URL to crawl and its remaining depth
                current_url, depth = self.queue.pop(0)
                try:
                    # access = ask_robots(current_url, "*")
                    access = True
                    # make request
                    if access and depth > 0:
                        response = requests.get(current_url)
                        if response.status_code >= 400:
                            print(f"Skipping {current_url} with status code {response.status_code}")
                            continue
                        page_content = response.content

                        # Parse the HTML content and extract links to other pages
                        soup = BeautifulSoup(page_content, "lxml")
                        urls_to_crawl = self.find_urls(soup, current_url)
                        urls_to_crawl_tuples = []
                        for url in urls_to_crawl:
                            if self.include_url(url):
                                urls_to_crawl_tuples.append((url, depth - 1))

                        # Add the new URLs to the queue and mark the current URL as visited
                        self.queue.extend(urls_to_crawl_tuples)

                        print(f"Crawled {current_url} and found {len(urls_to_crawl)} new URLs to crawl")
                    else:
                        print("Access denied for", current_url)
                except Exception as e:
                    print("Exception:", e)

                self.visited_urls.add(current_url)
                # queue entries are (url, depth) tuples, so match on the URL part
                self.queue = [item for item in self.queue if item[0] != current_url]
        print("Done. Found", len(self.visited_urls), "URLs")
        return self.visited_urls

    def find_urls(self, soup: BeautifulSoup, current_url: str):
        # get all links from page content
        links = soup.find_all("a", href=True)
        urls_to_crawl = set()
        for link in links:
            href = link["href"]
            # resolve relative links and strip query string and fragment
            url = urljoin(current_url, href)
            url = urlparse(url)._replace(query="", fragment="").geturl()
            urls_to_crawl.add(url)
        return urls_to_crawl


    def include_url(self, url: str) -> bool:
        # reject external, already seen, already queued, excluded or pattern-rejected URLs
        if urlparse(url).netloc.lower() != self.domain.lower() \
                or url in self.visited_urls \
                or not check_regex(url, self.url_patterns) \
                or any(url == queued_url for queued_url, _ in self.queue) \
                or url in self.excluded_urls:
            return False
        else:
            print("Checking", url)
            # if self.url_type == "city":
            if any(keyword in url for keyword in URL_KEYWORDS):
                print("Found event URL:", url)
                return True
            else:
                self.excluded_urls.add(url)
                return False
            # else:
            #     # if ask_robots(url,"*"):
            #     if True:
            #         response = requests.get(url)
            #         if response.status_code >= 400:
            #             self.excluded_urls.add(url)
            #             print(f"Skipping {url} with status code {response.status_code}")
            #             return False
            #         else:
            #             page_content = response.content
            #             # Parse the HTML content and extract links to other pages
            #             soup = BeautifulSoup(page_content, "html.parser")
            #             # remove navigation elements
            #             for nav in soup.find_all('nav'):
            #                 nav.decompose()
            #
            #             # Step 2: Remove elements with "navigation" or "menu" in the id or class attributes
            #
            #             nav_elements= []
            #             nav_elements.extend(soup.find_all(id=re.compile(r'.*navigation.*')))
            #             nav_elements.extend(soup.find_all(id=re.compile(r'.*menu.*')))
            #             nav_elements.extend(soup.find_all(class_=re.compile(r'.*navigation.*')))
            #             nav_elements.extend(soup.find_all(class_=re.compile(r'.*menu.*')))
            #
            #             print(len(nav_elements))
            #             for elem in nav_elements:
            #                 if elem:
            #                     elem.decompose()
            #
            #             content = get_page_content(soup)
            #             print("searching content for keywords...")
            #             if check_keywords(content, self.keywords):
            #                 print("Found Keyword in ", url)
            #                 return True
            #             else:
            #                 self.excluded_urls.add(url)
            #                 return False



def get_sitemaps(url):
    url_parsed = urlparse(url)
    url_robots_txt = f"{url_parsed.scheme}://{url_parsed.netloc}/robots.txt"
    sitemaps = set()
    all_urls = set()

    robot_parser = RobotFileParser()
    robot_parser.set_url(url_robots_txt)
    try:
        robot_parser.read()
        # prefer sitemaps declared in robots.txt, otherwise fall back to common defaults
        robot_sitemaps = robot_parser.site_maps()
        if robot_sitemaps:
            sitemaps.update(robot_sitemaps)
        else:
            sitemaps.add(f"{url_parsed.scheme}://{url_parsed.netloc}/sitemap.xml")
            sitemaps.add(f"{url_parsed.scheme}://{url_parsed.netloc}/sitemaps.xml")

        for sitemap in sitemaps:
            print("Parsing sitemap:", sitemap)
            sitemap_urls = get_urls_from_sitemap(sitemap, set())
            all_urls.update(sitemap_urls)

        print("Total URLs collected from sitemaps:", len(all_urls))
    except Exception as e:
        print("Exception while parsing sitemap from", url, ":", e)
    return all_urls

def get_urls_from_sitemap(sitemap, urls):
    print("Getting URLs from sitemap:", sitemap)
    try:
        response = httpx.get(sitemap)
        if response.status_code == httpx.codes.OK:
            # Check whether the sitemap is a gzip-compressed file
            content_type = response.headers.get('Content-Type', '')
            if 'gzip' in content_type or sitemap.endswith('.gz'):
                # If compressed, decompress before parsing
                decompressed_content = gzip.decompress(response.content)
                soup = BeautifulSoup(decompressed_content, 'lxml')
            else:
                # If not compressed, parse directly
                soup = BeautifulSoup(response.content, 'lxml')

            # Extract the URLs from the sitemap
            locs = soup.find_all("loc")
            for loc in locs:
                url = loc.get_text()
                if "sitemap" in url:
                    # Recurse if the entry points to a nested sitemap
                    urls.update(get_urls_from_sitemap(url, urls))
                else:
                    urls.add(url)

    except Exception as e:
        print("Exception while resolving sitemap:", sitemap, "-", e)
    return urls
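

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the start URL,
    # url_type value and depth below are illustrative assumptions.
    start_url = "https://www.example.org"
    crawler = Crawler(start_url, url_type="city", depth=2)
    event_urls = crawler.crawl()
    for found_url in sorted(event_urls):
        print(found_url)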