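"""Domain-restricted crawler that collects event-related URLs.

The Crawler class walks a single domain breadth-first up to a given depth.
If the site publishes sitemaps (discovered via robots.txt or the conventional
/sitemap.xml locations), those are crawled instead of following links.
A URL is kept only if it matches the configured regex patterns and contains
one of the event-related keywords in URL_KEYWORDS.
"""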
import gzip
import sys

import httpx
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

# explicit imports above avoid relying on names pulled in by the wildcard import
from src.crawler.utils.regEx import PATTERNS
from src.crawler.utils.keywords import KEYWORDS
from src.crawler.crawler_service import *

URL_KEYWORDS = [
    "veranstaltung", "event", "kalender", "kunst", "kultur",
    "freizeit", "termine",
    "happenings", "ausgehen", "aktivitäten", "aktivitaeten", "programm",
    "wochenendtipps", "party", "festivals", "konzerte", "musik",
    "shows", "theater", "veranstaltungskalender", "ausstellungen", "feste",
    "spielplan", "veranstaltungsplan"
]


class Crawler:
    sys.path.append("..")

    # filter variables
    keywords = KEYWORDS
    url_patterns = PATTERNS

    def __init__(self, url: str, url_type: str, depth: int):
        self.visited_urls = set()
        self.excluded_urls = set()
        self.excluded_urls.update(set(get_disallowed_urls(url)))
        self.url_type = url_type
        # the queue holds tuples of (url, remaining crawl depth)
        self.queue = [(url, depth)]
        self.domain = urlparse(url).netloc
        self.sitemaps_urls = get_sitemaps(url)
        print(self.url_type)
        print(f"Crawler starting for {url}")
    def crawl(self):
        # Loop through the URLs in the queue until it is empty
        if self.sitemaps_urls:
            print("Sitemap crawler starting...")
            for url in self.sitemaps_urls:
                if self.include_url(url):
                    print("include URL: ", url)
                    self.visited_urls.add(url)
        else:
            print("Crawler starting...")
            while self.queue:
                try:
                    # get the next URL to crawl
                    current_url, depth = self.queue.pop(0)
                    # access = ask_robots(current_url, "*")
                    access = True
                    # make request
                    if access and depth > 0:
                        response = requests.get(current_url)
                        if response.status_code >= 400:
                            print(f"Skipping {current_url} with status code {response.status_code}")
                            continue
                        page_content = response.content
                        # Parse the HTML content and extract links to other pages
                        soup = BeautifulSoup(page_content, "lxml")
                        urls_to_crawl = self.find_urls(soup, current_url)
                        urls_to_crawl_tuples = []
                        for url in urls_to_crawl:
                            if self.include_url(url):
                                urls_to_crawl_tuples.append((url, depth - 1))
                        # Add the new URLs to the queue and mark the current URL as visited
                        self.queue.extend(urls_to_crawl_tuples)
                        print(f"Crawled {current_url} and found {len(urls_to_crawl)} new URLs to crawl")
                    else:
                        print("access denied for ", current_url)
                except Exception as e:
                    print("Exception:", e)
                self.visited_urls.add(current_url)
                # drop any queued (url, depth) entries pointing at the URL just visited
                self.queue = [entry for entry in self.queue if entry[0] != current_url]
        print("Done. Found ", len(self.visited_urls), " URLs")
        return self.visited_urls
    def find_urls(self, soup: BeautifulSoup, current_url: str):
        # get all links from page content
        links = soup.find_all("a", href=True)
        urls_to_crawl = set()
        for link in links:
            href = link["href"]
            # resolve relative links and strip query strings and fragments
            url = urljoin(current_url, href)
            url = urlparse(url)._replace(query="", fragment="").geturl()
            urls_to_crawl.add(url)
        return urls_to_crawl
    def include_url(self, url) -> bool:
        # reject URLs that leave the domain, were already seen, queued or excluded,
        # or do not match the configured regex patterns
        if (urlparse(url).netloc.lower() != self.domain.lower()
                or url in self.visited_urls
                or not check_regex(url, self.url_patterns)
                # the queue holds (url, depth) tuples, so compare against the URL part
                or any(url == queued_url for queued_url, _ in self.queue)
                or url in self.excluded_urls):
            return False
        else:
            print("Checking ", url)
            # if self.url_type == "city":
            if any(keyword in url for keyword in URL_KEYWORDS):
                print("Found Event URL:", url)
                return True
            else:
                self.excluded_urls.add(url)
                return False
            # else:
            #     # if ask_robots(url, "*"):
            #     if True:
            #         response = requests.get(url)
            #         if response.status_code >= 400:
            #             self.excluded_urls.add(url)
            #             print(f"Skipping {url} with status code {response.status_code}")
            #             return False
            #         else:
            #             page_content = response.content
            #             # Parse the HTML content and extract links to other pages
            #             soup = BeautifulSoup(page_content, "html.parser")
            #             # remove navigation elements
            #             for nav in soup.find_all('nav'):
            #                 nav.decompose()
            #
            #             # Step 2: Remove elements with "navigation" or "menu" in the id or class attributes
            #             nav_elements = []
            #             nav_elements.extend(soup.find_all(id=re.compile(r'.*navigation.*')))
            #             nav_elements.extend(soup.find_all(id=re.compile(r'.*menu.*')))
            #             nav_elements.extend(soup.find_all(class_=re.compile(r'.*navigation.*')))
            #             nav_elements.extend(soup.find_all(class_=re.compile(r'.*menu.*')))
            #
            #             print(len(nav_elements))
            #             for elem in nav_elements:
            #                 if elem:
            #                     elem.decompose()
            #
            #             content = get_page_content(soup)
            #             print("searching content for keywords...")
            #             if check_keywords(content, self.keywords):
            #                 print("Found Keyword in ", url)
            #                 return True
            #             else:
            #                 self.excluded_urls.add(url)
            #                 return False


def get_sitemaps(url):
    # build the robots.txt URL for the given domain
    url_parsed = urlparse(url)
    url_robots_txt = url_parsed.scheme + '://' + url_parsed.netloc + '/robots.txt'
    sitemaps = set()
    all_urls = set()
    robotParse = RobotFileParser()
    robotParse.set_url(url_robots_txt)
    try:
        robotParse.read()
        robot_sitemaps = robotParse.site_maps()
        if robot_sitemaps:
            sitemaps.update(robot_sitemaps)
        else:
            # fall back to the conventional sitemap locations
            sitemaps.add(url_parsed.scheme + "://" + url_parsed.netloc + "/sitemap.xml")
            sitemaps.add(url_parsed.scheme + "://" + url_parsed.netloc + "/sitemaps.xml")
        for sitemap in sitemaps:
            print("Parsing sitemap:", sitemap)
            sitemap_urls = get_urls_from_sitemap(sitemap, set())
            all_urls.update(sitemap_urls)
        print("Total urls collected from sitemaps: ", len(all_urls))
    except Exception as e:
        print("Exception while parsing sitemap from ", url, ":", e)
    return all_urls


def get_urls_from_sitemap(sitemap, urls):
    print("Getting URLs from sitemap:", sitemap)
    try:
        response = httpx.get(sitemap)
        if response.status_code == httpx.codes.OK:
            # check whether the sitemap is a gzipped file
            content_type = response.headers.get('Content-Type', '')
            if 'gzip' in content_type or sitemap.endswith('.gz'):
                # if gzipped, decompress before parsing
                decompressed_content = gzip.decompress(response.content)
                soup = BeautifulSoup(decompressed_content, 'lxml')
            else:
                # otherwise parse the response body directly
                soup = BeautifulSoup(response.content, 'lxml')
            # extract the URLs listed in the sitemap
            locs = soup.find_all("loc")
            for loc in locs:
                url = loc.get_text()
                if "sitemap" in url:
                    # recurse if the entry is a nested sitemap
                    urls.update(get_urls_from_sitemap(url, urls))
                else:
                    urls.add(url)
    except Exception as e:
        print("Exception while resolving sitemap:", sitemap, "-", e)
    return urls
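

# Minimal usage sketch: the domain, the "city" url_type and the depth below are
# hypothetical placeholder values chosen only for illustration.
if __name__ == "__main__":
    # crawl a single domain up to two link levels deep and print the kept URLs
    crawler = Crawler("https://www.example.com", "city", 2)
    event_urls = crawler.crawl()
    for event_url in sorted(event_urls):
        print(event_url)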