Spaces:
Sleeping
Sleeping
Update scraper.py
Browse files- scraper.py +10 -1
scraper.py
CHANGED
@@ -6,14 +6,23 @@ import logging
|
|
6 |
import os
|
7 |
import time
|
8 |
import random
|
|
|
9 |
from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH, SCRAPER_MAX_RETRIES
|
10 |
|
11 |
|
|
|
|
|
|
|
|
|
12 |
def get_text(url, n_words=15):
|
13 |
try:
|
14 |
driver = None
|
15 |
logging.warning(f"Initiated Scraping {url}")
|
16 |
-
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
|
|
|
|
|
|
|
|
|
17 |
options = uc.ChromeOptions()
|
18 |
options.add_argument("--headless")
|
19 |
options.add_argument(f"user-agent={user_agent}")
|
|
|
6 |
import os
|
7 |
import time
|
8 |
import random
|
9 |
+
import numpy as np
import pandas as pd

from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH, SCRAPER_MAX_RETRIES
|
11 |
|
12 |
|
13 |
+
# Pool of real browser user-agent strings, loaded once at import time and
# sampled by get_text() to randomize the headless-Chrome profile per request.
# NOTE(review): this downloads over the network at module import, so importing
# the module fails offline — consider vendoring the list. TODO confirm.
USER_AGENTS = pd.read_csv(
    "https://gist.githubusercontent.com/pzb/b4b6f57144aea7827ae4/raw/cf847b76a142955b1410c8bcef3aabe221a63db1/user-agents.txt",
    sep="\t",
    header=None,
).iloc[:, 0].copy()
|
15 |
+
|
16 |
+
|
17 |
def get_text(url, n_words=15):
|
18 |
try:
|
19 |
driver = None
|
20 |
logging.warning(f"Initiated Scraping {url}")
|
21 |
+
# user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
|
22 |
+
ua = USER_AGENTS[np.random.randint(low=0, high=len(USER_AGENTS), size=1)]
|
23 |
+
ua = ua.reset_index(drop=True)
|
24 |
+
ua = ua[0]
|
25 |
+
user_agent = ua
|
26 |
options = uc.ChromeOptions()
|
27 |
options.add_argument("--headless")
|
28 |
options.add_argument(f"user-agent={user_agent}")
|