ksvmuralidhar committed on
Commit
0ec3dc5
·
verified ·
1 Parent(s): 5bf68ac

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +10 -1
scraper.py CHANGED
@@ -6,14 +6,23 @@ import logging
6
  import os
7
  import time
8
  import random
 
9
  from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH, SCRAPER_MAX_RETRIES
10
 
11
 
 
 
 
 
12
  def get_text(url, n_words=15):
13
  try:
14
  driver = None
15
  logging.warning(f"Initiated Scraping {url}")
16
- user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
 
 
 
 
17
  options = uc.ChromeOptions()
18
  options.add_argument("--headless")
19
  options.add_argument(f"user-agent={user_agent}")
 
6
  import os
7
  import time
8
  import random
9
+ import pandas as pd
10
  from config import SCRAPER_TIMEOUT, CHROME_DRIVER_PATH, SCRAPER_MAX_RETRIES
11
 
12
 
13
+ USER_AGENTS = us_ag = pd.read_csv("https://gist.githubusercontent.com/pzb/b4b6f57144aea7827ae4/raw/cf847b76a142955b1410c8bcef3aabe221a63db1/user-agents.txt", sep="\t", header=None)
14
+ USER_AGENTS = USER_AGENTS.iloc[:, 0].copy()
15
+
16
+
17
  def get_text(url, n_words=15):
18
  try:
19
  driver = None
20
  logging.warning(f"Initiated Scraping {url}")
21
+ # user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
22
+ ua = USER_AGENTS[np.random.randint(low=0, high=len(USER_AGENTS), size=1)]
23
+ ua = ua.reset_index(drop=True)
24
+ ua = ua[0]
25
+ user_agent = ua
26
  options = uc.ChromeOptions()
27
  options.add_argument("--headless")
28
  options.add_argument(f"user-agent={user_agent}")