# Dataset/getFiles/getGithub.py
# Uploaded by vansh9878 -- commit "files added" (825e978)
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
def _wait_for_download(folder, known_files, timeout=30.0, poll_interval=0.5):
    """Wait until a new, completely written file appears in *folder*.

    Args:
        folder: Directory Chrome was configured to download into.
        known_files: Set of file names that were already in *folder* before
            the download was triggered; these are ignored.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between directory listings.

    Returns:
        True if at least one new finished file showed up before the timeout,
        False otherwise.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        current = set(os.listdir(folder))
        # Chrome keeps an in-progress download under a ".crdownload" name and
        # renames it when finished; hidden/".tmp" names are skipped as well so
        # a scratch file is never mistaken for the finished download.
        in_progress = any(name.endswith(".crdownload") for name in current)
        finished = [
            name
            for name in current - known_files
            if not name.startswith(".")
            and not name.endswith((".crdownload", ".tmp"))
        ]
        if finished and not in_progress:
            return True
        time.sleep(poll_interval)
    return False


def githubDataset(url, query):
    """Download the first CSV file linked from a GitHub page via headless Chrome.

    The page at *url* is opened, every anchor whose ``href`` contains ``.csv``
    is inspected, and the first one whose visible text ends with ``.csv`` is
    followed. On that file page the download button is located; if it exposes
    an ``href`` the browser navigates to it, otherwise the button is clicked.
    Chrome is configured to save downloads into ``./downloads/<query>``.

    Args:
        url: Address of the page to scan for CSV links.
        query: Name of the sub-folder of ``./downloads`` that receives the
            file (created if missing).

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        raised while browsing are caught and printed rather than propagated.
        An exception raised while starting Chrome itself still propagates.
    """
    download_folder = os.path.abspath(f"./downloads/{query}")
    os.makedirs(download_folder, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run Chrome without a visible window
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,  # save into the custom folder
        "download.prompt_for_download": False,  # never show a "save as" dialog
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    })

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Navigation is inside the try so that a failure here still reaches
        # driver.quit() below instead of leaving a Chrome process behind.
        driver.get(url)

        csv_links = driver.find_elements(By.XPATH, "//a[contains(@href, '.csv')]")
        for link in csv_links:
            csv_file_name = link.text
            if csv_file_name.endswith(".csv"):
                print(f"Found CSV file: {csv_file_name}")
                href = link.get_attribute("href")
                break
        else:
            # Loop finished without a match: nothing to download.
            print("No CSV file found.")
            return

        # Snapshot the folder first so only files created by this run count.
        files_before = set(os.listdir(download_folder))

        driver.get(href)
        # Presumably gives the file page time to finish rendering before the
        # button lookup -- kept from the original implementation.
        time.sleep(5)
        # NOTE(review): this selector depends on generated CSS class names and
        # will stop matching if the page's styling classes change.
        download_button = driver.find_element(
            By.XPATH,
            "//button[contains(@class, 'Box-sc-g0xbh4-0 ivobqY prc-Button-ButtonBase-c50BI prc-Button-IconButton-szpyj')]",
        )
        href2 = download_button.get_attribute("href")
        if href2:
            driver.get(href2)
            print("Button clicked!!")
        else:
            download_button.click()

        # Wait for the file to be fully written instead of sleeping a fixed
        # time; quitting the browser early would abort a slow download.
        if not _wait_for_download(download_folder, files_before):
            print(f"Download did not complete in time in {download_folder}")
    except Exception as e:
        # Best-effort helper: report the problem and fall through to cleanup.
        print("No CSV File")
        print(e)
    finally:
        driver.quit()
# print(f"CSV file should be downloaded to {download_folder}")
# githubDataset("https://github.com/ageron/handson-ml2/tree/master/datasets/housing","housing")
# githubDataset("https://github.com/nytimes/covid-19-data","housing")