|
import openml |
|
from langchain_core.prompts import PromptTemplate |
|
from langchain_folder.llm_helper import llm |
|
from dotenv import load_dotenv |
|
import os |
|
from selenium import webdriver |
|
from selenium.webdriver.common.by import By |
|
from selenium.webdriver.common.keys import Keys |
|
from selenium.webdriver.support.ui import WebDriverWait |
|
from selenium.webdriver.support import expected_conditions as EC |
|
from selenium.webdriver.chrome.options import Options |
|
from selenium.webdriver.chrome.service import Service |
|
import requests |
|
import openai_openml as oo |
|
|
|
|
|
# Load variables from a .env file (python-dotenv) before any of them are read.
load_dotenv()

# Dataset page URLs shared across the module: fetch_dataset_urls() rebinds
# this name and openDataset() iterates over it.
url_list = []

# Pass the key stored in the `openml_api` environment variable to the OpenML
# client configuration; this is None when the variable is not set.
api_key = os.environ.get('openml_api')
openml.config.apikey = api_key
|
|
|
def extract_keywords(query):
    """Ask the LLM to reduce a free-text request to its search keywords.

    The request is substituted into a fixed instruction prompt, the rendered
    prompt is sent to the shared ``llm`` object, and the reply is returned
    without post-processing.

    Args:
        query: The user's natural-language request.

    Returns:
        The ``content`` attribute of the LLM response.
    """
    keyword_template = PromptTemplate.from_template("""
    You are an assistant whose job is to extract the keywords from the query and return it:
    Query = "{query}"
    For example, if the query is Generate a list of links to datasets related to house price prediction
    your response should be -> "house price".
    Note that the query might not always be related to house price predictions, it can be related to other things as well.
    return only the keywords do not return anything else
    """)

    llm_reply = llm.invoke(keyword_template.format(query=query))
    return llm_reply.content
|
|
|
def fetch_dataset_urls(query, limit=4):
    """Look up dataset page URLs for ``query`` and publish them in ``url_list``.

    The lookup is delegated to ``oo.openDataset``. Its result is stored in the
    module-level ``url_list`` (which ``openDataset`` iterates over) and is also
    returned, so callers do not have to rely on the global.

    Args:
        query: Search keywords, typically the output of ``extract_keywords``.
        limit: Kept for backward compatibility; currently not applied.
            NOTE(review): presumably meant to cap the number of results --
            confirm what ``oo.openDataset`` returns before enforcing it.

    Returns:
        The collection now bound to ``url_list``; an empty list when the
        lookup produced ``None``.
    """
    global url_list

    print(f"Searching for datasets related to: {query}")

    found = oo.openDataset(query)
    # A None result would make the `for url in url_list` loop in openDataset
    # raise TypeError, so normalise it to "no results".
    url_list = [] if found is None else found
    return url_list
|
|
|
|
|
def openDataset(user_prompt):
    """Find datasets matching ``user_prompt`` and download them locally.

    Steps:
      1. Reduce the prompt to search keywords with ``extract_keywords``.
      2. Fill the module-level ``url_list`` through ``fetch_dataset_urls``.
      3. Open each URL in headless Chrome, read the ``href`` of the
         "Download dataset" link and fetch that file with ``requests``.

    Files are written to ``./input_folder/<user_prompt>/`` and named from the
    last two path segments of the download URL. A failure while locating or
    downloading one dataset is printed and the remaining URLs are still
    processed.

    Args:
        user_prompt: Free-text request; also used verbatim as the name of the
            download sub-folder.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    extracted_keywords = extract_keywords(user_prompt)
    print(extracted_keywords)
    fetch_dataset_urls(extracted_keywords)

    # NOTE(review): user_prompt is not sanitised before being used as a path
    # component; left unchanged because other code may expect this location.
    download_folder = "./input_folder/" + user_prompt
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(download_folder, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_folder,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    })

    # Seconds allowed for connecting and for each read from the server; this
    # is not a cap on the total download time.
    request_timeout = 60

    driver = webdriver.Chrome(options=chrome_options)
    try:
        for url in url_list:
            driver.get(url)
            try:
                download_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "a[aria-label='Download dataset']")
                    )
                )
                actual_download_url = download_button.get_attribute("href")
                if not actual_download_url:
                    raise ValueError("download link has no href attribute")

                url_parts = actual_download_url.split("/")
                filename = url_parts[-2] + "_" + url_parts[-1]
                file_path = os.path.join(download_folder, filename)

                print(f"⬇️ Downloading from {actual_download_url}")
                response = requests.get(actual_download_url, timeout=request_timeout)
                # Do not save an HTTP error page as if it were the dataset.
                response.raise_for_status()
                with open(file_path, "wb") as f:
                    f.write(response.content)
                print(f"✅ Saved to {file_path}\n")
            except Exception as e:
                # Best effort: report the failure and move on to the next URL.
                print(f"❌ Failed to fetch or download from {url}: {e}")
    finally:
        # Always shut the browser down so no headless Chrome process is left
        # running, even when a page load raises.
        driver.quit()
|
|
|
|
|
|
|
|