|
import os |
|
import pandas as pd |
|
from openai import OpenAI |
|
from dotenv import load_dotenv |
|
import google.generativeai as genai |
|
import ast |
|
load_dotenv() |
|
|
|
def _parse_label_strings(reply):
    """Convert the model's reply into a list of label strings.

    The reply is expected to contain a Python-style list, optionally wrapped
    in a Markdown code fence. The list is parsed with ``ast.literal_eval`` so
    that quoted names containing commas (or double-quoted names) survive
    intact; every element is then converted with ``str`` so that the
    "no label" marker ``0`` comes back as ``'0'``.

    If the reply is not a valid Python literal, fall back to a plain comma
    split with brackets, whitespace and surrounding quotes removed.
    """
    chunks = reply.split("```")
    if len(chunks) >= 2:
        # Fenced reply: keep the fenced body and drop a language tag line.
        payload = chunks[1]
        first_line, newline, rest = payload.partition("\n")
        if newline and first_line.strip().lower() in ("python", "py", "json"):
            payload = rest
    else:
        payload = reply

    start, end = payload.find("["), payload.rfind("]")
    if 0 <= start < end:
        try:
            parsed = ast.literal_eval(payload[start:end + 1])
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(item) for item in parsed]

    # Best-effort fallback for replies such as "[SalePrice, 0]" whose
    # entries are not quoted and therefore are not valid literals.
    cleaned = payload.replace("[", "").replace("]", "")
    return [piece.strip().strip("'\"") for piece in cleaned.split(",")]


def LabelsExtraction(query, dfs, csv_files, skip):
    """Ask an OpenAI chat model which column of each file is the label.

    Args:
        query: Text interpolated into the prompt; the model is told to pick
            labels relevant to it.
        dfs: Iterable of objects exposing ``.columns`` (presumably pandas
            DataFrames - confirm against the caller), in the same order as
            the non-skipped entries of ``csv_files``.
        csv_files: Sequence of file paths; only the lower-cased base name of
            each path is used, as the key sent to the model.
        skip: Collection of indexes into ``csv_files`` that have no matching
            entry in ``dfs``. A falsy value means nothing is skipped.

    Returns:
        A list of strings parsed from the model's reply. The prompt asks for
        one entry per file: a column name, or ``'0'`` when the model found no
        label.

    Raises:
        IndexError: If ``dfs`` has more entries than there are non-skipped
            paths in ``csv_files``.
        ValueError: If the model returns an empty message.
    """
    skipped = skip or ()

    # Pair every DataFrame with its file name. ``while`` (not ``if``) is
    # needed so that several consecutive skipped indexes are all stepped
    # over before the next name is taken.
    # Files sharing a base name overwrite each other in this mapping.
    column_names = {}
    file_idx = 0
    for df in dfs:
        while file_idx in skipped:
            file_idx += 1
        name = os.path.basename(csv_files[file_idx]).lower()
        column_names[name] = df.columns
        file_idx += 1

    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        # An empty string would override the SDK's default endpoint, so an
        # unset/blank variable is passed as None instead.
        base_url=os.getenv("OPENAI_API_BASE_URL") or None,
    )

    # NOTE(review): the prompt text is kept exactly as it was. Several
    # sentences run together without separators and no output format is
    # requested, which is why the reply parser accepts fenced and unfenced
    # answers alike.
    prompt = (
        "The following is a dictionary with key as name of csv file and value as array of their column headers:\n\n"
        "Eg: {'21754539_dataset.csv': Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street','Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType','HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt'])}"
        "The content in the Index([]) is the list of the headers"
        f"{column_names}"
        "\n\nwith this info try to figure out which column in each file represents its label and also indentify if there is any file with no label"
        "Return an array with column name that represents the label in that file and if there is no label return 0 in that place. Match the indexes as given in the dictionary, do not return any other content, just reutrn the array"
        f"the labels which you will return must in context with this query {query}, so return the most relevant labels"
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data analyst."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )

    # ``content`` can be None (e.g. refusals), so guard before stripping.
    reply = (response.choices[0].message.content or "").strip()
    if not reply:
        raise ValueError("The model returned an empty response; no labels to parse.")

    labels = _parse_label_strings(reply)
    print(labels)
    return labels
|
|
|
|
|
def _parse_label_list(reply):
    """Extract and evaluate the Python list contained in the model's reply.

    Only the text between the first ``[`` and the last ``]`` is evaluated
    (looking inside the Markdown code fence when one is present). This keeps
    the fence and its language tag out of the parser without touching the
    list's contents, so a label that itself contains the word "python" is
    returned unchanged.

    Raises:
        ValueError: If the reply contains no bracketed list, or the list is
            not a valid Python literal.
        SyntaxError: If the bracketed text cannot be parsed at all.
    """
    chunks = reply.split("```")
    # With a fence present, element 1 is the fenced body.
    payload = chunks[1] if len(chunks) >= 2 else reply

    start, end = payload.find("["), payload.rfind("]")
    if not 0 <= start < end:
        raise ValueError(f"No Python list found in model reply: {reply!r}")
    return ast.literal_eval(payload[start:end + 1])


def LabelsExtraction2(query, dfs, csv_files, skip):
    """Ask a Gemini model which column of each file is the label.

    Args:
        query: Text interpolated into the prompt; the model is told to pick
            the label most relevant to it.
        dfs: Iterable of objects exposing ``.columns`` (presumably pandas
            DataFrames - confirm against the caller), in the same order as
            the non-skipped entries of ``csv_files``.
        csv_files: Sequence of file paths; only the lower-cased base name of
            each path is used, as the key sent to the model.
        skip: Collection of indexes into ``csv_files`` that have no matching
            entry in ``dfs``. A falsy value means nothing is skipped.

    Returns:
        The list literal parsed from the model's reply. The prompt asks for
        one entry per file: a column-name string, or the integer ``0`` when
        no label can be identified.

    Raises:
        IndexError: If ``dfs`` has more entries than there are non-skipped
            paths in ``csv_files``.
        ValueError: If the reply contains no parsable list.
    """
    skipped = skip or ()

    # Pair every DataFrame with its file name. ``while`` (not ``if``) is
    # needed so that several consecutive skipped indexes are all stepped
    # over before the next name is taken.
    # Files sharing a base name overwrite each other in this mapping.
    column_names = {}
    file_idx = 0
    for df in dfs:
        while file_idx in skipped:
            file_idx += 1
        name = os.path.basename(csv_files[file_idx]).lower()
        column_names[name] = df.columns
        file_idx += 1

    prompt = (
        "You are given a dictionary where each key is the name of a CSV file, and the value is an array (in pandas Index format) representing the column headers of that file.\n\n"
        "Example format:\n"
        "{'21754539_dataset.csv': Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', "
        "'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', "
        "'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt'])}\n\n"
        "Your task is to analyze this dictionary and, for each file, determine which column is most likely to represent the "
        "**label** (i.e., the target variable) relevant to the following query:\n\n"
        f"{query}\n\n"
        "If you believe a file has **no clear label** based on the column names and the query, return **0** for that file.\n\n"
        "Return your response as a **Python list**, maintaining the **same order as the keys in the input dictionary**. Each "
        "entry in the list should be either:\n"
        "- the column name (string) that most likely represents the label for that file, or\n"
        "- the integer `0` if no label can be identified.\n\n"
        "⚠️ Do not return any explanation, reasoning, or code. Only return the final list of labels, e.g.:\n"
        "```python\n['SalePrice', 0, 'target_column_name']\n```\n\n"
        "Now use the following data to generate your answer:\n"
        f"{column_names}"
    )

    genai.configure(api_key=os.getenv("gemini_api"))
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)

    reply = response.text.strip()
    print(reply)

    return _parse_label_list(reply)
|
|
|
|