import ast
import os

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()


def _column_names_by_file(dfs, csv_files, skip):
    """Map each lower-cased CSV base name to the columns of its dataframe.

    ``dfs`` is walked in order and paired with entries of ``csv_files``;
    every index of ``csv_files`` found in ``skip`` is stepped over, so the
    n-th dataframe is paired with the n-th non-skipped file.

    Raises:
        IndexError: if ``csv_files`` runs out before ``dfs`` does.
    """
    column_names = {}
    file_idx = 0
    for df in dfs:
        # `while`, not `if`: several consecutive indexes may be skipped and
        # all of them must be passed before pairing the next dataframe.
        while file_idx in skip:
            file_idx += 1
        name = os.path.basename(csv_files[file_idx]).lower()
        column_names[name] = df.columns
        file_idx += 1
    return column_names


def _extract_list_text(reply):
    """Return the part of a model reply that holds the list of labels.

    If the reply contains a ``` fence, only the text following the first
    fence marker (up to the next marker, if any) is considered. Within that
    text the span from the first ``[`` to the last ``]`` is returned. When no
    brackets are found, the text is returned stripped, minus a leading
    ``python`` language tag if it came from a fence.
    """
    fence_parts = reply.split("```")
    fenced = len(fence_parts) > 1
    # Index 1 is whatever follows the first opening fence.
    body = fence_parts[1] if fenced else reply

    start = body.find("[")
    end = body.rfind("]")
    if start != -1 and end > start:
        return body[start:end + 1]

    body = body.strip()
    if fenced and body.startswith("python"):
        # Drop only the fence's language tag, never "python" inside a name.
        body = body[len("python"):]
    return body.strip()


def LabelsExtraction(query, dfs, csv_files, skip):
    """Ask an OpenAI chat model which column of each CSV file is its label.

    Args:
        query: user query the returned labels should be relevant to.
        dfs: dataframes whose ``.columns`` are sent to the model.
        csv_files: file paths; base names become the dictionary keys.
        skip: indexes of ``csv_files`` that have no dataframe in ``dfs``.

    Returns:
        A list of strings, one per comma-separated item in the model's
        reply, with surrounding whitespace and quotes removed. A "no label"
        answer therefore comes back as the string ``'0'``.
    """
    columnNames = _column_names_by_file(dfs, csv_files, skip)

    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        # An empty string would override the client's default endpoint, so
        # an unset/empty variable is passed as None instead.
        base_url=os.getenv("OPENAI_API_BASE_URL") or None,
    )

    # NOTE: the prompt text is kept verbatim; only the layout changed.
    prompt = (
        "The following is a dictionary with key as name of csv file and value as array of their column headers:\n\n"
        "Eg: {'21754539_dataset.csv': Index(['Id', 'MSSubClass', 'MSZoning', "
        "'LotFrontage', 'LotArea', 'Street','Alley', 'LotShape', 'LandContour', "
        "'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', "
        "'Condition2', 'BldgType','HouseStyle', 'OverallQual', 'OverallCond', "
        "'YearBuilt'])}"
        "The content in the Index([]) is the list of the headers"
        f"{columnNames}"
        "\n\nwith this info try to figure out which column in each file represents its label and also indentify if there is any file with no label"
        "Return an array with column name that represents the label in that file and if there is no label return 0 in that place. "
        "Match the indexes as given in the dictionary, do not return any other content, just reutrn the array"
        f"the labels which you will return must in context with this query {query}, so return the most relevant labels"
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data analyst."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )
    merge_map_text = response.choices[0].message.content.strip()

    # The reply is split by hand (not literal_eval) so unquoted names work.
    inner = _extract_list_text(merge_map_text).replace("[", "").replace("]", "")
    raw_items = [piece.replace("\n", "").strip() for piece in inner.split(",")]
    print(raw_items)
    labels = [item.strip("'\"") for item in raw_items]
    print(labels)
    return labels


def LabelsExtraction2(query, dfs, csv_files, skip):
    """Ask a Gemini model which column of each CSV file is its label.

    Args:
        query: user query the returned labels should be relevant to.
        dfs: dataframes whose ``.columns`` are sent to the model.
        csv_files: file paths; base names become the dictionary keys.
        skip: indexes of ``csv_files`` that have no dataframe in ``dfs``.

    Returns:
        The Python literal parsed from the model's reply; the prompt asks
        for a list of column-name strings with the integer ``0`` where no
        label was identified.

    Raises:
        ValueError, SyntaxError: if the reply is not a valid Python literal.
    """
    columnNames = _column_names_by_file(dfs, csv_files, skip)

    prompt = (
        "You are given a dictionary where each key is the name of a CSV file, and the value is an array (in pandas Index format) representing the column headers of that file.\n\n"
        "Example format:\n"
        "{'21754539_dataset.csv': Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', "
        "'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', "
        "'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt'])}\n\n"
        "Your task is to analyze this dictionary and, for each file, determine which column is most likely to represent the "
        "**label** (i.e., the target variable) relevant to the following query:\n\n"
        f"{query}\n\n"
        "If you believe a file has **no clear label** based on the column names and the query, return **0** for that file.\n\n"
        "Return your response as a **Python list**, maintaining the **same order as the keys in the input dictionary**. Each "
        "entry in the list should be either:\n"
        "- the column name (string) that most likely represents the label for that file, or\n"
        "- the integer `0` if no label can be identified.\n\n"
        "⚠️ Do not return any explanation, reasoning, or code. Only return the final list of labels, e.g.:\n"
        "```python\n['SalePrice', 0, 'target_column_name']\n```\n\n"
        "Now use the following data to generate your answer:\n"
        f"{columnNames}"
    )

    genai.configure(api_key=os.getenv("gemini_api"))
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)
    merge_map_text = response.text.strip()
    print(merge_map_text)

    # literal_eval only parses literals, so the model's text is never executed.
    actual_list = ast.literal_eval(_extract_list_text(merge_map_text))
    return actual_list