Spaces:
Runtime error
Runtime error
File size: 7,047 Bytes
5ea2a69 0d628a0 5ea2a69 0d628a0 76b8fa2 6620ef1 2f22c67 5ea2a69 0d628a0 5ea2a69 2586d92 ddffaa4 2586d92 e73d501 5ea2a69 aee4cdd 5ea2a69 0d628a0 0741da6 5ea2a69 0d628a0 0741da6 3f55db3 0d628a0 9cf039d 0d628a0 e1fe61c 5ea2a69 1480aa8 6620ef1 0d628a0 5ea2a69 c97b86a 0d628a0 d46a650 0d628a0 d46a650 6620ef1 d46a650 1480aa8 6620ef1 0d628a0 6620ef1 1480aa8 f3dda89 2586d92 6620ef1 0d628a0 f3dda89 0d628a0 a79c682 6620ef1 6794b69 6620ef1 0d628a0 6620ef1 5ea2a69 0d628a0 6620ef1 0d628a0 6620ef1 0d628a0 6620ef1 6794b69 6620ef1 5ea2a69 19d8bb4 e45053f 19d8bb4 d46a650 0d628a0 0afb7c1 0d628a0 d46a650 0d628a0 ba1aaa5 0d628a0 62139c9 6620ef1 aee4cdd 6620ef1 0d628a0 c97b86a d46a650 0d628a0 d46a650 5ea2a69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
import os
import shutil
import gradio as gr
from transformers import ReactCodeAgent, HfEngine, Tool
import pandas as pd
from gradio import Chatbot
from transformers.agents import stream_to_gradio
from huggingface_hub import login
from gradio.data_classes import FileData
import google.generativeai as genai
# Mirror API_KEY into GOOGLE_API_KEY so the google-generativeai SDK (and any
# env-based lookup) sees the same credential. (The original also contained a
# no-op self-assignment of API_KEY, removed here.)
os.environ["GOOGLE_API_KEY"] = os.environ["API_KEY"]
genai.configure(api_key=os.environ["API_KEY"])

# Low temperature keeps the metadata-analysis responses focused and repeatable.
generation_config = {
    "temperature": 0.2,
    "top_p": 0.95,
    "top_k": 0,
    "max_output_tokens": 8192,
}

# Apply the same "block medium and above" threshold to every harm category.
safety_settings = [
    {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# System persona: steer Gemini toward metadata-driven analysis guidance.
context = "You are an expert data analyst who can provide guidance around what needs to be analyzed from a dataset by just looking at metadata."
system_instruction = context

import re  # NOTE(review): unused in the visible code, kept in case later chunks rely on it

# Shared Gemini model used by model_response() below.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    generation_config=generation_config,
    system_instruction=system_instruction,
    safety_settings=safety_settings,
)
def model_response(text):
    """Send *text* to the shared Gemini model and return the textual reply."""
    return model.generate_content(text).text
# Authenticate with the Hugging Face Hub so the hosted inference endpoint for
# the Llama model below is reachable.
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")
# Code-writing ReAct agent: it generates and executes Python, restricted to
# the whitelisted analysis libraries, for at most 10 reasoning iterations.
agent = ReactCodeAgent(
    tools=[],
    llm_engine=llm_engine,
    additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "scipy.stats"],
    max_iterations=10,
)
# Task prompt for the ReAct agent; {structure_notes} is filled with the
# dataset's .describe()/.dtypes summary in interact_with_agent().
base_prompt = """You are an expert data analyst.
According to the features you have and the data structure given below, determine which feature should be the target.
If a user asks a very specific question, then just answer that question by performing data exploration. If not, then list 5 interesting questions that could be asked on this data by examining the metadata of the columns, for instance about specific correlations with target variable.
For example, outlier analysis and trend analysis are considered interesting questions.
Then answer these questions one by one, by finding the relevant numbers.
Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
Generate a summary of each of the plot generated.
In your final answer: summarize these correlations and trends
After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
Your final answer should be a long string with at least 3 numbered and detailed parts.
You should also include 3 follow-up questions that can be answered with this analysis
Provide suggestions around what additional input needs to be provided by the user for better analysis
Structure of the data:
{structure_notes}
The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
"""
# Sample notes shown in the gr.Examples row of the UI.
example_notes="""This data is about the telco churn data. I am interested in understanding the factors behind the churn."""
def get_images_in_directory(directory):
    """Recursively collect the paths of all image files under *directory*.

    A file counts as an image when its extension (case-insensitive) is one of
    the common raster formats below. Paths are returned in os.walk order.
    """
    image_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if os.path.splitext(name)[1].lower() in image_extensions
    ]
def interact_with_agent(file_input, file_input_2, additional_notes):
    """Run the data-analysis agent over the uploaded CSV, streaming chat updates.

    Parameters
    ----------
    file_input : str
        Path of the primary CSV file to analyze.
    file_input_2 : str or None
        Optional second CSV; currently only previewed via print, not analyzed.
    additional_notes : str
        Free-text user context, enriched via Gemini before prompting the agent.

    Yields
    ------
    list
        Progressively growing chat history (gr.ChatMessage items), including
        any figures the agent saves under ./figures/.
    """
    # Start from a clean figures folder. ignore_errors avoids the crash the
    # original had on the very first run, when ./figures does not exist yet.
    shutil.rmtree("./figures", ignore_errors=True)
    os.makedirs("./figures")

    # Read the primary file once and reuse it (original re-read it 3 times).
    data_file = pd.read_csv(file_input)
    print(data_file.head())
    # The second upload is optional — guard instead of crashing on None.
    if file_input_2 is not None:
        file_2 = pd.read_csv(file_input_2)
        print(file_2.head())

    data_structure_notes = f"""- Description (output of .describe()):
{data_file.describe()}
- Columns with dtypes:
{data_file.dtypes}"""
    # Ask Gemini to infer the dataset's domain and produce expert guidance.
    enhanced_notes = model_response(f'''Given the metadata of the dataset {data_structure_notes} and the context provided by the user {additional_notes}, figure out the
domain this dataset belongs to. Now assume the role of an expert data analyst in this domain and generate instructions/commentary that will help a large language model analyze
this dataset.''')
    prompt = base_prompt.format(structure_notes=data_structure_notes)
    if additional_notes:  # truthiness covers both None and ""
        prompt += "\nAdditional notes on the data:\n" + enhanced_notes

    messages = [gr.ChatMessage(role="user", content=enhanced_notes)]
    yield messages + [
        gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
    ]

    plot_image_paths = {}
    for msg in stream_to_gradio(agent, prompt, data_file=data_file):
        messages.append(msg)
        # Surface any newly created figures as inline chat images, once each.
        for image_path in get_images_in_directory("./figures"):
            if image_path not in plot_image_paths:
                plot_image_paths[image_path] = True
                messages.append(gr.ChatMessage(
                    role="assistant",
                    content=FileData(path=image_path, mime_type="image/png"),
                ))
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
        ]
    yield messages
# Build the Gradio UI: two file uploads, a notes box, and a streaming chatbot.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.green,
        secondary_hue=gr.themes.colors.blue,
    )
) as demo:
    gr.Markdown("""# Agentville Autonomous Data Exploration 📈🧠 (Research preview)
Drop a `.csv` file below, add notes to describe this data if needed, and **Agents powered by Gemini and Llama-3.1-70B will analyze the file content and does the analysis for you!**""")
    file_input = gr.File(label="Your file to analyze")
    # Distinct label (original duplicated the first label) so the two uploads
    # are distinguishable; the second file is optional.
    file_input_2 = gr.File(label="Optional second file to analyze")
    text_input = gr.Textbox(
        label="Additional notes to guide the analysis"
    )
    submit = gr.Button("Run analysis!", variant="primary")
    chatbot = gr.Chatbot(
        label="Data Analyst Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )
    gr.Examples(
        # Each example row must supply one value per component in `inputs`
        # (3 here) — the original 2-element row mismatched the 3 inputs.
        examples=[["./example/churn.csv", None, example_notes]],
        inputs=[file_input, file_input_2, text_input],
        cache_examples=False
    )
    submit.click(interact_with_agent, [file_input, file_input_2, text_input], [chatbot])

if __name__ == "__main__":
    demo.launch()