# Spaces: Build error (page status text captured together with this file)
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from langchain.agents import AgentExecutor, create_react_agent
from langchain.agents.agent_types import AgentType
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.tools.python.tool import PythonAstREPLTool
from langchain_core.output_parsers import StrOutputParser
from langchain_experimental.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from ydata_profiling import ProfileReport
# Set page configuration
# The icon literal had been mis-decoded to a lone Greek letter; restored to a
# real emoji (NOTE(review): the chart emoji is inferred from context - confirm).
st.set_page_config(
    page_title="Interactive Data Profiler & Chat",
    layout="wide",
    page_icon="📊",
)

# Seed the session-state slots the rest of the script reads, but only when they
# are missing so values survive Streamlit's script reruns.
for _state_key, _state_default in (
    ("df", None),
    ("chat_history", []),
    ("suggestions", []),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# Initialize Hugging Face API
def get_llm():
    """Create the Hugging Face Hub LLM client used by the chat feature.

    The API token is read from the ``HUGGINGFACE_API_TOKEN`` environment
    variable; an empty string is passed when the variable is not set.

    Returns:
        A ``HuggingFaceHub`` instance configured for ``google/flan-t5-large``
        with temperature 0.1 and a maximum length of 512.
    """
    api_token = os.environ.get("HUGGINGFACE_API_TOKEN", "")
    generation_settings = {"temperature": 0.1, "max_length": 512}
    # Using a small but capable open-source model
    return HuggingFaceHub(
        repo_id="google/flan-t5-large",
        model_kwargs=generation_settings,
        huggingfacehub_api_token=api_token,
    )
# Function to generate report
def generate_profile_report(df):
    """Create a profiling report object for ``df`` while a spinner is shown.

    Args:
        df: The DataFrame to profile.

    Returns:
        The ``ProfileReport`` built with the title "Profiling Report" and both
        ``explorative`` and ``minimal`` enabled.
    """
    report_options = {
        "title": "Profiling Report",
        "explorative": True,
        "minimal": True,  # Minimal for faster processing
    }
    with st.spinner("Generating profile report..."):
        return ProfileReport(df, **report_options)
# Function to generate query suggestions
def generate_suggestions(df):
    """Build starter questions tailored to the columns of ``df``.

    Args:
        df: DataFrame whose column labels and dtypes drive the suggestions.

    Returns:
        A list of question strings, in this order:

        * three generic questions (row count, column names, first 5 rows);
        * an "average" question for the first numeric column, omitted when
          the frame has no numeric column;
        * for each of the first three columns: mean and maximum questions
          when the column is numeric, or unique-value and missing-value
          questions when its dtype name contains ``object`` or ``str``.
    """

    def _is_number_column(dtype):
        # Booleans count as numeric for pandas, but "mean of a flag" is not a
        # useful starter question, so they are excluded here.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    columns = list(zip(df.columns.tolist(), df.dtypes))

    suggestions = [
        "How many rows are in this dataset?",
        "What are all the column names?",
        "Show me the first 5 rows",
    ]

    # Only offer an average for a column that can actually be averaged;
    # the first column of a CSV is frequently an id or a name.
    first_numeric = next(
        (name for name, dtype in columns if _is_number_column(dtype)),
        None,
    )
    if first_numeric is not None:
        suggestions.append(f"What is the average of {first_numeric}")

    # Add column-specific suggestions (slicing already copes with < 3 columns)
    for name, dtype in columns[:3]:
        dtype_name = str(dtype)
        if _is_number_column(dtype):
            suggestions.append(f"What is the mean value of {name}?")
            suggestions.append(f"What is the maximum value of {name}?")
        elif "object" in dtype_name or "str" in dtype_name:
            suggestions.append(f"What are the unique values in {name}?")
            suggestions.append(f"How many missing values in {name}?")
    return suggestions
# Function to execute pandas operations safely
_MAX_UNIQUE_SHOWN = 20  # cap on how many distinct values the fallback lists


def execute_pandas_query(df, query):
    """Answer a natural-language question about ``df``.

    The question is first handed to a LangChain pandas DataFrame agent. If
    building or running the agent raises, a keyword-based fallback answers
    the common questions directly with pandas.

    Args:
        df: The DataFrame being questioned.
        query: The user's question as plain text.

    Returns:
        The agent's result, or the fallback's answer string, or a message
        beginning "I couldn't process that query." that includes the agent
        error when neither could answer.
    """
    try:
        # NOTE(review): this agent lets the model run Python against df.
        # Recent langchain_experimental releases reportedly require an
        # explicit opt-in flag for that - confirm against the pinned version.
        agent = create_pandas_dataframe_agent(
            llm=get_llm(),
            df=df,
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True,
        )
        return agent.run(query)
    except Exception as e:
        # Deliberate best-effort boundary: any agent failure degrades to the
        # local keyword fallback instead of surfacing a traceback to the user.
        answer = _fallback_answer(df, query)
        if answer is None:
            return f"I couldn't process that query. Error: {str(e)}"
        return answer


def _mentioned_column(df, lowered_query):
    """Return the position of the longest column label named in the query, or None."""
    best_pos, best_len = None, 0
    for pos, label in enumerate(df.columns):
        name = str(label).lower()
        if len(name) <= best_len:
            continue
        # Whole-word match only, so a column called "age" is not "found"
        # inside the word "average".
        if re.search(rf"(?<!\w){re.escape(name)}(?!\w)", lowered_query):
            best_pos, best_len = pos, len(name)
    return best_pos


def _fallback_answer(df, query):
    """Answer simple, recognisable questions with pandas; return None if unrecognised."""
    lowered = query.lower()

    if "rows" in lowered and "how many" in lowered:
        return f"The dataset has {df.shape[0]} rows."
    if "columns" in lowered and "how many" in lowered:
        return f"The dataset has {df.shape[1]} columns."
    if "column names" in lowered:
        # Labels are stringified first: join() rejects non-string labels
        # such as the integer headers of a header-less CSV.
        return f"The column names are: {', '.join(map(str, df.columns))}"
    if "first" in lowered and "rows" in lowered:
        # Standalone number anywhere in the question, e.g. "first 5, rows".
        count = re.search(r"\b\d+\b", query)
        return df.head(int(count.group()) if count else 5).to_string()
    if "describe" in lowered:
        return df.describe().to_string()

    # Column-specific questions, matching the ones the app itself suggests.
    pos = _mentioned_column(df, lowered)
    if pos is None:
        return None
    label = df.columns[pos]
    series = df.iloc[:, pos]  # positional, so duplicate labels stay a Series

    if "missing" in lowered:
        return f"{label} has {int(series.isna().sum())} missing values."
    if "unique" in lowered:
        values = series.dropna().unique().tolist()
        shown = ", ".join(str(value) for value in values[:_MAX_UNIQUE_SHOWN])
        note = ""
        if len(values) > _MAX_UNIQUE_SHOWN:
            note = f" (showing {_MAX_UNIQUE_SHOWN} of {len(values)})"
        return f"Unique values in {label}: {shown}{note}"
    if pd.api.types.is_numeric_dtype(series.dtype):
        if "mean" in lowered or "average" in lowered:
            return f"The mean value of {label} is {float(series.mean())}."
        if "max" in lowered:
            return f"The maximum value of {label} is {series.max()}."
    return None
# Main app header
# NOTE(review): the emoji in this section had been mis-decoded into stray Greek
# letters; they are restored from context - confirm the exact glyphs.
st.title("📊 Interactive Data Profiler & Chat")
st.markdown("""
Upload your CSV file to get detailed profiling and ask questions about your data!
This app combines interactive data profiling with a chat interface for data exploration.
""")


def _queue_question(text=None):
    """Widget callback: stash a question so the next script run answers it once.

    Called with no argument from the text input (takes what was typed and
    empties the box) or with a suggestion string from a suggestion button.
    Streamlit only allows a widget's session-state key to be assigned before
    the widget is created in a run; callbacks execute before the rerun, so
    this is the place where the input box may be reset.
    """
    if text is None:
        text = st.session_state.get("question_input", "")
        st.session_state["question_input"] = ""
    st.session_state["pending_question"] = text


def _clear_chat():
    """Widget callback: drop the conversation and any not-yet-answered question."""
    st.session_state.chat_history = []
    st.session_state["pending_question"] = None


# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

# Process uploaded file
if uploaded_file is not None:
    try:
        # Parse the CSV and rebuild suggestions only when the upload changes;
        # Streamlit reruns this whole script on every widget interaction.
        file_key = (
            getattr(uploaded_file, "file_id", None),
            uploaded_file.name,
            uploaded_file.size,
        )
        if st.session_state.get("loaded_file_key") != file_key or st.session_state.df is None:
            uploaded_file.seek(0)
            st.session_state.df = pd.read_csv(uploaded_file)
            st.session_state.suggestions = generate_suggestions(st.session_state.df)
            st.session_state["report_html"] = None  # invalidate the cached report
            st.session_state["loaded_file_key"] = file_key
        df = st.session_state.df

        st.success(f"✅ File uploaded successfully! Found {df.shape[0]} rows and {df.shape[1]} columns.")

        # Create tabs for different functionalities
        tab1, tab2 = st.tabs(["📊 Data Profiling", "💬 Data Chat"])

        # Tab 1: Data Profiling
        with tab1:
            st.header("Data Profiling")
            # Basic info
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Rows", df.shape[0])
            with col2:
                st.metric("Columns", df.shape[1])
            with col3:
                # Plain int: the sum is a numpy integer
                st.metric("Missing Values", int(df.isna().sum().sum()))
            # Show raw data sample
            with st.expander("Preview Data"):
                st.dataframe(df.head(10))
            # Render the report once per uploaded file and reuse the HTML on
            # later reruns (chat questions would otherwise re-profile the data)
            if st.session_state.get("report_html") is None:
                st.session_state["report_html"] = generate_profile_report(df).to_html()
            report_html = st.session_state["report_html"]
            st.components.v1.html(report_html, height=1000, scrolling=True)
            # Provide download button
            st.write("### Download the Profiling Report")
            st.download_button(
                label="Download Report (HTML)",
                data=report_html.encode('utf-8'),
                file_name="profiling_report.html",
                mime="text/html",
            )

        # Tab 2: Interactive Chat
        with tab2:
            st.header("Chat with Your Data")
            st.info("Ask questions about your data and get instant answers!")
            # Chat input; the callback queues the question and empties the box
            st.text_input("Your question:", key="question_input", on_change=_queue_question)
            # Show suggestion chips
            st.write("Suggested questions:")
            cols = st.columns(2)
            for i, suggestion in enumerate(st.session_state.suggestions):
                with cols[i % 2]:
                    # The click itself triggers a rerun, so no explicit rerun
                    # call is needed (st.experimental_rerun no longer exists
                    # in current Streamlit releases).
                    st.button(
                        suggestion,
                        key=f"suggestion_{i}",
                        on_click=_queue_question,
                        args=(suggestion,),
                    )

            # Process the queued question exactly once; consuming it here keeps
            # unrelated reruns from re-asking it and duplicating the history.
            user_question = st.session_state.get("pending_question")
            st.session_state["pending_question"] = None
            if user_question:
                st.session_state.chat_history.append({"role": "user", "content": user_question})
                # Get answer
                with st.spinner("Thinking..."):
                    answer = execute_pandas_query(df, user_question)
                # Add answer to chat history
                st.session_state.chat_history.append({"role": "assistant", "content": answer})

            # Display chat history
            st.write("### Conversation History")
            for message in st.session_state.chat_history:
                if message["role"] == "user":
                    st.markdown(f"**You:** {message['content']}")
                else:
                    st.markdown(f"**Assistant:** {message['content']}")
                st.markdown("---")

            # Clear chat button; the callback runs before the rerun, so the
            # history above is already empty when the page redraws
            st.button("Clear Chat History", on_click=_clear_chat)
    except Exception as e:
        # Top-level UI boundary: show the failure instead of a raw traceback
        st.error(f"An error occurred: {str(e)}")
else:
    st.info("👆 Please upload a CSV file to begin.")
    # Placeholder visuals
    st.markdown("### What you can do with this app:")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**📊 Data Profiling**")
        st.markdown("- Automatic data quality assessment")
        st.markdown("- Column statistics and distributions")
        st.markdown("- Correlation analysis")
        st.markdown("- Missing values analysis")
    with col2:
        st.markdown("**💬 Interactive Data Chat**")
        st.markdown("- Ask natural language questions")
        st.markdown("- Get instant insights")
        st.markdown("- Suggested questions for quick exploration")
        st.markdown("- No coding required!")