# chat-with-csv / app.py
# Hosting-page residue kept for reference: "Shreneek's picture",
# "Create app.py", "28545e3 verified".
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import json
import os
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.tools.python.tool import PythonAstREPLTool
from langchain.agents import AgentExecutor, create_react_agent
from langchain_experimental.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
# Set page configuration.
# The icon literal had been mis-decoded (UTF-8 bytes read with a legacy
# code page); it is restored to the intended bar-chart emoji.
st.set_page_config(
    page_title="Interactive Data Profiler & Chat",
    layout="wide",
    page_icon="📊",
)

# Seed the session-state slots used by the rest of the script, leaving any
# value that already exists from an earlier run untouched:
#   df            -> the most recently loaded DataFrame (None until upload)
#   chat_history  -> list of {"role", "content"} message dicts
#   suggestions   -> list of suggested question strings
if 'df' not in st.session_state:
    st.session_state.df = None
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'suggestions' not in st.session_state:
    st.session_state.suggestions = []
# Build the Hugging Face Hub language model used for chat queries
def get_llm():
    """Return a ``HuggingFaceHub`` LLM bound to ``google/flan-t5-large``.

    The API token is read from the ``HUGGINGFACE_API_TOKEN`` environment
    variable; an empty string is passed when the variable is not set.
    """
    api_token = os.environ.get("HUGGINGFACE_API_TOKEN", "")
    # Low temperature and a capped output length, as in the original setup.
    generation_settings = {"temperature": 0.1, "max_length": 512}
    return HuggingFaceHub(
        repo_id="google/flan-t5-large",
        model_kwargs=generation_settings,
        huggingfacehub_api_token=api_token,
    )
# Build the profiling report for a DataFrame
def generate_profile_report(df):
    """Create a ``ProfileReport`` for ``df`` while showing a spinner.

    Args:
        df: The DataFrame to profile.

    Returns:
        The ``ProfileReport`` object titled "Profiling Report", created with
        ``explorative=True`` and ``minimal=True``.
    """
    report_options = {
        "title": "Profiling Report",
        "explorative": True,
        "minimal": True,  # minimal mode for faster processing
    }
    with st.spinner("Generating profile report..."):
        return ProfileReport(df, **report_options)
# Function to generate query suggestions
def generate_suggestions(df):
    """Build a list of starter questions tailored to the columns of ``df``.

    The list always opens with three dataset-level questions. An
    "average of <column>" question is added only when the frame has a
    numeric column, and it targets the first such column rather than
    blindly using column 0. For each of the first three columns the list
    then gets type-specific questions (mean/maximum for numeric columns,
    unique values for text columns) plus a missing-values question.

    Dtypes are classified with ``pandas.api.types`` instead of substring
    tests on the dtype name, so nullable ``Int64``/``Float64`` columns are
    recognised as numeric and ``interval[int64]`` columns are not.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A list of question strings.
    """
    dtype_api = pd.api.types

    def is_numeric(dtype):
        # pandas counts bool as numeric; mean/max prompts are not useful
        # for flags, so booleans are excluded (as the original name-based
        # check also did).
        return dtype_api.is_numeric_dtype(dtype) and not dtype_api.is_bool_dtype(dtype)

    def is_textual(dtype):
        return dtype_api.is_object_dtype(dtype) or dtype_api.is_string_dtype(dtype)

    columns = list(zip(df.columns.tolist(), df.dtypes.tolist()))

    suggestions = [
        "How many rows are in this dataset?",
        "What are all the column names?",
        "Show me the first 5 rows",
    ]

    numeric_columns = [col for col, dtype in columns if is_numeric(dtype)]
    if numeric_columns:
        suggestions.append(f"What is the average of {numeric_columns[0]}")

    # Column-specific suggestions for at most the first three columns.
    for col, dtype in columns[:3]:
        if is_numeric(dtype):
            suggestions.append(f"What is the mean value of {col}?")
            suggestions.append(f"What is the maximum value of {col}?")
        elif is_textual(dtype):
            suggestions.append(f"What are the unique values in {col}?")
        suggestions.append(f"How many missing values in {col}?")

    return suggestions
# Answer a natural-language question about a DataFrame
def _fallback_answer(df, query, error):
    """Answer ``query`` with keyword rules when the agent could not."""
    lowered = query.lower()

    if "rows" in lowered and "how many" in lowered:
        return f"The dataset has {df.shape[0]} rows."
    if "columns" in lowered and "how many" in lowered:
        return f"The dataset has {df.shape[1]} columns."
    if "column names" in lowered:
        # Labels are stringified so non-string column names cannot break join.
        return f"The column names are: {', '.join(map(str, df.columns))}"
    if "first" in lowered and "rows" in lowered:
        num = 5  # Default
        for word in query.split():
            # Drop surrounding punctuation so "10," or "(10)" still count.
            token = word.strip(".,;:!?()[]{}\"'")
            # isdecimal() only accepts characters int() can parse, unlike
            # isdigit(), which also accepts e.g. superscript digits.
            if token.isdecimal():
                num = int(token)
                break
        return df.head(num).to_string()
    if "describe" in lowered:
        return df.describe().to_string()
    return f"I couldn't process that query. Error: {str(error)}"


def execute_pandas_query(df, query):
    """Answer ``query`` about ``df`` via the pandas agent, with a fallback.

    A zero-shot ReAct pandas agent is created around ``get_llm()`` and run
    on the query. If creating or running the agent raises, a small set of
    keyword rules answers row/column counts, column names, "first N rows"
    and "describe" requests; anything else yields a message containing the
    original error.

    Args:
        df: The DataFrame being queried.
        query: The user's question as free text.

    Returns:
        The agent's result, or a string produced by the fallback rules.
    """
    try:
        # NOTE(review): nothing in this function sandboxes the agent. The
        # langchain pandas agent is documented to run model-generated Python
        # against the DataFrame - confirm that is acceptable for the
        # deployment before treating this path as "safe".
        agent = create_pandas_dataframe_agent(
            llm=get_llm(),
            df=df,
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True,
        )
        return agent.run(query)
    except Exception as e:
        # Best-effort: fall back to basic operations if the agent fails.
        return _fallback_answer(df, query, e)
# Main app header.
# The leading emoji had been mis-decoded; it is restored to the intended
# magnifying-glass character.
st.title("🔍 Interactive Data Profiler & Chat")
st.markdown("""
Upload your CSV file to get detailed profiling and ask questions about your data!
This app combines interactive data profiling with a chat interface for data exploration.
""")

# File uploader: yields None until the user picks a CSV file.
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
# Process uploaded file
def _queue_question(question=None):
    """Widget callback: stage a question to be answered once on the next run.

    Called with no argument (text box ``on_change``) it stages the current
    text of the question box. Called with a suggestion (chip ``on_click``)
    it also copies that suggestion into the question box.
    """
    if question is None:
        question = st.session_state.question_input
    else:
        # Streamlit rejects writes to a widget's session-state key once the
        # widget exists in the current run; callbacks execute before the
        # rerun creates the widget, so the write is allowed here.
        st.session_state.question_input = question
    st.session_state.pending_question = question


def _clear_chat_history():
    """Widget callback: drop every stored chat message."""
    st.session_state.chat_history = []


if uploaded_file is not None:
    try:
        # Read CSV into DataFrame. The upload is a BytesIO subclass, so it is
        # rewound first in case a previous run left the cursor at the end.
        uploaded_file.seek(0)
        df = pd.read_csv(uploaded_file)
        st.session_state.df = df
        st.success(f"✅ File uploaded successfully! Found {df.shape[0]} rows and {df.shape[1]} columns.")

        # Detect a newly uploaded file by name and size. Only then are the
        # suggestions rebuilt and the cached report discarded, so a second
        # upload no longer keeps the first file's suggestions and ordinary
        # reruns (chat interactions) do not redo the expensive profiling.
        file_signature = (uploaded_file.name, uploaded_file.size)
        if ('file_signature' not in st.session_state
                or st.session_state.file_signature != file_signature):
            st.session_state.suggestions = generate_suggestions(df)
            st.session_state.report_html = None
            st.session_state.file_signature = file_signature

        # Create tabs for different functionalities
        tab1, tab2 = st.tabs(["📊 Data Profiling", "💬 Data Chat"])

        # Tab 1: Data Profiling
        with tab1:
            st.header("Data Profiling")

            # Basic info
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Rows", df.shape[0])
            with col2:
                st.metric("Columns", df.shape[1])
            with col3:
                st.metric("Missing Values", df.isna().sum().sum())

            # Show raw data sample
            with st.expander("Preview Data"):
                st.dataframe(df.head(10))

            # Generate the profile report once per uploaded file and reuse
            # the rendered HTML on later reruns.
            if ('report_html' not in st.session_state
                    or st.session_state.report_html is None):
                profile = generate_profile_report(df)
                st.session_state.report_html = profile.to_html()
            report_html = st.session_state.report_html
            st.components.v1.html(report_html, height=1000, scrolling=True)

            # Provide download button
            st.write("### Download the Profiling Report")
            report_bytes = report_html.encode('utf-8')
            st.download_button(
                label="Download Report (HTML)",
                data=report_bytes,
                file_name="profiling_report.html",
                mime="text/html"
            )

        # Tab 2: Interactive Chat
        with tab2:
            st.header("Chat with Your Data")
            st.info("Ask questions about your data and get instant answers!")

            if 'pending_question' not in st.session_state:
                st.session_state.pending_question = ""

            # Chat input and suggested questions. Questions are staged by
            # callbacks rather than read from the widget value, so a rerun
            # triggered by anything else does not re-ask the old question.
            st.text_input("Your question:", key="question_input", on_change=_queue_question)

            # Show suggestion chips
            st.write("Suggested questions:")
            cols = st.columns(2)
            for i, suggestion in enumerate(st.session_state.suggestions):
                with cols[i % 2]:
                    st.button(
                        suggestion,
                        key=f"suggestion_{i}",
                        on_click=_queue_question,
                        args=(suggestion,),
                    )

            # Consume the staged question so it is answered exactly once.
            user_question = st.session_state.pending_question
            st.session_state.pending_question = ""

            # Process question
            if user_question:
                st.session_state.chat_history.append({"role": "user", "content": user_question})
                # Get answer
                with st.spinner("Thinking..."):
                    answer = execute_pandas_query(df, user_question)
                # Add answer to chat history
                st.session_state.chat_history.append({"role": "assistant", "content": answer})

            # Display chat history
            st.write("### Conversation History")
            for message in st.session_state.chat_history:
                if message["role"] == "user":
                    st.markdown(f"**You:** {message['content']}")
                else:
                    st.markdown(f"**Assistant:** {message['content']}")
                st.markdown("---")

            # Clear chat button: the callback empties the history before the
            # rerun, so no explicit rerun call is needed.
            st.button("Clear Chat History", on_click=_clear_chat_history)
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
else:
    st.info("👆 Please upload a CSV file to begin.")

    # Placeholder visuals
    st.markdown("### What you can do with this app:")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**📊 Data Profiling**")
        st.markdown("- Automatic data quality assessment")
        st.markdown("- Column statistics and distributions")
        st.markdown("- Correlation analysis")
        st.markdown("- Missing values analysis")
    with col2:
        st.markdown("**💬 Interactive Data Chat**")
        st.markdown("- Ask natural language questions")
        st.markdown("- Get instant insights")
        st.markdown("- Suggested questions for quick exploration")
        st.markdown("- No coding required!")