# Source: Hugging Face Space "Shreneek/chat-with-csv" (Space status: Build error)
# File size: 9,069 bytes
# Revision: 28545e3

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import json
import os
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.tools.python.tool import PythonAstREPLTool
from langchain.agents import AgentExecutor, create_react_agent
from langchain_experimental.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType

# Configure the browser tab (title, icon) and use the full page width
st.set_page_config(page_title="Interactive Data Profiler & Chat", layout="wide", page_icon="📊")

# Seed the per-session state on the first run only: the uploaded DataFrame,
# the chat transcript, and the list of suggested questions. Factories are
# used so that every session gets its own fresh list object.
for _state_key, _make_default in (
    ("df", lambda: None),
    ("chat_history", list),
    ("suggestions", list),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()

# Initialize Hugging Face API
def get_llm():
    """Create the Hugging Face Hub LLM client used by the chat agent.

    The API token is read from the ``HUGGINGFACE_API_TOKEN`` environment
    variable; an empty string is passed when the variable is not set.

    Returns:
        A ``HuggingFaceHub`` instance configured for ``google/flan-t5-large``
        with a temperature of 0.1 and a maximum length of 512.
    """
    api_token = os.environ.get("HUGGINGFACE_API_TOKEN", "")
    generation_settings = {"temperature": 0.1, "max_length": 512}

    # Using a small but capable open-source model
    return HuggingFaceHub(
        repo_id="google/flan-t5-large",
        model_kwargs=generation_settings,
        huggingfacehub_api_token=api_token,
    )

# Function to generate report
def generate_profile_report(df):
    """Create a ``ProfileReport`` for ``df`` while showing a Streamlit spinner.

    Args:
        df: The DataFrame to profile.

    Returns:
        The ``ProfileReport`` object built with the title "Profiling Report"
        and both ``explorative`` and ``minimal`` enabled.
    """
    report_options = {
        "title": "Profiling Report",
        "explorative": True,
        "minimal": True,  # Minimal for faster processing
    }
    with st.spinner("Generating profile report..."):
        return ProfileReport(df, **report_options)

# Function to generate query suggestions
def generate_suggestions(df):
    """Build a list of starter questions tailored to the DataFrame's columns.

    The list always begins with three generic questions. An "average of"
    question is added for the first numeric column (if there is one), and the
    first three columns each contribute two questions depending on whether
    they hold numbers or text. Columns of any other kind contribute nothing.

    Args:
        df: The DataFrame the user uploaded.

    Returns:
        A list of question strings, in display order.
    """
    dtype_checks = pd.api.types

    def is_number_column(dtype):
        # Use pandas' own dtype predicates rather than matching substrings of
        # the dtype name: this recognises nullable "Int64"/"Float64" columns
        # and no longer mistakes "interval[int64]" for a numeric column.
        # Booleans are numeric to pandas but were never treated as numbers
        # here, so they stay excluded.
        return dtype_checks.is_numeric_dtype(dtype) and not dtype_checks.is_bool_dtype(dtype)

    def is_text_column(dtype):
        return dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype)

    columns = list(df.dtypes.items())

    suggestions = [
        "How many rows are in this dataset?",
        "What are all the column names?",
        "Show me the first 5 rows",
    ]

    # Only offer an average for a column that can actually be averaged;
    # skip the question entirely when the data has no numeric column.
    numeric_columns = [col for col, dtype in columns if is_number_column(dtype)]
    if numeric_columns:
        suggestions.append(f"What is the average of {numeric_columns[0]}")

    # Add column-specific suggestions for the first three columns
    for col, dtype in columns[:3]:
        if is_number_column(dtype):
            suggestions.append(f"What is the mean value of {col}?")
            suggestions.append(f"What is the maximum value of {col}?")
        elif is_text_column(dtype):
            suggestions.append(f"What are the unique values in {col}?")
            suggestions.append(f"How many missing values in {col}?")

    return suggestions

# Function to execute pandas operations safely
def _fallback_answer(df, query, error):
    """Answer a few common questions directly with pandas, without the LLM.

    Args:
        df: The DataFrame being queried.
        query: The user's question.
        error: The exception raised by the agent, reported back to the user
            when no keyword rule matches the question.

    Returns:
        A string answer, or an "I couldn't process that query" message.
    """
    text = query.lower()
    asks_how_many = "how many" in text

    if asks_how_many and "rows" in text:
        return f"The dataset has {df.shape[0]} rows."
    if asks_how_many and "columns" in text:
        return f"The dataset has {df.shape[1]} columns."
    if "column names" in text:
        # Column labels are not guaranteed to be strings; str.join would
        # raise TypeError on e.g. integer labels.
        return f"The column names are: {', '.join(map(str, df.columns))}"
    if "first" in text and "rows" in text:
        num = 5  # Default
        for word in query.split():
            token = word.strip(".,;:!?")
            # isdecimal() only accepts characters int() can parse, unlike
            # isdigit(), which is also true for characters such as "²".
            if token.isdecimal():
                num = int(token)
                break
        return df.head(num).to_string()
    if "describe" in text:
        return df.describe().to_string()
    return f"I couldn't process that query. Error: {str(error)}"


def execute_pandas_query(df, query):
    """Answer a natural-language question about ``df``.

    A LangChain pandas DataFrame agent is tried first. If creating or running
    the agent raises, a small set of keyword-based pandas answers is used
    instead.

    Args:
        df: The DataFrame to query.
        query: The user's question.

    Returns:
        The agent's result, or the fallback answer string.
    """
    try:
        # Create pandas agent
        agent = create_pandas_dataframe_agent(
            llm=get_llm(),
            df=df,
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True
        )

        # Execute query
        # NOTE(review): `run` appears to be deprecated in newer LangChain
        # releases in favour of `invoke` - confirm against the pinned version.
        return agent.run(query)
    except Exception as e:
        # Deliberately broad: any agent failure (missing token, model error,
        # unparsable output) falls back to basic operations.
        return _fallback_answer(df, query, e)

# Widget callbacks.
# Streamlit runs a widget's callback before the rerun that the interaction
# triggers. That is the only point at which the value of an already-keyed
# widget (the question box) may be changed through session state, and it lets
# a question be queued exactly once instead of being re-answered on every
# rerun. Because the interaction itself causes a rerun, no explicit rerun
# call is needed.
def _queue_typed_question():
    """Queue the text currently in the question box for answering."""
    st.session_state.pending_question = st.session_state.question_input


def _queue_suggested_question(suggestion):
    """Copy a clicked suggestion into the question box and queue it."""
    st.session_state.question_input = suggestion
    st.session_state.pending_question = suggestion


def _clear_chat_history():
    """Empty the conversation and drop any question that is still queued."""
    st.session_state.chat_history = []
    st.session_state.pending_question = None


# Main app header
st.title("🔍 Interactive Data Profiler & Chat")
st.markdown("""
Upload your CSV file to get detailed profiling and ask questions about your data!
This app combines interactive data profiling with a chat interface for data exploration.
""")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

# Process uploaded file
if uploaded_file is not None:
    try:
        # Read CSV into DataFrame
        df = pd.read_csv(uploaded_file)
        st.session_state.df = df
        st.success(f"✅ File uploaded successfully! Found {df.shape[0]} rows and {df.shape[1]} columns.")

        # Cheap identity for the upload, used to notice when the user swaps
        # files. Two different files with the same name and size would be
        # treated as the same upload.
        file_signature = (uploaded_file.name, uploaded_file.size)

        # Generate suggestions when a new file is uploaded
        if (not st.session_state.get("suggestions")
                or st.session_state.get("suggestions_source") != file_signature):
            st.session_state.suggestions = generate_suggestions(df)
            st.session_state.suggestions_source = file_signature

        # Create tabs for different functionalities
        tab1, tab2 = st.tabs(["📊 Data Profiling", "💬 Data Chat"])

        # Tab 1: Data Profiling
        with tab1:
            st.header("Data Profiling")

            # Basic info
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Rows", df.shape[0])
            with col2:
                st.metric("Columns", df.shape[1])
            with col3:
                # Convert the NumPy integer to a plain int for display
                st.metric("Missing Values", int(df.isna().sum().sum()))

            # Show raw data sample
            with st.expander("Preview Data"):
                st.dataframe(df.head(10))

            # Generate the profile report once per uploaded file. The script
            # reruns on every widget interaction (including each chat
            # question), so the rendered HTML is kept in session state rather
            # than being rebuilt each time.
            if st.session_state.get("report_source") != file_signature:
                profile = generate_profile_report(df)
                st.session_state.report_html = profile.to_html()
                st.session_state.report_source = file_signature
            report_html = st.session_state.report_html

            # Display the HTML report
            st.components.v1.html(report_html, height=1000, scrolling=True)

            # Provide download button
            st.write("### Download the Profiling Report")
            report_bytes = report_html.encode('utf-8')
            st.download_button(
                label="Download Report (HTML)",
                data=report_bytes,
                file_name="profiling_report.html",
                mime="text/html"
            )

        # Tab 2: Interactive Chat
        with tab2:
            st.header("Chat with Your Data")
            st.info("Ask questions about your data and get instant answers!")

            # Chat input and suggested questions
            st.text_input(
                "Your question:",
                key="question_input",
                on_change=_queue_typed_question,
            )

            # Show suggestion chips
            st.write("Suggested questions:")
            cols = st.columns(2)
            for i, suggestion in enumerate(st.session_state.suggestions):
                with cols[i % 2]:
                    st.button(
                        suggestion,
                        key=f"suggestion_{i}",
                        on_click=_queue_suggested_question,
                        args=(suggestion,),
                    )

            # Process the queued question, if any. It is consumed here so
            # that unrelated reruns (download, clear, tab switches) do not
            # ask it again.
            user_question = st.session_state.get("pending_question")
            st.session_state.pending_question = None
            if user_question:
                st.session_state.chat_history.append({"role": "user", "content": user_question})

                # Get answer
                with st.spinner("Thinking..."):
                    answer = execute_pandas_query(df, user_question)

                # Add answer to chat history
                st.session_state.chat_history.append({"role": "assistant", "content": answer})

            # Display chat history
            st.write("### Conversation History")
            for message in st.session_state.chat_history:
                if message["role"] == "user":
                    st.markdown(f"**You:** {message['content']}")
                else:
                    st.markdown(f"**Assistant:** {message['content']}")
                st.markdown("---")

            # Clear chat button (the callback runs before the rerun, so the
            # history above is already empty when the page is redrawn)
            st.button("Clear Chat History", on_click=_clear_chat_history)

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
else:
    st.info("👆 Please upload a CSV file to begin.")

    # Placeholder visuals
    st.markdown("### What you can do with this app:")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**📊 Data Profiling**")
        st.markdown("- Automatic data quality assessment")
        st.markdown("- Column statistics and distributions")
        st.markdown("- Correlation analysis")
        st.markdown("- Missing values analysis")
    with col2:
        st.markdown("**💬 Interactive Data Chat**")
        st.markdown("- Ask natural language questions")
        st.markdown("- Get instant insights")
        st.markdown("- Suggested questions for quick exploration")
        st.markdown("- No coding required!")