amiguel committed on
Commit
af7a4c4
·
verified ·
1 Parent(s): b3d73dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -1
app.py CHANGED
@@ -1 +1,123 @@
1
- <...code truncated to avoid repeating full content here...>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import tempfile
4
+ import os
5
+ import json
6
+ from pathlib import Path
7
+
8
+ from langchain.document_loaders import DataFrameLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.embeddings import HuggingFaceEmbeddings
11
+ from langchain.vectorstores import FAISS
12
+ from langchain.chains import RetrievalQAWithSourcesChain
13
+ from langchain import HuggingFacePipeline
14
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
15
+
16
# Remote avatar images passed to st.chat_message for the user and assistant roles.
+ USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
17
+ BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
18
# JSON file read by load_chat_history and written by save_chat_history.
# The path is relative, so it resolves against the process working directory.
+ CHAT_HISTORY_FILE = Path("chat_memory.json")
19
+
20
def load_chat_history(path=None):
    """Load the persisted chat history from a JSON file.

    Args:
        path: Optional file to read instead of the module-level
            ``CHAT_HISTORY_FILE`` (the default when ``None``).

    Returns:
        The stored list of messages, or an empty list when the file is
        missing, unreadable, not valid JSON, or does not contain a list.
    """
    history_file = Path(CHAT_HISTORY_FILE if path is None else path)
    try:
        with open(history_file, "r", encoding="utf-8") as f:
            history = json.load(f)
    except (OSError, ValueError):
        # A missing file (OSError) or a truncated/corrupt one (JSONDecodeError
        # and UnicodeDecodeError are ValueErrors) must not crash app start-up;
        # fall back to an empty conversation instead.
        return []
    # Callers iterate the result as a list of message dicts, so reject any
    # other top-level JSON value.
    return history if isinstance(history, list) else []
25
+
26
def save_chat_history(history, path=None):
    """Persist the chat history as JSON, replacing the file atomically.

    The data is first written to a temporary file in the same directory and
    then moved over the target with ``os.replace``. If serialisation fails, or
    the process dies mid-write, the previously saved history stays intact
    instead of being left truncated.

    Args:
        history: JSON-serialisable chat history (a list of message dicts).
        path: Optional file to write instead of the module-level
            ``CHAT_HISTORY_FILE`` (the default when ``None``).

    Raises:
        TypeError: If ``history`` is not JSON-serialisable.
        OSError: If the file cannot be written.
    """
    target = Path(CHAT_HISTORY_FILE if path is None else path)
    # The temp file must live on the same filesystem as the target so that
    # os.replace is a rename rather than a copy.
    fd, tmp_name = tempfile.mkstemp(
        dir=str(target.parent), prefix=target.name + ".", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(history, f)
        os.replace(tmp_name, target)
    except BaseException:
        # Do not leave a stray temp file behind; the original error is re-raised.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise
29
+
30
def preprocess_excel(
    file_path: str,
    sheet_name: str = 'Data Base',
    header_row: int = 4,
) -> pd.DataFrame:
    """Read one worksheet and return its data with the header row promoted.

    The sheet is read without a header; the row at position ``header_row``
    supplies the column labels and everything after it is data. Rows and
    columns that are entirely empty are dropped and the index is renumbered
    from zero.

    Args:
        file_path: Path of the workbook to read.
        sheet_name: Worksheet to load.
        header_row: Zero-based position of the row holding the column names.

    Returns:
        The cleaned DataFrame (possibly with zero rows).

    Raises:
        ValueError: If the sheet has no row at ``header_row``.
    """
    df_raw = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    if len(df_raw) <= header_row:
        # Without this guard the header lookup fails with an opaque
        # "single positional indexer is out-of-bounds" IndexError.
        raise ValueError(
            f"Sheet '{sheet_name}' has only {len(df_raw)} row(s); "
            f"expected the header on row {header_row + 1}."
        )

    header = df_raw.iloc[header_row]
    # Copy the data rows so later operations never act on a view of df_raw.
    df = df_raw.iloc[header_row + 1:].copy()
    df.columns = header
    df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
    return df
39
+
40
def build_vectorstore_from_dataframe(df: pd.DataFrame):
    """Embed every row of ``df`` and index the result in a FAISS store.

    Each row becomes one text of the form ``"cell | cell | ..."`` (missing
    values rendered as empty strings). Texts are split into chunks of at most
    1000 characters with 150 characters of overlap and embedded on CPU with
    ``sentence-transformers/all-MiniLM-l6-v2``. The input DataFrame is not
    modified.

    Args:
        df: Table whose rows should become searchable.

    Returns:
        The FAISS vector store built from the row chunks.

    Raises:
        ValueError: If ``df`` has no rows or no columns.
    """
    if df.empty:
        # FAISS cannot build an index from zero documents; fail with a message
        # the UI can show instead of an internal indexing error.
        raise ValueError("The sheet contains no data rows to index.")

    # fillna returns a new frame, so the caller's DataFrame stays untouched.
    filled = df.fillna("")
    row_texts = [
        ' | '.join(str(cell) for cell in row)
        for row in filled.itertuples(index=False, name=None)
    ]
    # DataFrameLoader turns every non-content column into document metadata.
    # RetrievalQAWithSourcesChain's default document prompt formats each chunk
    # with a `source` metadata field, so every document must carry one.
    corpus = pd.DataFrame({
        'combined_text': row_texts,
        'source': [f"row {n}" for n in range(1, len(row_texts) + 1)],
    })
    docs_loader = DataFrameLoader(corpus, page_content_column='combined_text')
    documents = docs_loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )
    return FAISS.from_documents(split_docs, embeddings)
54
+
55
def create_qa_pipeline(vectorstore):
    """Build a question-answering chain with sources over ``vectorstore``.

    Loads the ``google/flan-t5-base`` tokenizer and seq2seq model, wraps them
    in a ``text2text-generation`` pipeline capped at ``max_length=512``, and
    connects that LLM to the vector store's default retriever through
    ``RetrievalQAWithSourcesChain``.

    Args:
        vectorstore: Store exposing ``as_retriever()``.

    Returns:
        The configured ``RetrievalQAWithSourcesChain``.
    """
    checkpoint = "google/flan-t5-base"
    # Keyword arguments are evaluated left to right: tokenizer first, then model.
    text_generator = pipeline(
        "text2text-generation",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        max_length=512,
    )
    return RetrievalQAWithSourcesChain.from_llm(
        llm=HuggingFacePipeline(pipeline=text_generator),
        retriever=vectorstore.as_retriever(),
    )
64
+
65
# ---------------------------------------------------------------------------
# Streamlit page. Streamlit re-executes this script from top to bottom on
# every interaction, so expensive work is cached in st.session_state.
# NOTE(review): the original indentation was lost; the nesting below assumes
# the trailing `else` pairs with `if uploaded_file is not None` -- confirm.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
st.title("📊 Excel-Aware RAG Chatbot (Professional QA)")

with st.sidebar:
    uploaded_file = st.file_uploader(
        "Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)",
        type=["xlsx", "xlsm"],
    )
    if st.button("🗑️ Clear Chat History"):
        st.session_state.chat_history = []
        # missing_ok avoids the exists()/unlink() race.
        CHAT_HISTORY_FILE.unlink(missing_ok=True)
        st.rerun()

if "chat_history" not in st.session_state:
    st.session_state.chat_history = load_chat_history()

if uploaded_file is not None:
    # Rebuild the index and reload the model only when a different file is
    # uploaded, not on every rerun (i.e. not on every chat message).
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("qa_file_key") != file_key:
        st.session_state.qa_file_key = file_key
        st.session_state.qa_chain = None
        st.session_state.qa_error = None
        with st.spinner("Processing and indexing your Excel sheet..."):
            # preprocess_excel takes a path, so spill the upload to disk,
            # keeping the real extension of the uploaded file.
            suffix = Path(uploaded_file.name).suffix or ".xlsx"
            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
            try:
                # Close the handle before reading so the file can be reopened
                # on every platform.
                with tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                cleaned_df = preprocess_excel(tmp_file.name)
                vectorstore = build_vectorstore_from_dataframe(cleaned_df)
                st.session_state.qa_chain = create_qa_pipeline(vectorstore)
            except Exception as e:
                # Top-level UI boundary: report the failure, do not crash.
                st.session_state.qa_error = str(e)
            finally:
                # Runs even when the write itself fails, so nothing leaks.
                os.remove(tmp_file.name)

    qa = st.session_state.qa_chain
    if qa is not None:
        st.success("✅ File processed and chatbot ready! Ask your questions below.")
    else:
        st.error(f"❌ Error processing file: {st.session_state.qa_error}")

    for message in st.session_state.chat_history:
        avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
        st.chat_message(message["role"], avatar=avatar).markdown(message["content"])

    # With no usable chain the input is disabled, so a question can never
    # reach an undefined `qa`.
    user_prompt = st.chat_input(
        "Ask about inspections, delays, backlog...",
        disabled=qa is None,
    )

    if user_prompt:
        st.session_state.chat_history.append({"role": "user", "content": user_prompt})
        st.chat_message("user", avatar=USER_AVATAR).markdown(user_prompt)

        with st.chat_message("assistant", avatar=BOT_AVATAR):
            with st.spinner("Searching and generating..."):
                try:
                    response = qa.invoke({"question": user_prompt})
                    final_response = response['answer']
                    sources = response.get('sources', '')

                    # Word-by-word reveal with a cursor glyph, then the final
                    # answer in bold.
                    placeholder = st.empty()
                    streamed = ""
                    for word in final_response.split():
                        streamed += word + " "
                        placeholder.markdown(streamed + "▌")
                    placeholder.markdown(f"**{final_response.strip()}**")

                    if sources:
                        # `sources` is model output derived from the uploaded
                        # file; render it as plain caption text, not raw HTML.
                        st.caption(f"📎 {sources}")

                    st.session_state.chat_history.append(
                        {"role": "assistant", "content": final_response}
                    )
                    save_chat_history(st.session_state.chat_history)
                except Exception as e:
                    # Top-level UI boundary: show the failure in the chat.
                    st.error(f"❌ Error: {e}")
else:
    st.info("Upload a file on the left to get started.")