File size: 5,836 Bytes
af7a4c4 6b5fad9 af7a4c4 d7c0178 af7a4c4 d7c0178 af7a4c4 d7c0178 a4869da d7c0178 a4869da d7c0178 2bdefb3 af7a4c4 d7c0178 03bf821 af7a4c4 d7c0178 af7a4c4 d7c0178 af7a4c4 d7c0178 af7a4c4 d7c0178 af7a4c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import streamlit as st
import pandas as pd
import tempfile
import os
import json
from pathlib import Path
from langchain.schema import Document
#from langchain.document_loaders import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
CHAT_HISTORY_FILE = Path("chat_memory.json")
def load_chat_history():
if CHAT_HISTORY_FILE.exists():
with open(CHAT_HISTORY_FILE, "r") as f:
return json.load(f)
return []
def save_chat_history(history):
with open(CHAT_HISTORY_FILE, "w") as f:
json.dump(history, f)
def preprocess_excel(file_path: str) -> pd.DataFrame:
df_raw = pd.read_excel(file_path, sheet_name='Data Base', header=None)
df = df_raw.iloc[4:].copy()
df.columns = df.iloc[0]
df = df[1:]
df.dropna(how='all', inplace=True)
df.dropna(axis=1, how='all', inplace=True)
df.reset_index(drop=True, inplace=True)
df.columns = df.columns.astype(str)
return df
def build_vectorstore_from_structured_records(df: pd.DataFrame):
df.fillna("", inplace=True)
records = []
for i, row in df.iterrows():
item_class = str(row.get("Item Class", "")).strip()
job_done = str(row.get("Job Done", "")).strip()
backlog = str(row.get("Backlog?", "")).strip()
days = str(row.get("Days in Backlog", "")).strip()
if not any([item_class, job_done, backlog, days]):
continue
sentence = f"Item Class {item_class} has status {job_done}, is in {backlog} backlog, and has {days} days."
records.append(Document(page_content=sentence, metadata={"source": f"Row {i+1}"}))
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
split_docs = splitter.split_documents(records)
embeddings = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-MiniLM-l6-v2",
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": False}
)
vectorstore = FAISS.from_documents(split_docs, embeddings)
return vectorstore
def create_qa_pipeline(vectorstore):
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
gen_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=gen_pipeline)
retriever = vectorstore.as_retriever()
qa = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
return qa
st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
st.title("π Excel-Aware RAG Chatbot (Structured QA)")
with st.sidebar:
uploaded_file = st.file_uploader("Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)", type=["xlsx", "xlsm"])
if st.button("ποΈ Clear Chat History"):
st.session_state.chat_history = []
if CHAT_HISTORY_FILE.exists():
CHAT_HISTORY_FILE.unlink()
st.rerun()
if "chat_history" not in st.session_state:
st.session_state.chat_history = load_chat_history()
if uploaded_file is not None:
with st.spinner("Processing and indexing your Excel sheet..."):
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsm") as tmp_file:
tmp_file.write(uploaded_file.read())
tmp_path = tmp_file.name
try:
df = preprocess_excel(tmp_path)
vectorstore = build_vectorstore_from_structured_records(df)
qa = create_qa_pipeline(vectorstore)
st.success("β
File processed and chatbot ready! Ask your questions below.")
except Exception as e:
st.error(f"β Error processing file: {e}")
finally:
os.remove(tmp_path)
for message in st.session_state.chat_history:
st.chat_message(message["role"], avatar=USER_AVATAR if message["role"] == "user" else BOT_AVATAR).markdown(message["content"])
user_prompt = st.chat_input("Ask about item classes, backlog, or status...")
if user_prompt:
st.session_state.chat_history.append({"role": "user", "content": user_prompt})
st.chat_message("user", avatar=USER_AVATAR).markdown(user_prompt)
with st.chat_message("assistant", avatar=BOT_AVATAR):
with st.spinner("Thinking..."):
try:
response = qa.invoke({"question": user_prompt})
final_response = response['answer']
sources = response.get('sources', '')
placeholder = st.empty()
streamed = ""
for word in final_response.split():
streamed += word + " "
placeholder.markdown(streamed + "β")
placeholder.markdown(f"**{final_response.strip()}**")
if sources:
st.markdown(f"<sub>π <i>{sources}</i></sub>", unsafe_allow_html=True)
st.session_state.chat_history.append({"role": "assistant", "content": final_response})
save_chat_history(st.session_state.chat_history)
except Exception as e:
st.error(f"β Error: {e}")
else:
st.info("Upload a file on the left to get started.") |