amiguel committed on
Commit
9dd1541
·
verified ·
1 Parent(s): 29fca42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -66
app.py CHANGED
@@ -1,73 +1,8 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import tempfile
4
- import os
5
-
6
- from langchain.document_loaders import DataFrameLoader
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- from langchain.embeddings import HuggingFaceEmbeddings
9
- from langchain.vectorstores import FAISS
10
- from langchain.chains import RetrievalQA
11
- from langchain import HuggingFacePipeline
12
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
13
-
14
- def preprocess_excel(file_path: str) -> pd.DataFrame:
15
- df_raw = pd.read_excel(file_path, sheet_name='Data Base', header=None)
16
- df = df_raw.iloc[4:].copy()
17
- df.columns = df.iloc[0]
18
- df = df[1:]
19
- df.dropna(how='all', inplace=True)
20
- df.dropna(axis=1, how='all', inplace=True)
21
- df.reset_index(drop=True, inplace=True)
22
- return df
23
-
24
- def build_vectorstore_from_dataframe(df: pd.DataFrame):
25
- df.fillna("", inplace=True)
26
- df['combined_text'] = df.apply(lambda row: ' | '.join([str(cell) for cell in row]), axis=1)
27
-
28
- docs_loader = DataFrameLoader(df[['combined_text']], page_content_column='combined_text')
29
- documents = docs_loader.load()
30
-
31
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
32
- split_docs = splitter.split_documents(documents)
33
-
34
- embeddings = HuggingFaceEmbeddings(
35
- model_name="sentence-transformers/all-MiniLM-l6-v2",
36
- model_kwargs={"device": "cpu"},
37
- encode_kwargs={"normalize_embeddings": False}
38
- )
39
- vectorstore = FAISS.from_documents(split_docs, embeddings)
40
- return vectorstore
41
-
42
- def create_qa_pipeline(vectorstore):
43
- model_id = "google/flan-t5-base"
44
- tokenizer = AutoTokenizer.from_pretrained(model_id)
45
- model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
46
-
47
- gen_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
48
- llm = HuggingFacePipeline(pipeline=gen_pipeline)
49
-
50
- retriever = vectorstore.as_retriever()
51
- qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff", return_source_documents=False)
52
- return qa
53
-
54
- st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
55
- st.title("📊 Excel-Aware RAG Chatbot (Professional QA)")
56
-
57
- with st.sidebar:
58
- uploaded_file = st.file_uploader("Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)", type=["xlsx", "xlsm"])
59
-
60
- if uploaded_file is not None:
61
- with st.spinner("Processing and indexing your Excel sheet..."):
62
- with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsm") as tmp_file:
63
- tmp_file.write(uploaded_file.read())
64
- tmp_path = tmp_file.name
65
-
66
  try:
67
  cleaned_df = preprocess_excel(tmp_path)
68
  vectorstore = build_vectorstore_from_dataframe(cleaned_df)
69
  qa = create_qa_pipeline(vectorstore)
70
- st.success(" File processed and chatbot ready! Ask your questions below.")
71
 
72
  if "chat_history" not in st.session_state:
73
  st.session_state.chat_history = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  try:
2
  cleaned_df = preprocess_excel(tmp_path)
3
  vectorstore = build_vectorstore_from_dataframe(cleaned_df)
4
  qa = create_qa_pipeline(vectorstore)
5
+ st.success("✅ File processed and chatbot ready! Ask your questions below.")
6
 
7
  if "chat_history" not in st.session_state:
8
  st.session_state.chat_history = []