amiguel committed on
Commit
29fca42
·
verified ·
1 Parent(s): 83f193f

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +17 -13
  2. app.py +95 -0
  3. requirements.txt +9 -0
README.md CHANGED
@@ -1,13 +1,17 @@
1
- ---
2
- title: RAG
3
- emoji: 😻
4
- colorFrom: green
5
- colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.44.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
+ # Excel-Aware RAG Chatbot
2
+
3
+ This Streamlit app lets you upload an Excel file (with a 'Data Base' sheet), processes the data into a retrievable vector database, and allows question answering using a RAG pipeline powered by `flan-t5-base`.
4
+
5
+ ## Features
6
+ - Upload `.xlsm` or `.xlsx` files
7
+ - Automatically cleans and processes the 'Data Base' sheet
8
+ - Embeds entries using `MiniLM` embeddings
9
+ - Uses `flan-t5-base` for fast CPU-friendly QA
10
+ - Works on Hugging Face Spaces (L4 hardware)
11
+
12
+ ## Run Locally
13
+
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ streamlit run app.py
17
+ ```
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import tempfile
4
+ import os
5
+
6
+ from langchain.document_loaders import DataFrameLoader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.embeddings import HuggingFaceEmbeddings
9
+ from langchain.vectorstores import FAISS
10
+ from langchain.chains import RetrievalQA
11
+ from langchain import HuggingFacePipeline
12
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
13
+
14
def preprocess_excel(
    file_path: str,
    *,
    sheet_name: str = 'Data Base',
    header_row: int = 4,
) -> pd.DataFrame:
    """Load one worksheet and return its data rows as a cleaned DataFrame.

    The sheet is read without a header. The row at position ``header_row``
    (0-based) supplies the column names, and everything above it is discarded.
    Rows and columns that are entirely empty are dropped and the index is
    renumbered from 0.

    Args:
        file_path: Path to the workbook on disk.
        sheet_name: Worksheet to read. Defaults to ``'Data Base'``.
        header_row: 0-based position of the row holding the column names.
            Defaults to 4, i.e. the fifth row of the sheet.

    Returns:
        A new DataFrame containing only the data rows below the header.

    Raises:
        ValueError: If the sheet has too few rows to contain the header row.
    """
    df_raw = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

    # Fail with an actionable message instead of the IndexError that
    # ``iloc`` would raise on a sheet that is too short.
    if len(df_raw) <= header_row:
        raise ValueError(
            f"Sheet {sheet_name!r} has only {len(df_raw)} row(s); "
            f"expected column names on row {header_row + 1}."
        )

    header = df_raw.iloc[header_row]
    df = df_raw.iloc[header_row + 1:].copy()
    # Build the column index from plain values so it does not inherit the
    # header row's positional label as its name.
    df.columns = pd.Index(header.tolist())

    # Non-inplace calls: each step yields a fresh frame, so nothing is ever
    # modified through a slice of another frame.
    df = df.dropna(how='all').dropna(axis=1, how='all')
    return df.reset_index(drop=True)
23
+
24
def build_vectorstore_from_dataframe(df: pd.DataFrame):
    """Embed every row of ``df`` and index the result in a FAISS vector store.

    Each row is flattened into one string by joining its cells with ``' | '``
    (missing values become empty strings). The strings are split into chunks
    of at most 1000 characters with a 150-character overlap, embedded on CPU
    with ``sentence-transformers/all-MiniLM-l6-v2``, and stored in FAISS.

    The input DataFrame is left untouched.

    Args:
        df: Table whose rows should become retrievable documents.

    Returns:
        The FAISS vector store built from the row chunks.

    Raises:
        ValueError: If ``df`` has no rows or no columns, so there is nothing
            to index.
    """
    # Guard first: an empty frame would otherwise fail further down with an
    # error that says nothing about the real cause.
    if df.empty:
        raise ValueError("The DataFrame has no data rows to index.")

    # Work on a filled copy so the caller's frame keeps its NaNs and does
    # not gain an extra column as a side effect.
    filled = df.fillna("")
    combined_text = filled.apply(
        lambda row: ' | '.join(str(cell) for cell in row), axis=1
    )

    docs_loader = DataFrameLoader(
        combined_text.to_frame('combined_text'),
        page_content_column='combined_text',
    )
    documents = docs_loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )
    return FAISS.from_documents(split_docs, embeddings)
41
+
42
def create_qa_pipeline(vectorstore):
    """Build a RetrievalQA chain that answers questions from ``vectorstore``.

    Loads the ``google/flan-t5-base`` tokenizer and seq2seq model, wraps them
    in a ``text2text-generation`` pipeline capped at ``max_length=512``, and
    connects that generator to the store's default retriever using the
    ``"stuff"`` chain type. Source documents are not returned with answers.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The configured RetrievalQA chain.
    """
    checkpoint = "google/flan-t5-base"
    t5_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    t5_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    generator = pipeline(
        "text2text-generation",
        model=t5_model,
        tokenizer=t5_tokenizer,
        max_length=512,
    )

    return RetrievalQA.from_chain_type(
        llm=HuggingFacePipeline(pipeline=generator),
        retriever=vectorstore.as_retriever(),
        chain_type="stuff",
        return_source_documents=False,
    )
53
+
54
# ---------------------------------------------------------------------------
# Streamlit page: upload a workbook, index it once, then chat against it.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
st.title("📊 Excel-Aware RAG Chatbot (Professional QA)")

with st.sidebar:
    uploaded_file = st.file_uploader(
        "Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)",
        type=["xlsx", "xlsm"],
    )

if uploaded_file is None:
    st.info("Upload a file to get started.")
else:
    # Streamlit re-executes this script on every interaction, including each
    # chat message. Keep the built QA chain in session state, keyed by the
    # upload, so the workbook is parsed/embedded and the model is loaded only
    # once per file instead of once per question.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("qa_file_key") != file_key:
        # A different (or first) upload: drop whatever belonged to the old one.
        st.session_state.pop("qa", None)
        st.session_state.pop("qa_file_key", None)

        with st.spinner("Processing and indexing your Excel sheet..."):
            # Preserve the real extension rather than always claiming .xlsm.
            suffix = os.path.splitext(uploaded_file.name)[1] or ".xlsx"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_path = tmp_file.name

            try:
                cleaned_df = preprocess_excel(tmp_path)
                vectorstore = build_vectorstore_from_dataframe(cleaned_df)
                st.session_state.qa = create_qa_pipeline(vectorstore)
                st.session_state.qa_file_key = file_key
                # Old answers refer to the previous workbook.
                st.session_state.chat_history = []
            except Exception as e:
                # Top-level UI boundary: report the failure instead of crashing.
                st.error(f"Error processing file: {e}")
            finally:
                # The temp file is only needed while the workbook is parsed.
                os.remove(tmp_path)

    qa = st.session_state.get("qa")
    if qa is not None:
        st.success("✅ File processed and chatbot ready! Ask your questions below.")

        if "chat_history" not in st.session_state:
            st.session_state.chat_history = []

        with st.chat_message("assistant"):
            st.markdown("How can I help you with the inspection data?")

        # Replay earlier exchanges; without this they vanish on every rerun.
        for past_prompt, past_answer in st.session_state.chat_history:
            st.chat_message("user").markdown(past_prompt)
            st.chat_message("assistant").markdown(f"**Answer:** {past_answer}")

        user_prompt = st.chat_input(
            "Ask a question like 'How many backlog items are marked Yes?' or 'List overdue inspections'."
        )

        if user_prompt:
            st.chat_message("user").markdown(user_prompt)
            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    try:
                        answer = qa.run(user_prompt)
                    except Exception as e:
                        st.error(f"Error: {e}")
                    else:
                        st.markdown(f"**Answer:** {answer}")
                        st.session_state.chat_history.append((user_prompt, answer))
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ openpyxl
4
+ langchain
5
+ langchain-community
6
+ sentence-transformers
7
+ transformers
8
+ torch
9
+ faiss-cpu