import streamlit as st
import PyPDF2
import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

st.set_page_config(page_title="📘 PDF RAG QA", layout="wide")

# Custom styles
st.markdown("""
""", unsafe_allow_html=True)

st.title("📘 Ask Me Anything About Machine Learning")
st.caption("Using RAG (Retrieval-Augmented Generation) and a preloaded PDF")

# Load PDF from local file
PDF_FILE = "ml_large_dataset.pdf"

def load_pdf(file_path):
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Guard against pages that yield no extractable text
        return [page.extract_text() or "" for page in reader.pages]

def chunk_text(pages, max_len=1000):
    # Join all pages and split the text into fixed-size word chunks
    text = " ".join(pages)
    words = text.split()
    return [' '.join(words[i:i+max_len]) for i in range(0, len(words), max_len)]

@st.cache_resource
def setup_rag():
    # Build the embedding index and QA pipeline once; Streamlit caches the result
    pages = load_pdf(PDF_FILE)
    chunks = chunk_text(pages)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
    return chunks, model, index, qa

def retrieve_answer(question, chunks, model, index, qa_pipeline, k=6):
    # Embed the question, retrieve the k nearest chunks, and run extractive QA over them
    q_embed = model.encode([question])
    _, I = index.search(np.array(q_embed), k)
    context = "\n\n".join([chunks[i] for i in I[0]])
    result = qa_pipeline(question=question, context=context)
    return result['answer']

chunks, embed_model, faiss_index, qa_model = setup_rag()

st.subheader("💬 Ask a Question")
question = st.text_input("Enter your question:", placeholder="e.g., What is supervised learning?")

if question:
    with st.spinner("🧠 Searching for the answer..."):
        answer = retrieve_answer(question, chunks, embed_model, faiss_index, qa_model)
    st.markdown("#### 📖 Answer:")
    st.write(answer)
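
# Usage note (assumes this file is saved as app.py and that ml_large_dataset.pdf
# sits in the same directory; adjust the names to match your setup):
#
#   pip install streamlit PyPDF2 sentence-transformers faiss-cpu transformers torch
#   streamlit run app.py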