import gradio as gr
import chromadb
import os
import tempfile

# Classic LangChain import paths; newer LangChain releases expose these
# classes under langchain_community instead.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
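
# Module-level holder for the vector store built by process_pdf(). This is
# an assumed fix: retrieval must reuse the same in-memory index, because a
# fresh chromadb.Client() starts empty on every call and would never contain
# the chunks stored during processing.
VECTORSTORE = None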

def process_pdf(file_binary):
    """Load an uploaded PDF, split it into chunks, and index it in ChromaDB."""
    global VECTORSTORE
    log = []
    status_message = ""
    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."
    try:
        log.append("Starting PDF upload and processing...")

        # Write the uploaded PDF bytes to a temporary file so PyPDFLoader
        # can read it from disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")

        # Load and extract text from the PDF.
        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")
        finally:
            os.remove(temp_path)  # clean up the temporary file

        # Split the text into overlapping chunks for embedding.
        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")

        # Embed the chunks and store them in an in-memory (ephemeral) Chroma
        # client. The resulting vector store is kept in the module-level
        # VECTORSTORE so retrieve_context() can query the same index.
        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()  # in-memory, no local storage
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            VECTORSTORE = Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client,
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")

        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)
    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    return status_message, "\n".join(log)

def retrieve_context(query):
    """Return the chunks most similar to the query from the indexed PDF."""
    log = []
    if not query:
        return "Error: No query provided."
    if VECTORSTORE is None:
        return "No PDF has been processed yet. Upload and process a PDF first."
    try:
        log.append("Retrieving context from in-memory ChromaDB...")
        # Query the vector store built by process_pdf(). Re-initializing a
        # fresh in-memory client here would start from an empty database,
        # so the shared instance must be reused.
        results = VECTORSTORE.similarity_search(query, k=3)
        if results:
            log.append(f"Found {len(results)} matching chunk(s).")
            return "\n\n".join(doc.page_content for doc in results)
        log.append("No matching context found in the current in-memory DB.")
        return "No relevant context found. Have you processed a PDF yet?"
    except Exception as e:
        log.append(f"Error retrieving context: {str(e)}")
        return "\n".join(log)

with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")

    with gr.Row():
        # type="binary" delivers the uploaded file's raw bytes to the handler
        pdf_upload = gr.File(label="Upload PDF", type="binary")
        process_button = gr.Button("Process PDF")

    output_text = gr.Textbox(label="Processing Status")
    log_output = gr.Textbox(label="Log Output", interactive=False)

    # Outputs: [status_message, log_output]
    process_button.click(
        fn=process_pdf,
        inputs=pdf_upload,
        outputs=[output_text, log_output],
    )

    query_input = gr.Textbox(label="Enter your query")
    retrieve_button = gr.Button("Retrieve Context")
    context_output = gr.Textbox(label="Retrieved Context")

    retrieve_button.click(
        fn=retrieve_context,
        inputs=query_input,
        outputs=context_output,
    )

demo.launch()