ExplrChatbot / chatbot /retrieval.py
anhkhoiphan's picture
Update chatbot/retrieval.py
6213ab1 verified
import functools
import os

import fitz # PyMuPDF
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from langchain_huggingface import HuggingFaceEmbeddings

from config import EMBEDDING_MODEL,QDRANT_HOST,QDRANT_API_KEY
# Shared embedding model, constructed once when this module is imported from
# the EMBEDDING_MODEL name in config. get_vector_db() passes it to Qdrant to
# embed the PDF chunks.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The ``"text"``-mode extraction of each page, in page order, with a
        newline between consecutive pages.

    Raises:
        FileNotFoundError: If nothing exists at ``pdf_path``.
    """
    # Checked up front so callers always get the builtin FileNotFoundError
    # with a clear message, independent of what PyMuPDF raises on its own.
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    # The context manager closes the document (and its underlying file
    # handle) even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)
def load_pdf_data(pdf_path, *, chunk_size=2000, chunk_overlap=100):
    """Read a PDF and split its text into ``Document`` chunks.

    Args:
        pdf_path: Filesystem path of the PDF to load.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 2000, the value previously hard-coded.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 100, the value previously hard-coded.

    Returns:
        A list of ``Document`` objects, one per chunk, each carrying
        ``{"source": pdf_path}`` as metadata.

    Raises:
        FileNotFoundError: Propagated from ``extract_text_from_pdf`` when
            ``pdf_path`` does not exist.
    """
    text = extract_text_from_pdf(pdf_path)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Every chunk records the originating file so a search hit can be traced
    # back to the PDF it came from.
    return [
        Document(page_content=chunk, metadata={"source": pdf_path})
        for chunk in splitter.split_text(text)
    ]
@functools.lru_cache(maxsize=1)
def get_vector_db():
    """Return the Qdrant vector store built from ``data/Explorer.pdf``.

    Building the store loads and chunks the PDF and hands every chunk to
    ``Qdrant.from_documents`` together with the shared embedding model, so
    the result is memoized: the first successful call does the work and
    later calls in the same process reuse that instance. A call that raises
    is not cached and will be retried on the next call. Use
    ``get_vector_db.cache_clear()`` to force a rebuild.

    Returns:
        The ``Qdrant`` vector store for the ``discvr_embeddings`` collection.

    Raises:
        FileNotFoundError: Propagated from ``load_pdf_data`` when the PDF is
            missing.
    """
    # NOTE(review): the collection is still (re)populated once per process
    # start; whether that appends to or replaces existing points depends on
    # the installed LangChain version -- confirm before relying on it.
    collection_name = "discvr_embeddings"
    docs = load_pdf_data("data/Explorer.pdf")
    return Qdrant.from_documents(
        docs,
        embedding_model,
        location=QDRANT_HOST,
        collection_name=collection_name,
        api_key=QDRANT_API_KEY,
        timeout=500,
    )
def retrieve_info(query, k=20):
    """Return the text of the ``k`` chunks most similar to ``query``.

    Args:
        query: Search text passed to the vector store's similarity search.
        k: Number of results requested from the search. Defaults to 20.

    Returns:
        The ``page_content`` of each returned document, in the order the
        search returned them, joined with newlines.
    """
    matches = get_vector_db().similarity_search(query, k=k)
    contents = [match.page_content for match in matches]
    return "\n".join(contents)