"""PDF ingestion and similarity retrieval backed by a Qdrant vector store."""

import os
from functools import lru_cache
from typing import List

import fitz  # PyMuPDF
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from langchain_huggingface import HuggingFaceEmbeddings

from config import EMBEDDING_MODEL, QDRANT_HOST, QDRANT_API_KEY

# Built once at import time and shared by every vector-store call below.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the plain text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages, in page order, separated by ``"\\n"``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)


def load_pdf_data(pdf_path: str) -> List[Document]:
    """Split a PDF's text into overlapping chunks wrapped as ``Document``s.

    Args:
        pdf_path: Filesystem path of the PDF to load.

    Returns:
        One ``Document`` per chunk; each carries ``{"source": pdf_path}`` as
        metadata.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    text = extract_text_from_pdf(pdf_path)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
    )
    return [
        Document(page_content=chunk, metadata={"source": pdf_path})
        for chunk in splitter.split_text(text)
    ]


@lru_cache(maxsize=1)
def get_vector_db() -> Qdrant:
    """Return the Qdrant vector store populated from ``data/Explorer.pdf``.

    The store is built on the first call and then reused for the lifetime of
    the process. Without the cache, every query would re-read the PDF,
    re-embed every chunk and upload the documents to the collection again.
    A failed build is not cached, so a later call retries.

    Returns:
        A ``Qdrant`` vector store bound to the ``discvr_embeddings``
        collection.

    Raises:
        FileNotFoundError: If ``data/Explorer.pdf`` does not exist.
    """
    docs = load_pdf_data("data/Explorer.pdf")
    return Qdrant.from_documents(
        docs,
        embedding_model,
        location=QDRANT_HOST,
        collection_name="discvr_embeddings",
        api_key=QDRANT_API_KEY,
        timeout=500,
    )


def retrieve_info(query, k=20):
    """Return the text of the ``k`` chunks most similar to ``query``.

    Args:
        query: Search text passed to the vector store's similarity search.
        k: Maximum number of chunks to retrieve.

    Returns:
        The ``page_content`` of the matching documents, joined by newlines.
    """
    vector_db = get_vector_db()
    docs = vector_db.similarity_search(query, k=k)
    return "\n".join(doc.page_content for doc in docs)