Spaces:
Runtime error
Runtime error
from langchain.vectorstores import Qdrant | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.schema import Document | |
import os | |
import fitz # PyMuPDF | |
from config import EMBEDDING_MODEL,QDRANT_HOST,QDRANT_API_KEY | |
# Embedding model built once at import time from the configured model name;
# get_vector_db() passes this same instance to Qdrant.from_documents().
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A single string holding each page's text in page order, with a
        newline between consecutive pages.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")
    # Open the document as a context manager so its handle is released even
    # if text extraction fails part-way through the pages.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)
def load_pdf_data(pdf_path, chunk_size=2000, chunk_overlap=100):
    """Load a PDF and split its text into chunked ``Document`` objects.

    Args:
        pdf_path: Path of the PDF to load. It is also stored under the
            "source" key of every chunk's metadata.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 2000.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 100.

    Returns:
        A list with one ``Document`` per chunk produced by the splitter.

    Raises:
        FileNotFoundError: Propagated from ``extract_text_from_pdf`` when
            ``pdf_path`` does not exist.
    """
    text = extract_text_from_pdf(pdf_path)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [
        Document(page_content=chunk, metadata={"source": pdf_path})
        for chunk in splitter.split_text(text)
    ]
# Process-wide cache for the store built by get_vector_db(); None until the
# first successful build.
_vector_db = None


def get_vector_db():
    """Return the Qdrant vector store for ``data/Explorer.pdf``, building it once.

    The first call loads and chunks the PDF, then hands the chunks and the
    module-level ``embedding_model`` to ``Qdrant.from_documents`` using the
    configured ``QDRANT_HOST`` and ``QDRANT_API_KEY``. Later calls return the
    same store object instead of repeating that work.

    Returns:
        The vector store object returned by ``Qdrant.from_documents``.

    Raises:
        FileNotFoundError: Propagated from ``load_pdf_data`` when the PDF is
            missing. Nothing is cached in that case, so a later call retries.
    """
    global _vector_db
    if _vector_db is None:
        # Building the store re-reads the PDF and pushes every chunk through
        # from_documents(); doing that on each call (retrieve_info calls this
        # per query) repeats the whole ingestion and may add the same chunks
        # to the collection again, so the result is memoized.
        docs = load_pdf_data("data/Explorer.pdf")
        _vector_db = Qdrant.from_documents(
            docs,
            embedding_model,
            location=QDRANT_HOST,
            collection_name="discvr_embeddings",
            api_key=QDRANT_API_KEY,
            timeout=500,
        )
    return _vector_db
def retrieve_info(query, k=20):
    """Return the text of the ``k`` chunks most similar to ``query``.

    Args:
        query: Text passed to the vector store's ``similarity_search``.
        k: Number of results requested from the search. Defaults to 20.

    Returns:
        The ``page_content`` of each returned document, joined by newlines
        in the order the search returned them.
    """
    matches = get_vector_db().similarity_search(query, k=k)
    passages = (match.page_content for match in matches)
    return "\n".join(passages)