""" Load and parse files (pdf) in the "data/documents" and save cached pkl files. It will load and parse files and save 4 caches: 1. "docs.pkl" for loaded text documents 2. "docs_chunks.pkl" for chunked text 3. "docstore.pkl" for small-to-big retriever 4. faiss_index for FAISS vectore store """ import os import pickle from dotenv import load_dotenv from huggingface_hub import login from documents import load_pdf_as_docs, get_doc_chunks from embeddings import get_jinaai_embeddings # Load and set env variables load_dotenv() # Set huggingface api for downloading embedding model HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"] login(HUGGINGFACEHUB_API_TOKEN) def save_to_pickle(obj, filename): """Save obj to disk using pickle.""" with open(filename, "wb") as file: pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Set database path, should be same as defined in "app.py" database_root = "./data/db" document_path = "./data/documents" # Parse pdf as "Documents" instances and save as "docs.pkl" docs = load_pdf_as_docs(document_path) save_to_pickle(docs, os.path.join(database_root, "docs.pkl")) # Get text chunks and save as "docs_chunks.pkl" document_chunks = get_doc_chunks(docs) save_to_pickle(docs, os.path.join(database_root, "docs_chunks.pkl")) embeddings = get_jinaai_embeddings(device="auto") # Create and save vectorstore from vectorestores import get_faiss_vectorestore vectorstore = get_faiss_vectorestore(embeddings) # Create retrievers from retrievers import get_parent_doc_retriever # Get parent doc (small-to-big) retriever and save as "docstore.pkl" parent_doc_retriever = get_parent_doc_retriever( docs, vectorstore, save_path_root=database_root, save_vectorstore=True, save_docstore=True, )