|
""" |
|
Load and parse files (pdf) in the "data/documents" and save cached pkl files. |
|
It will load and parse files and save 4 caches: |
|
1. "docs.pkl" for loaded text documents |
|
2. "docs_chunks.pkl" for chunked text |
|
3. "docstore.pkl" for small-to-big retriever |
|
4. "faiss_index" for the FAISS vector store
|
""" |
|
|
|
import os |
|
import pickle |
|
|
|
from dotenv import load_dotenv |
|
from huggingface_hub import login |
|
from documents import load_pdf_as_docs, get_doc_chunks |
|
from embeddings import get_jinaai_embeddings |
|
|
|
|
|
|
|
# Load environment variables (e.g. the Hugging Face token) from a local .env file.
load_dotenv()



# Read the token from the environment; raises KeyError if it is missing,
# which fails fast before any expensive document loading happens.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# Authenticate against the Hugging Face Hub (needed for the embedding model).
login(HUGGINGFACEHUB_API_TOKEN)
|
|
|
|
|
def save_to_pickle(obj, filename):
    """Serialize *obj* to *filename* using the highest pickle protocol."""
    with open(filename, "wb") as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
|
|
# Paths for the cache directory and the source PDF documents.
database_root = "./data/db"
document_path = "./data/documents"


# 1. Load PDFs as text documents and cache them.
docs = load_pdf_as_docs(document_path)
save_to_pickle(docs, os.path.join(database_root, "docs.pkl"))


# 2. Chunk the documents and cache the chunks.
# BUG FIX: previously the un-chunked `docs` object was pickled here,
# so "docs_chunks.pkl" never actually contained the chunks.
document_chunks = get_doc_chunks(docs)
save_to_pickle(document_chunks, os.path.join(database_root, "docs_chunks.pkl"))

# 3. Build the embedding model (device auto-selected).
embeddings = get_jinaai_embeddings(device="auto")


# Deferred import: keeps the heavy vector-store dependency out of the
# earlier, cheaper steps.
from vectorestores import get_faiss_vectorestore

vectorstore = get_faiss_vectorestore(embeddings)


# 4. Build the small-to-big (parent-document) retriever; with the save
# flags enabled this persists both the FAISS index and the docstore
# under `database_root`.
from retrievers import get_parent_doc_retriever


parent_doc_retriever = get_parent_doc_retriever(
    docs,
    vectorstore,
    save_path_root=database_root,
    save_vectorstore=True,
    save_docstore=True,
)
|
|