File size: 1,863 Bytes
646f8c2 1a20a59 646f8c2 1a20a59 646f8c2 1a20a59 646f8c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
"""
Load and parse the PDF files in "data/documents" and cache the results as pickle files.
Running this script produces 4 caches:
1. "docs.pkl" for loaded text documents
2. "docs_chunks.pkl" for chunked text
3. "docstore.pkl" for small-to-big retriever
4. faiss_index for FAISS vector store
"""
import os
import pickle
from dotenv import load_dotenv
from huggingface_hub import login
from documents import load_pdf_as_docs, get_doc_chunks
from embeddings import get_jinaai_embeddings
# Load and set env variables (load_dotenv comes from the python-dotenv package)
load_dotenv()

# Set huggingface api for downloading embedding model.
# Indexing os.environ (rather than using .get) raises KeyError right here if
# HUGGINGFACEHUB_API_TOKEN is not defined, before any document processing starts.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
login(HUGGINGFACEHUB_API_TOKEN)
def save_to_pickle(obj, filename):
    """Save obj to disk using pickle.

    Missing parent directories of ``filename`` are created first, so the
    cache can be written on a fresh checkout where the target folder does
    not exist yet.

    Args:
        obj: Any picklable object.
        filename: Destination file path (str or path-like). An existing
            file at this path is overwritten.

    Raises:
        OSError: If the directory or file cannot be created or written.
        pickle.PicklingError: If ``obj`` cannot be pickled.
    """
    # open(..., "wb") does not create intermediate directories by itself.
    # dirname is "" for a bare file name, and os.makedirs("") would raise.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "wb") as file:
        pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Set database path, should be same as defined in "app.py"
database_root = "./data/db"
document_path = "./data/documents"

# Make sure the cache folder exists before anything is written into it;
# opening a file for writing does not create missing directories.
os.makedirs(database_root, exist_ok=True)

# Parse pdf as "Documents" instances and save as "docs.pkl"
docs = load_pdf_as_docs(document_path)
save_to_pickle(docs, os.path.join(database_root, "docs.pkl"))

# Get text chunks and save as "docs_chunks.pkl".
# The chunked output is what must be pickled here: writing `docs` again would
# make "docs_chunks.pkl" a duplicate of "docs.pkl" and drop the chunks.
document_chunks = get_doc_chunks(docs)
save_to_pickle(document_chunks, os.path.join(database_root, "docs_chunks.pkl"))
# Build the embedding model used to index the documents.
# device="auto" is forwarded to the project helper as-is
# (presumably lets it choose CPU/GPU itself; confirm in embeddings.py).
embeddings = get_jinaai_embeddings(device="auto")

# Create and save vectorstore
# NOTE(review): this import sits mid-script rather than at the top of the file;
# the reason is not visible here, so the position is left unchanged.
from vectorestores import get_faiss_vectorestore

# Only the vector store object is created on this line; nothing in this script
# writes it to disk directly. Presumably the save happens inside
# get_parent_doc_retriever via save_vectorstore=True; confirm in retrievers.py.
vectorstore = get_faiss_vectorestore(embeddings)

# Create retrievers
from retrievers import get_parent_doc_retriever

# Get parent doc (small-to-big) retriever and save as "docstore.pkl"
# The full documents (`docs`, not the chunks) are passed in together with the
# vector store; both save flags are enabled and outputs go under database_root.
parent_doc_retriever = get_parent_doc_retriever(
    docs,
    vectorstore,
    save_path_root=database_root,
    save_vectorstore=True,
    save_docstore=True,
)
|