File size: 1,863 Bytes
646f8c2
1a20a59
 
 
 
 
 
646f8c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a20a59
 
646f8c2
 
 
 
 
1a20a59
646f8c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Load and parse the PDF files in "data/documents" and save the results as cached files.
Four caches are written:
1. "docs.pkl" for the loaded text documents
2. "docs_chunks.pkl" for the chunked text
3. "docstore.pkl" for the small-to-big retriever
4. faiss_index for the FAISS vector store
"""

import os
import pickle

from dotenv import load_dotenv
from huggingface_hub import login
from documents import load_pdf_as_docs, get_doc_chunks
from embeddings import get_jinaai_embeddings


# Populate environment variables (via python-dotenv) before anything reads them
load_dotenv()

# Authenticate against the Hugging Face Hub so the embedding model can be
# downloaded. The token is mandatory: a missing variable raises KeyError here.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
login(token=HUGGINGFACEHUB_API_TOKEN)


def save_to_pickle(obj, filename):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so existing content is
    overwritten. The highest pickle protocol available to the running
    interpreter is used.

    Args:
        obj: Any picklable object.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "wb") as handle:
        pickler = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Build the caches in sequence: parsed documents -> text chunks ->
# embeddings -> FAISS vectorstore -> parent-document retriever.

# Set database path, should be same as defined in "app.py"
database_root = "./data/db"
document_path = "./data/documents"

# Create the output directory up front so the pickle writes below do not fail
# with FileNotFoundError when the directory does not exist yet.
os.makedirs(database_root, exist_ok=True)

# Parse pdf as "Documents" instances and save as "docs.pkl"
docs = load_pdf_as_docs(document_path)
save_to_pickle(docs, os.path.join(database_root, "docs.pkl"))

# Get text chunks and save as "docs_chunks.pkl".
# The chunks themselves are pickled here (not `docs` again), so this cache
# actually holds the chunked text.
document_chunks = get_doc_chunks(docs)
save_to_pickle(document_chunks, os.path.join(database_root, "docs_chunks.pkl"))

embeddings = get_jinaai_embeddings(device="auto")

# Create and save vectorstore
from vectorestores import get_faiss_vectorestore

vectorstore = get_faiss_vectorestore(embeddings)

# Create retrievers
from retrievers import get_parent_doc_retriever

# Get parent doc (small-to-big) retriever and save as "docstore.pkl".
# The retriever is built from the full `docs`, not from `document_chunks`.
# NOTE(review): presumably the save_* flags persist the vectorstore and
# docstore under `save_path_root` - confirm in retrievers.py.
parent_doc_retriever = get_parent_doc_retriever(
    docs,
    vectorstore,
    save_path_root=database_root,
    save_vectorstore=True,
    save_docstore=True,
)