|
""" |
|
Load and parse files (pdf) in the "data/documents" and save cached pkl files. |
|
It will load and parse files and save 4 caches: |
|
1. "docs.pkl" for loaded text documents |
|
2. "docs_chunks.pkl" for chunked text |
|
3. "docstore.pkl" for small-to-big retriever |
|
4. "faiss_index" for the FAISS vector store
|
""" |
|
|
|
import os |
|
import pickle |
|
|
|
from dotenv import load_dotenv |
|
from huggingface_hub import login |
|
from documents import load_pdf_as_docs, get_doc_chunks |
|
from embeddings import get_jinaai_embeddings |
|
|
|
|
|
|
|
# Load environment variables (e.g. the Hugging Face token) from a local .env file.
load_dotenv()



# Read the token from the environment; raises KeyError if it is missing,
# which fails fast before any expensive document loading happens.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# Authenticate against the Hugging Face Hub (needed for the embedding model).
login(HUGGINGFACEHUB_API_TOKEN)
|
|
|
|
|
def save_to_pickle(obj, filename):
    """Serialize *obj* to *filename* using the highest pickle protocol."""
    with open(filename, "wb") as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
|
|
# Paths for the cache directory and the source PDF documents.
database_root = "./data/db"
document_path = "./data/documents"


# 1. Load PDFs as text documents and cache them.
docs = load_pdf_as_docs(document_path)
save_to_pickle(docs, os.path.join(database_root, "docs.pkl"))


# 2. Chunk the documents and cache the chunks.
# BUG FIX: previously the un-chunked `docs` object was pickled here,
# so "docs_chunks.pkl" never actually contained the chunks.
document_chunks = get_doc_chunks(docs)
save_to_pickle(document_chunks, os.path.join(database_root, "docs_chunks.pkl"))

# 3. Build the embedding model (device auto-selected).
embeddings = get_jinaai_embeddings(device="auto")


# Deferred import: keeps the heavy vector-store dependency out of the
# earlier, cheaper steps.
from vectorestores import get_faiss_vectorestore

vectorstore = get_faiss_vectorestore(embeddings)


# 4. Build the small-to-big (parent-document) retriever; with the save
# flags enabled this persists both the FAISS index and the docstore
# under `database_root`.
from retrievers import get_parent_doc_retriever


parent_doc_retriever = get_parent_doc_retriever(
    docs,
    vectorstore,
    save_path_root=database_root,
    save_vectorstore=True,
    save_docstore=True,
)
|
|