"""Split a text file into chunks, embed them, and persist them in a Chroma store.

Usage:
    python SCRIPT TEXT_FILENAME PERSIST_DIRECTORY
"""
import sys

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Value passed to the text splitter as ``chunk_size``.
CHUNK_SIZE = 500


def get_documents(filename: str):
    """Load *filename* with ``TextLoader`` and return the loaded documents."""
    loader = TextLoader(filename)
    return loader.load()


def get_text_chunk(docs):
    """Split *docs* into chunks and return them.

    Uses ``RecursiveCharacterTextSplitter`` with ``chunk_size=CHUNK_SIZE``
    and no overlap between consecutive chunks (``chunk_overlap=0``).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    )
    return text_splitter.split_documents(docs)


def store(texts, dir_name) -> None:
    """Embed *texts* and persist them in a Chroma store under *dir_name*.

    NOTE(review): ``OpenAIEmbeddings()`` is constructed with no arguments, so
    it presumably picks up its API credentials from the environment -- confirm
    before running in a new deployment.
    """
    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(texts, embeddings, persist_directory=dir_name)
    db.persist()


def main(filename: str, dir_name: str) -> None:
    """Run the pipeline: load *filename*, chunk it, and store it in *dir_name*."""
    docs = get_documents(filename)
    texts = get_text_chunk(docs)
    store(texts, dir_name)


if __name__ == "__main__":
    args = sys.argv
    if len(args) != 3:
        # Exactly two arguments are required. ``sys.exit`` with a string
        # writes it to stderr and exits with status 1, so misuse is visible
        # to calling shells instead of looking like a successful run.
        sys.exit(f"usage: {args[0]} TEXT_FILENAME PERSIST_DIRECTORY")
    main(args[1], args[2])