pyhackcon-qa2 / store.py
terapyon's picture
added base script
e6cff8e
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Value passed as `chunk_size` to RecursiveCharacterTextSplitter in
# get_text_chunk(). NOTE(review): presumably measured in characters (the
# splitter's default length function) -- confirm for the langchain version used.
CHUNK_SIZE = 500
def get_documents(filename: str):
    """Load the text file at ``filename`` and return the loaded documents.

    The file is read through ``TextLoader``; the return value is whatever
    ``TextLoader.load()`` produces for that file.
    """
    return TextLoader(filename).load()
def get_text_chunk(docs, *, chunk_size=None, chunk_overlap: int = 0):
    """Split loaded documents into smaller chunks.

    Args:
        docs: Documents to split, as returned by ``get_documents``.
        chunk_size: Value forwarded to the splitter's ``chunk_size``.
            ``None`` (the default) means "use the module-level
            ``CHUNK_SIZE``", which preserves the previous behaviour.
        chunk_overlap: Value forwarded to the splitter's ``chunk_overlap``.
            Defaults to 0, matching the previous hard-coded value.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    # Resolve the default at call time (not at definition time) so the
    # function always follows the current value of CHUNK_SIZE.
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
def store(texts, dir_name):
    """Embed ``texts`` and persist them in a Chroma store under ``dir_name``.

    Builds the store with ``Chroma.from_documents`` using a fresh
    ``OpenAIEmbeddings`` instance, then calls ``persist()`` on it.
    Returns nothing.
    """
    vector_db = Chroma.from_documents(
        texts,
        OpenAIEmbeddings(),
        persist_directory=dir_name,
    )
    vector_db.persist()
def main(filename: str, dir_name: str):
    """Run the full pipeline: load ``filename``, split it, store the chunks.

    ``filename`` is handed to ``get_documents``; the resulting chunks from
    ``get_text_chunk`` are written via ``store`` using ``dir_name`` as the
    persist directory.
    """
    chunks = get_text_chunk(get_documents(filename))
    store(chunks, dir_name)
if __name__ == "__main__":
    import sys

    # The script needs exactly two positional arguments after its own name:
    # the text file to ingest and the directory to persist the store in.
    if len(sys.argv) != 3:
        # sys.exit() with a string writes it to stderr and exits with
        # status 1, so misuse is not reported to the caller as success.
        sys.exit(f"Usage: {sys.argv[0]} <text filename> <persist directory>")

    main(sys.argv[1], sys.argv[2])