File size: 1,067 Bytes
11ed8cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


# Value passed as ``chunk_size`` to RecursiveCharacterTextSplitter in
# get_text_chunk(); it caps the size of each chunk produced by the splitter.
# NOTE(review): presumably measured in characters (the splitter's default
# length function) — confirm against the langchain version in use.
CHUNK_SIZE = 500


def get_documents(filename: str):
    """Load *filename* through ``TextLoader`` and return the loaded documents.

    Args:
        filename: Path of the text file handed to ``TextLoader``.

    Returns:
        Whatever ``TextLoader(filename).load()`` returns.
    """
    return TextLoader(filename).load()


def get_text_chunk(docs):
    """Split *docs* into chunks with a ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=CHUNK_SIZE`` and no overlap
    between consecutive chunks (``chunk_overlap=0``).

    Args:
        docs: Documents to split, as accepted by ``split_documents``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    )
    return splitter.split_documents(docs)


def store(texts, dir_name):
    """Embed *texts* and persist them in a Chroma store under *dir_name*.

    Args:
        texts: Documents passed to ``Chroma.from_documents``.
        dir_name: Passed to Chroma as ``persist_directory``.

    Returns:
        None. The function is called for its side effect of persisting
        the vector store.
    """
    embedder = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        texts,
        embedder,
        persist_directory=dir_name,
    )
    # Explicitly flush the store to ``dir_name`` before returning.
    vector_store.persist()


def main(filename: str, dir_name: str):
    """Run the ingestion pipeline: load, split, then store.

    Args:
        filename: Text file passed to ``get_documents``.
        dir_name: Directory passed to ``store`` as the persistence target.
    """
    # Each stage consumes the previous stage's output, so the pipeline is
    # expressed as a single composition: load -> chunk -> store.
    store(get_text_chunk(get_documents(filename)), dir_name)


if __name__ == "__main__":
    import sys

    # Exactly two positional arguments are required: the text file to ingest
    # and the directory that main() forwards to store() for persistence.
    if len(sys.argv) != 3:
        # sys.exit() with a string writes it to stderr and exits with
        # status 1, so a usage error is not mistaken for a successful run.
        sys.exit(f"Usage: {sys.argv[0]} <text_filename> <persist_dir_name>")

    main(sys.argv[1], sys.argv[2])