File size: 4,033 Bytes
dae990d
 
 
 
 
fc2bb77
dae990d
 
 
 
 
 
 
 
 
 
 
 
fc2bb77
dae990d
 
 
 
 
 
 
fc2bb77
dae990d
 
 
 
 
 
 
 
 
 
fc2bb77
dae990d
fc2bb77
 
 
 
 
 
dae990d
 
 
 
 
 
 
 
 
 
 
 
fc2bb77
 
 
 
 
 
 
 
 
dae990d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc2bb77
 
 
 
 
 
dae990d
 
 
 
 
 
 
 
 
 
fc2bb77
dae990d
 
 
 
 
 
 
fc2bb77
dae990d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import argparse

import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, LatexTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document



def _parse_bool_flag(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` cannot be used for this: argparse would call
    ``bool("False")``, and every non-empty string is truthy, so the option
    could never be switched off explicitly.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string stays False, as it was with ``type=bool``.
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Parse the command line. This runs at import time because the constants
# below are read by the ``__main__`` section of this script.
parser = argparse.ArgumentParser()
parser.add_argument("folder", help="The folder to be ingested", type=str)
parser.add_argument("--chunk_size", help="Chunk size", type=int, default=1500)
parser.add_argument('--chunk_overlap', help='Chunk overlap', type=int, default=400)
parser.add_argument('--separator', help='Separator', type=str, default='\n')
# nargs='?' + const=True: both "--use_tex_splitter" and
# "--use_tex_splitter True/False" are accepted.
parser.add_argument(
    '--use_tex_splitter',
    help='Use tex splitter',
    type=_parse_bool_flag,
    nargs='?',
    const=True,
    default=False,
)

args = parser.parse_args()

# Module-level aliases for the parsed options.
FOLDER = args.folder
CHUNK_SIZE = args.chunk_size
CHUNK_OVERLAP = args.chunk_overlap
SEPARATOR = args.separator
USE_TEX_SPLITTER = args.use_tex_splitter


class Ingest():
    """Load the files of one module folder and index them in a Chroma store.

    Files are read from ``./data/<folder>``. ``.pdf``, ``.txt`` and ``.tex``
    files are picked up; everything else in the folder is ignored. Each
    document is tagged with ``module=<folder>`` before splitting, which is
    the key used to delete a module's previous entries on re-ingestion.
    """

    def __init__(
            self,
            folder,
            chunk_size,
            separator,
            chunk_overlap,
            use_tex_splitter,
    ):
        """Open the vector store, drop the module's old entries, build the splitter.

        Args:
            folder: Name of the sub-folder of ``./data`` to ingest; also used
                as the ``module`` metadata value.
            chunk_size: Chunk size handed to the text splitter.
            separator: Separator for ``CharacterTextSplitter`` (ignored when
                ``use_tex_splitter`` is true).
            chunk_overlap: Chunk overlap handed to the text splitter.
            use_tex_splitter: If true, use ``LatexTextSplitter`` instead of
                ``CharacterTextSplitter``.
        """
        self.vectorstore = Chroma(persist_directory='./chroma', embedding_function=OpenAIEmbeddings())
        print(f"Count of {self.vectorstore._collection.count()} in vectostore")
        print(f"Deleting previous items from {folder}")
        # Remove everything previously stored for this module so that a
        # re-run replaces the entries instead of duplicating them.
        self.vectorstore._collection.delete(where={'module' : folder})
        print(f"New count, {self.vectorstore._collection.count()}")

        self.folder = folder
        self.chunk_size = chunk_size

        self.data_path = os.path.join('./data', self.folder)

        # Build only the splitter that will actually be used.
        if use_tex_splitter:
            self.splitter = LatexTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            self.splitter = CharacterTextSplitter(
                separator=separator,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
            )

    @staticmethod
    def _source_name(path):
        """Return the file name of *path* without its final extension.

        ``os.path.splitext`` strips only the last suffix, so ``notes.v2.pdf``
        becomes ``notes.v2`` rather than being cut at the first dot.
        """
        return os.path.splitext(os.path.basename(path))[0]

    def _paths_with_suffix(self, suffix):
        """Return the full paths of the files in the data folder ending in *suffix*."""
        return [
            os.path.join(self.data_path, name)
            for name in os.listdir(self.data_path)
            if name.endswith(suffix)
        ]

    def _read_text(self, path):
        """Return the whole content of the text file at *path*, decoded as UTF-8."""
        # Explicit encoding: the platform default differs between systems.
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def _load_tex(self, path):
        """Return the raw LaTeX source stored at *path*."""
        return self._read_text(path)

    def ingest(self):
        """Load, tag, split and embed every supported file of the module folder.

        Raises:
            FileNotFoundError: if ``./data/<folder>`` does not exist (raised by
                ``os.listdir``).
        """
        documents = []

        # PDFs: the loader returns a list of documents per file; each one gets
        # the PDF's base name as its source.
        for pdf_path in self._paths_with_suffix(".pdf"):
            pages = PyPDFLoader(pdf_path).load()
            source = self._source_name(pdf_path)
            for page in pages:
                page.metadata['source'] = source
            documents.extend(pages)

        # Plain-text files, labelled as transcripts.
        for txt_path in self._paths_with_suffix(".txt"):
            documents.append(Document(
                page_content=self._read_text(txt_path),
                metadata={"source": self._source_name(txt_path) + ' transcript'},
            ))

        # LaTeX sources.
        # NOTE(review): these are also labelled ' transcript', which looks
        # copied from the .txt branch - confirm the intended label.
        for tex_path in self._paths_with_suffix(".tex"):
            documents.append(Document(
                page_content=self._load_tex(tex_path),
                metadata={"source": self._source_name(tex_path) + ' transcript'},
            ))

        # Tag everything with the module so __init__ can delete it on re-ingest.
        for doc in documents:
            doc.metadata['module'] = self.folder

        # split texts into chunks
        print("Splitting texts into chunks...")
        chunks = self.splitter.split_documents(documents)
        if not chunks:
            # Nothing to store; skip the embedding call instead of sending an
            # empty batch (presumably rejected by some Chroma versions - verify).
            print(f"No content found in {self.data_path}, nothing to embed")
            return

        # create store
        print("Embedding chunks...")
        self.vectorstore.add_texts(
            texts=[chunk.page_content for chunk in chunks],
            metadatas=[chunk.metadata for chunk in chunks],
        )

if __name__ == "__main__":
    # Script entry point: forward the command-line settings to the ingester
    # and run a full ingestion of the requested folder.
    cli_settings = {
        "folder": FOLDER,
        "chunk_size": CHUNK_SIZE,
        "separator": SEPARATOR,
        "chunk_overlap": CHUNK_OVERLAP,
        "use_tex_splitter": USE_TEX_SPLITTER,
    }
    ingest = Ingest(**cli_settings)
    ingest.ingest()