File size: 3,007 Bytes
939262b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from typing import List
import os
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

def fixed_size_split(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split *text* into fixed-size chunks on blank-line boundaries.

    Delegates to LangChain's ``CharacterTextSplitter`` using ``"\\n\\n"``
    (paragraph breaks) as the separator.

    Args:
        text: Raw document text to split.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between adjacent chunks passed to the splitter.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

def recursive_character_split(text: str, chunk_size: int, chunk_overlap: int, model_name: str = "gpt-4o") -> List[str]:
    """Split *text* recursively, sizing chunks by tiktoken token count.

    Delegates to LangChain's
    ``RecursiveCharacterTextSplitter.from_tiktoken_encoder`` so that
    ``chunk_size``/``chunk_overlap`` are measured in tokens of the encoding
    used by *model_name*, trying paragraph breaks before line breaks.

    Args:
        text: Raw document text to split.
        chunk_size: Maximum chunk length in tokens.
        chunk_overlap: Token overlap between adjacent chunks.
        model_name: Model whose tiktoken encoding sizes the chunks.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n"],
        allowed_special={'<|endoftext|>'},
        disallowed_special=(),
    )
    return splitter.split_text(text)

def save_chunks(chunks: List[str], output_dir: str, base_filename: str) -> None:
    """Write each chunk to its own numbered ``.txt`` file in *output_dir*.

    Files are named ``{base_filename}_chunk_{i}.txt`` where ``i`` is the
    zero-based chunk index.

    Args:
        chunks: Chunk strings to persist (an empty list writes nothing).
        output_dir: Existing directory to write into.
        base_filename: Prefix for each output filename.
    """
    for i, chunk in enumerate(chunks):
        path = os.path.join(output_dir, f"{base_filename}_chunk_{i}.txt")
        # Explicit UTF-8 so output does not depend on the platform's
        # default locale encoding (fix: the original used the default).
        with open(path, 'w', encoding='utf-8') as f:
            f.write(chunk)

def chunk_documents(
    embedding_model: str,
    input_dir: str = "/Users/anvereshko/Desktop/rag-gradio-sample-project/gradio_app/documentation",
) -> None:
    """Chunk every ``.txt`` file in *input_dir* with two strategies.

    For each document, produces fixed-size chunks (via
    ``fixed_size_split``) and recursive token-based chunks (via
    ``recursive_character_split``), writing them under sibling
    ``fixed_chunks`` and ``recursive_chunks`` directories inside a
    per-model output directory next to *input_dir*.

    Args:
        embedding_model: One of the supported embedding model identifiers;
            selects the chunk size/overlap used for both strategies.
        input_dir: Directory containing the source ``.txt`` documents.
            Defaults to the original hard-coded path for backward
            compatibility.

    Raises:
        ValueError: If *embedding_model* is not a supported model.
    """
    # Chunking parameters tuned per embedding model: (chunk_size, chunk_overlap).
    chunk_params = {
        "sentence-transformers/all-MiniLM-L6-v2": (500, 50),
        "BAAI/bge-large-en-v1.5": (1000, 100),
        "openai/text-embedding-ada-002": (4096, 200),
    }
    try:
        chunk_size, chunk_overlap = chunk_params[embedding_model]
    except KeyError:
        raise ValueError(f"Unsupported embedding model: {embedding_model}") from None

    # Output lives next to input_dir, namespaced by the (slash-sanitized)
    # model name so different models never overwrite each other's chunks.
    model_dir = os.path.join(os.path.dirname(input_dir), f"{embedding_model.replace('/', '_')}")
    fixed_output_dir = os.path.join(model_dir, "fixed_chunks")
    recursive_output_dir = os.path.join(model_dir, "recursive_chunks")

    os.makedirs(fixed_output_dir, exist_ok=True)
    os.makedirs(recursive_output_dir, exist_ok=True)

    # Process each .txt document with both chunking strategies.
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            # Explicit UTF-8 read (fix: original relied on the platform
            # default encoding).
            with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
                text = file.read()

            fixed_chunks = fixed_size_split(text, chunk_size, chunk_overlap)
            save_chunks(fixed_chunks, fixed_output_dir, filename)

            recursive_chunks = recursive_character_split(text, chunk_size, chunk_overlap)
            save_chunks(recursive_chunks, recursive_output_dir, filename)

if __name__ == "__main__":
    # Run the chunking pipeline once per supported embedding model.
    supported_models = (
        "sentence-transformers/all-MiniLM-L6-v2",
        "BAAI/bge-large-en-v1.5",
        "openai/text-embedding-ada-002",
    )
    for embedding_model in supported_models:
        chunk_documents(embedding_model)