# NOTE: scrape artifact from the hosting page header ("Spaces: Running Running") — not part of the script.
from typing import List | |
import os | |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter | |
def fixed_size_split(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split *text* into fixed-size chunks, breaking on blank lines.

    Thin wrapper around LangChain's ``CharacterTextSplitter`` using a
    paragraph ("\n\n") separator.

    :param text: full document text to split
    :param chunk_size: target size of each chunk
    :param chunk_overlap: number of characters shared between adjacent chunks
    :return: list of chunk strings
    """
    splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
def recursive_character_split(text: str, chunk_size: int, chunk_overlap: int, model_name: str = "gpt-4o") -> List[str]:
    """Recursively split *text* with token-aware sizing.

    Chunk sizes are measured in tiktoken tokens for *model_name*; the
    splitter prefers paragraph breaks ("\n\n") and falls back to line
    breaks ("\n").

    :param text: full document text to split
    :param chunk_size: target chunk size in tokens
    :param chunk_overlap: token overlap between adjacent chunks
    :param model_name: model whose tokenizer defines token boundaries
    :return: list of chunk strings
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n"],
        allowed_special={'<|endoftext|>'},
        disallowed_special=(),
    )
    return splitter.split_text(text)
def save_chunks(chunks: List[str], output_dir: str, base_filename: str) -> None:
    """Write each chunk to ``<output_dir>/<base_filename>_chunk_<i>.txt``.

    Fix: open output files with an explicit UTF-8 encoding so the written
    bytes do not depend on the platform's locale default.

    :param chunks: chunk strings to persist, one file per chunk
    :param output_dir: existing directory to write into
    :param base_filename: prefix for each chunk file (the caller passes the
        source filename, so names look like ``doc.txt_chunk_0.txt``)
    """
    for i, chunk in enumerate(chunks):
        path = os.path.join(output_dir, f"{base_filename}_chunk_{i}.txt")
        with open(path, 'w', encoding='utf-8') as f:
            f.write(chunk)
def chunk_documents(
    embedding_model: str,
    input_dir: str = "/Users/anvereshko/Desktop/rag-gradio-sample-project/gradio_app/documentation",
) -> None:
    """Chunk every ``.txt`` file in *input_dir* with both splitting strategies.

    Output goes to two sibling directories of *input_dir*, named after the
    embedding model: ``<model>/fixed_chunks`` and ``<model>/recursive_chunks``.

    Fixes/generalization: the previously hard-coded input directory is now a
    parameter (defaulting to the original path, so existing callers are
    unaffected); input files are read with explicit UTF-8; the if/elif chain
    is a lookup table.

    :param embedding_model: one of the supported embedding model ids
    :param input_dir: directory containing the ``.txt`` documents to chunk
    :raises ValueError: if *embedding_model* is not supported
    """
    # (chunk_size, chunk_overlap) tuned per embedding model — presumably
    # chosen to fit each model's context window; TODO confirm with the
    # original author.
    params = {
        "sentence-transformers/all-MiniLM-L6-v2": (500, 50),
        "BAAI/bge-large-en-v1.5": (1000, 100),
        "openai/text-embedding-ada-002": (4096, 200),
    }
    try:
        chunk_size, chunk_overlap = params[embedding_model]
    except KeyError:
        raise ValueError(f"Unsupported embedding model: {embedding_model}") from None

    # Output layout: <parent-of-input>/<model-with-slashes-escaped>/{fixed,recursive}_chunks
    model_dir = os.path.join(os.path.dirname(input_dir), embedding_model.replace('/', '_'))
    fixed_output_dir = os.path.join(model_dir, "fixed_chunks")
    recursive_output_dir = os.path.join(model_dir, "recursive_chunks")
    os.makedirs(fixed_output_dir, exist_ok=True)
    os.makedirs(recursive_output_dir, exist_ok=True)

    # Process each .txt document with both strategies.
    for filename in os.listdir(input_dir):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
            text = file.read()

        fixed_chunks = fixed_size_split(text, chunk_size, chunk_overlap)
        save_chunks(fixed_chunks, fixed_output_dir, filename)

        recursive_chunks = recursive_character_split(text, chunk_size, chunk_overlap)
        save_chunks(recursive_chunks, recursive_output_dir, filename)
if __name__ == "__main__":
    # Generate both chunk sets for every supported embedding model.
    for model in (
        "sentence-transformers/all-MiniLM-L6-v2",
        "BAAI/bge-large-en-v1.5",
        "openai/text-embedding-ada-002",
    ):
        chunk_documents(model)