"""Chunk text documents for RAG indexing.

For each supported embedding model, reads every ``.txt`` file from the
documentation directory and writes two alternative chunkings next to it:
fixed-size character chunks and recursive token-aware chunks.
"""
from typing import List
import os

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Per-model chunking parameters: model name -> (chunk_size, chunk_overlap).
# Sizes roughly track each embedding model's input capacity.
_MODEL_CHUNK_PARAMS = {
    "sentence-transformers/all-MiniLM-L6-v2": (500, 50),
    "BAAI/bge-large-en-v1.5": (1000, 100),
    "openai/text-embedding-ada-002": (4096, 200),
}


def fixed_size_split(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split *text* into fixed-size chunks.

    Uses a character-based splitter that breaks on blank lines
    (``"\\n\\n"``), so chunk sizes are measured in characters.

    Args:
        text: Full document text to split.
        chunk_size: Target maximum chunk length in characters.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        List of chunk strings.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_text(text)
    return docs


def recursive_character_split(text: str, chunk_size: int, chunk_overlap: int, model_name: str = "gpt-4o") -> List[str]:
    """Split *text* recursively, measuring lengths in tokens.

    Uses the tiktoken encoder for *model_name*, so ``chunk_size`` and
    ``chunk_overlap`` are in tokens rather than characters. Splits first
    on paragraph breaks, then on line breaks.

    Args:
        text: Full document text to split.
        chunk_size: Target maximum chunk length in tokens.
        chunk_overlap: Number of tokens shared between adjacent chunks.
        model_name: Model whose tokenizer defines token boundaries.

    Returns:
        List of chunk strings.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n"],
        # Treat special tokens as plain text instead of raising on them.
        allowed_special={'<|endoftext|>'},
        disallowed_special=()
    )
    docs = text_splitter.split_text(text)
    return docs


def save_chunks(chunks: List[str], output_dir: str, base_filename: str) -> None:
    """Write each chunk to ``{output_dir}/{base_filename}_chunk_{i}.txt``.

    NOTE(review): base_filename is passed with its ``.txt`` extension
    intact by chunk_documents, so outputs look like
    ``doc.txt_chunk_0.txt`` — confirm this naming is intended.
    """
    for i, chunk in enumerate(chunks):
        # Explicit encoding so output does not depend on the platform locale.
        with open(os.path.join(output_dir, f"{base_filename}_chunk_{i}.txt"), 'w', encoding='utf-8') as f:
            f.write(chunk)


def chunk_documents(embedding_model: str):
    """Chunk every ``.txt`` document in the input directory for one model.

    Creates ``<model>/fixed_chunks`` and ``<model>/recursive_chunks``
    directories beside the input directory and writes both chunkings of
    each document into them.

    Args:
        embedding_model: One of the models in ``_MODEL_CHUNK_PARAMS``.

    Raises:
        ValueError: If *embedding_model* is not a supported model.
    """
    # Look up chunk size and overlap for the embedding model.
    try:
        chunk_size, chunk_overlap = _MODEL_CHUNK_PARAMS[embedding_model]
    except KeyError:
        raise ValueError(f"Unsupported embedding model: {embedding_model}")

    # Directory paths
    input_dir = "/Users/anvereshko/Desktop/rag-gradio-sample-project/gradio_app/documentation"
    model_dir = os.path.join(os.path.dirname(input_dir), f"{embedding_model.replace('/', '_')}")
    fixed_output_dir = os.path.join(model_dir, "fixed_chunks")
    recursive_output_dir = os.path.join(model_dir, "recursive_chunks")

    # Create output directories if they don't exist
    os.makedirs(fixed_output_dir, exist_ok=True)
    os.makedirs(recursive_output_dir, exist_ok=True)

    # Process each document
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            # Explicit encoding so reads do not depend on the platform locale.
            with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
                text = file.read()

            # Chunk using fixed size
            fixed_chunks = fixed_size_split(text, chunk_size, chunk_overlap)
            save_chunks(fixed_chunks, fixed_output_dir, filename)

            # Chunk using recursive character splitting
            recursive_chunks = recursive_character_split(text, chunk_size, chunk_overlap)
            save_chunks(recursive_chunks, recursive_output_dir, filename)


if __name__ == "__main__":
    embedding_models = [
        "sentence-transformers/all-MiniLM-L6-v2",
        "BAAI/bge-large-en-v1.5",
        "openai/text-embedding-ada-002"
    ]
    for model in embedding_models:
        chunk_documents(model)