Fix token limit issues and improve text chunking
- aimakerspace/text_utils.py +30 -4
- app.py +12 -3
aimakerspace/text_utils.py
CHANGED
@@ -40,8 +40,8 @@ class TextFileLoader:
 class CharacterTextSplitter:
     def __init__(
         self,
-        chunk_size: int =
-        chunk_overlap: int =
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
     ):
         assert (
             chunk_size > chunk_overlap
@@ -59,7 +59,17 @@ class CharacterTextSplitter:
             if len(current_chunk) + len(paragraph) > self.chunk_size:
                 if current_chunk:
                     chunks.append(current_chunk.strip())
-                current_chunk = paragraph
+                if len(paragraph) > self.chunk_size:
+                    words = paragraph.split()
+                    current_chunk = ""
+                    for word in words:
+                        if len(current_chunk) + len(word) + 1 > self.chunk_size:
+                            chunks.append(current_chunk.strip())
+                            current_chunk = word
+                        else:
+                            current_chunk += " " + word if current_chunk else word
+                else:
+                    current_chunk = paragraph
             else:
                 if current_chunk:
                     current_chunk += "\n\n" + paragraph
@@ -69,7 +79,23 @@ class CharacterTextSplitter:
         if current_chunk:
             chunks.append(current_chunk.strip())
 
-        return chunks
+        final_chunks = []
+        for chunk in chunks:
+            if len(chunk) > 8000:
+                words = chunk.split()
+                current = ""
+                for word in words:
+                    if len(current) + len(word) + 1 > 8000:
+                        final_chunks.append(current.strip())
+                        current = word
+                    else:
+                        current += " " + word if current else word
+                if current:
+                    final_chunks.append(current.strip())
+            else:
+                final_chunks.append(chunk)
+
+        return final_chunks
 
     def split_texts(self, texts: List[str]) -> List[str]:
         chunks = []
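The two splitter hunks share one fallback: any paragraph that exceeds chunk_size, and any finished chunk longer than 8000 characters, is re-split word by word so no emitted chunk exceeds the budget. Below is a minimal standalone sketch of that fallback; split_oversized is a hypothetical helper name used for illustration, not a function in the repo.

from typing import List


def split_oversized(text: str, max_len: int) -> List[str]:
    # Re-split oversized text on whitespace, mirroring the diff's inner loop.
    chunks: List[str] = []
    current = ""
    for word in text.split():
        # The +1 accounts for the joining space, as in the diff's length check.
        if len(current) + len(word) + 1 > max_len:
            if current:  # guard added here; the diff appends unconditionally
                chunks.append(current.strip())
            current = word
        else:
            current += " " + word if current else word
    if current:
        chunks.append(current.strip())
    return chunks


pieces = split_oversized("lorem " * 500, max_len=1000)  # one 3000-char "paragraph"
print([len(p) for p in pieces])  # every piece stays under 1000 characters

Note that a single word longer than max_len still passes through unsplit; the diff's code has the same edge case.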
app.py
CHANGED
@@ -31,14 +31,23 @@ class RetrievalAugmentedQAPipeline:
         self.vector_db_retriever = vector_db_retriever
 
     async def arun_pipeline(self, user_query: str):
-
-
+        # Get more contexts but limit the total length
+        context_list = self.vector_db_retriever.search_by_text(user_query, k=6)
+
+        # Limit total context length to approximately 6000 tokens (24000 characters)
         context_prompt = ""
+        total_length = 0
+        max_length = 24000  # Rough estimate: 1 token ≈ 4 characters
+
         for context in context_list:
+            if total_length + len(context[0]) > max_length:
+                break
             context_prompt += context[0] + "\n"
+            total_length += len(context[0])
+
+        print(f"Using {len(context_prompt.split())} words of context")
 
         formatted_system_prompt = system_role_prompt.create_message()
-
         formatted_user_prompt = user_role_prompt.create_message(question=user_query, context=context_prompt)
 
         async def generate_response():
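On the app side, retrieval now asks for six results (k=6) and then trims them to a ~24,000-character budget, using the rough heuristic that one token is about four characters. Here is a standalone sketch of that budgeting loop; search_results stands in for the retriever's output, assumed here to be (text, score) tuples ordered by relevance, which matches the diff indexing context[0] for the text.

def build_context(search_results, max_chars: int = 24000) -> str:
    # Concatenate retrieved texts until the character budget is spent;
    # 24000 chars is roughly 6000 tokens at ~4 characters per token.
    context_prompt = ""
    total_length = 0
    for text, _score in search_results:
        if total_length + len(text) > max_chars:
            break  # the first result that doesn't fit ends the loop
        context_prompt += text + "\n"
        total_length += len(text)
    return context_prompt


results = [("first retrieved chunk", 0.91), ("second retrieved chunk", 0.87)]
print(build_context(results, max_chars=25))  # only the first chunk fits

Because the loop breaks instead of skipping ahead, the prompt keeps a contiguous prefix of the ranking: anything past the first over-budget result is dropped, even if a later, shorter chunk would have fit.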