atolat30 committed
Commit 51ee036 · 1 Parent(s): 83f80d2

Fix token limit issues and improve text chunking

Files changed (2)
  1. aimakerspace/text_utils.py +30 -4
  2. app.py +12 -3
aimakerspace/text_utils.py CHANGED
@@ -40,8 +40,8 @@ class TextFileLoader:
 class CharacterTextSplitter:
     def __init__(
         self,
-        chunk_size: int = 2000,
-        chunk_overlap: int = 400,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
     ):
         assert (
             chunk_size > chunk_overlap
@@ -59,7 +59,17 @@ class CharacterTextSplitter:
         if len(current_chunk) + len(paragraph) > self.chunk_size:
             if current_chunk:
                 chunks.append(current_chunk.strip())
-            current_chunk = paragraph
+            if len(paragraph) > self.chunk_size:
+                words = paragraph.split()
+                current_chunk = ""
+                for word in words:
+                    if len(current_chunk) + len(word) + 1 > self.chunk_size:
+                        chunks.append(current_chunk.strip())
+                        current_chunk = word
+                    else:
+                        current_chunk += " " + word if current_chunk else word
+            else:
+                current_chunk = paragraph
         else:
             if current_chunk:
                 current_chunk += "\n\n" + paragraph
@@ -69,7 +79,23 @@ class CharacterTextSplitter:
         if current_chunk:
             chunks.append(current_chunk.strip())
 
-        return chunks
+        final_chunks = []
+        for chunk in chunks:
+            if len(chunk) > 8000:
+                words = chunk.split()
+                current = ""
+                for word in words:
+                    if len(current) + len(word) + 1 > 8000:
+                        final_chunks.append(current.strip())
+                        current = word
+                    else:
+                        current += " " + word if current else word
+                if current:
+                    final_chunks.append(current.strip())
+            else:
+                final_chunks.append(chunk)
+
+        return final_chunks
 
     def split_texts(self, texts: List[str]) -> List[str]:
         chunks = []
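
The substantive change to split, beyond the smaller defaults (chunk_size 1000, chunk_overlap 200), is the word-level fallback for paragraphs longer than chunk_size, backed by a final pass that re-splits any chunk over 8000 characters. A standalone sketch of that fallback, with a hypothetical function name so it can be read outside the class:

from typing import List

def split_long_paragraph(paragraph: str, chunk_size: int = 1000) -> List[str]:
    # Same greedy word loop as the diff: accumulate words until adding one
    # more (plus its joining space) would push the chunk past chunk_size.
    chunks: List[str] = []
    current_chunk = ""
    for word in paragraph.split():
        if len(current_chunk) + len(word) + 1 > chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = word
        else:
            current_chunk += " " + word if current_chunk else word
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

# A ~6000-character paragraph comes back as several chunks, each under 1000 chars.
pieces = split_long_paragraph("lorem " * 1000)
assert all(len(p) <= 1000 for p in pieces)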
app.py CHANGED
@@ -31,14 +31,23 @@ class RetrievalAugmentedQAPipeline:
         self.vector_db_retriever = vector_db_retriever
 
     async def arun_pipeline(self, user_query: str):
-        context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
-
+        # Get more contexts but limit the total length
+        context_list = self.vector_db_retriever.search_by_text(user_query, k=6)
+
+        # Limit total context length to approximately 6000 tokens (24000 characters)
         context_prompt = ""
+        total_length = 0
+        max_length = 24000  # Rough estimate: 1 token ≈ 4 characters
+
         for context in context_list:
+            if total_length + len(context[0]) > max_length:
+                break
             context_prompt += context[0] + "\n"
+            total_length += len(context[0])
+
+        print(f"Using {len(context_prompt.split())} words of context")
 
         formatted_system_prompt = system_role_prompt.create_message()
-
         formatted_user_prompt = user_role_prompt.create_message(question=user_query, context=context_prompt)
 
         async def generate_response():
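
The 24000-character cap relies on the rough heuristic that one token is about four characters, so the accumulated context stays near 6000 tokens. A standalone sketch of the capping loop, assuming search_by_text returns (text, score) pairs as the context[0] indexing in the diff suggests:

from typing import List, Tuple

def cap_context(results: List[Tuple[str, float]], max_length: int = 24000) -> str:
    # Greedily take passages in ranked order; stop before the running
    # character total would exceed max_length (~6000 tokens at 4 chars each).
    context_prompt = ""
    total_length = 0
    for text, _score in results:
        if total_length + len(text) > max_length:
            break
        context_prompt += text + "\n"
        total_length += len(text)
    return context_prompt

# Six ~5000-character passages: only the first four fit under the 24000 cap.
results = [(f"passage {i} " * 500, 1.0 - i / 10) for i in range(6)]
assert len(cap_context(results)) <= 24000 + 6  # newline separators aren't counted

Note the cap counts only passage characters, not the newline separators, which mirrors the diff.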