zaldivards committed
Commit f66e4f3 · 1 Parent(s): 0918d3a

feat: enhance predictions quality


- Introduce a new model that performs better on Spanish texts.
- Normalize the chunks' text.
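
The new model is jinaai/jina-embeddings-v2-base-es, loaded through transformers instead of sentence-transformers (see the app.py diff below). A minimal sketch of how it is loaded and used for embedding, assuming the custom encode() method that trust_remote_code pulls in from the model repository; the sample inputs are illustrative only:

```python
from transformers import AutoModel

# Spanish-focused embedding model introduced in this commit. trust_remote_code=True
# is needed because encode() lives in the model repo's custom modeling code.
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)

# Embed a query and a couple of chunks; app.py feeds the results to numpy directly.
query_embedding = model.encode("¿Qué dice la ley sobre las horas extra?")
chunk_embeddings = model.encode(["primer fragmento de un documento", "segundo fragmento"])
```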

Files changed (2):
  1. README.md +16 -2
  2. app.py +17 -13
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Rag Community Tool Template
+title: ContextqaSV Tool
 emoji: 📊
 colorFrom: indigo
 colorTo: blue
@@ -8,5 +8,19 @@ sdk_version: 4.42.0
 app_file: app.py
 pinned: false
 ---
+# ContextqaSV tool
 
-Clone this space, add your documents to the `sources` folder and use your space directly from HuggingChat!
+HuggingChat tool for querying law documents from El Salvador. This is a forked and customized version of the [RAG community template](https://huggingface.co/spaces/nsarrazin/rag-tool-template/tree/main).
+
+## Updated or added features
+
+* Better model for Spanish texts (this tool targets Spanish-speaking users, and the data sources are in Spanish as well)
+* Chunk normalization
+* Enhanced text-splitting logic based on a target chunk size and the split character (`.`)
+* Cosine similarity instead of the dot product for finding similar texts
+* Cached embeddings (local development)
+* Typed functions
+* Simplified existing functions
+
+### Contact
+For inquiries, contributions, or feedback, please contact [me](mailto:[email protected]). Visit my [github account](https://github.com/zaldivards) and check out my other projects :)
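
The "chunk normalization" and "enhanced text-splitting" items above correspond to the generate_chunks changes in app.py below. A minimal sketch of the resulting splitting logic; the diff only shows the inner loop, so the surrounding lines (building segments by splitting on "." and flushing the final chunk) are assumptions:

```python
import re


def generate_chunks(text: str, max_length: int) -> list[str]:
    """Split text on '.' and pack the pieces into chunks of roughly max_length characters."""
    segments = text.split(".")  # assumed: the hunk below starts after segments is built
    chunks: list[str] = []
    chunk = ""
    for current_segment in segments:
        # normalize the segment: collapse whitespace runs and trim the edges
        current_segment = re.sub(r"\s+", " ", current_segment).strip()
        if len(chunk) < max_length:
            chunk += f". {current_segment}"
        else:
            chunks.append(chunk)
            chunk = current_segment
    if chunk:  # assumed: flush whatever is left after the loop
        chunks.append(chunk)
    return chunks
```

With the CHUNK_SIZE default of 1000 from app.py, this yields sentence-aligned chunks of roughly a thousand characters each.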
app.py CHANGED
@@ -1,20 +1,21 @@
-import os
 import glob
+import os
 import pickle
+import re
 from pathlib import Path
 
 import gradio as gr
 import spaces
 import numpy as np
 from pypdf import PdfReader
-from sentence_transformers import SentenceTransformer
+from transformers import AutoModel
 
 
-model_name = os.environ.get("MODEL", "Snowflake/snowflake-arctic-embed-m")
 chunk_size = int(os.environ.get("CHUNK_SIZE", 1000))
 default_k = int(os.environ.get("DEFAULT_K", 5))
 
-model = SentenceTransformer(model_name)
+model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
+
 docs = {}
 
 
@@ -99,8 +100,10 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
     chunk = ""
 
     for current_segment in segments:
+        # try to normalize the current chunk
+        current_segment = re.sub(r"\s+", " ", current_segment).strip()
         if len(chunk) < max_length:
-            chunk += current_segment
+            chunk += f". {current_segment}"
         else:
             chunks.append(chunk)
             chunk = current_segment
@@ -133,23 +136,23 @@ def predict(query: str, k: int = 5) -> str:
 
     """
     # Embed the query
-    query_embedding = model.encode(query, prompt_name="query")
+    query_embedding = model.encode(query)
 
     # Initialize a list to store all chunks and their similarities across all documents
     all_chunks = []
     # Iterate through all documents
-    for doc in docs.values():
-        # Calculate dot product between query and document embeddings
+    for filename, doc in docs.items():
+        # Calculate cosine similarity between the query and the document embeddings
         similarities = np.dot(doc["embeddings"], query_embedding) / (
             np.linalg.norm(doc["embeddings"]) * np.linalg.norm(query_embedding)
         )
         # Add chunks and similarities to the all_chunks list
-        all_chunks.extend(list(zip(doc["chunks"], similarities)))
+        all_chunks.extend([(filename, chunk, sim) for chunk, sim in zip(doc["chunks"], similarities)])
 
     # Sort all chunks by similarity
-    all_chunks.sort(key=lambda x: x[1], reverse=True)
+    all_chunks.sort(key=lambda x: x[2], reverse=True)
 
-    return "CONTEXT:\n\n" + "\n\n".join(chunk for chunk, _ in all_chunks[:k])
+    return "CONTEXT:\n\n" + "\n\n".join(f"{filename}: {chunk}" for filename, chunk, _ in all_chunks[:k])
 
 
 def init():
@@ -158,7 +161,7 @@ def init():
     It will load or calculate the embeddings
     """
     global docs  # pylint: disable=W0603
-    embeddings_file = Path("embeddings.pickle")
+    embeddings_file = Path("embeddings-es.pickle")
     if embeddings_file.exists():
         with open(embeddings_file, "rb") as embeddings_pickle:
             docs = pickle.load(embeddings_pickle)
@@ -167,7 +170,8 @@
             converted_doc = convert(filename)
             chunks = generate_chunks(converted_doc, chunk_size)
             embeddings = model.encode(chunks)
-            docs[filename] = {
+            # get the filename and slugify it
+            docs[filename.rsplit("/", 1)[-1].lower().replace(" ", "-")] = {
                 "chunks": chunks,
                 "embeddings": embeddings,
            }
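
Taken together, these hunks implement the retrieval path the README describes: embed the query with the new model, score every cached chunk against it, and return the top-k chunks prefixed with their slugified source filename. Below is a self-contained sketch of that flow with a toy encoder standing in for the real model; note it normalizes per chunk (axis=1) to get a true cosine score, whereas the hunk above takes the norm of the whole embeddings matrix:

```python
import numpy as np


def encode(texts):
    """Toy stand-in for model.encode(); the Space uses jinaai/jina-embeddings-v2-base-es."""
    rng = np.random.default_rng(0)
    return rng.normal(size=8) if isinstance(texts, str) else rng.normal(size=(len(texts), 8))


# docs maps a slugified filename to its chunks and their precomputed embeddings,
# mirroring the structure that init() builds and caches in embeddings-es.pickle.
docs = {
    "codigo-de-trabajo.pdf": {
        "chunks": ["Art. 1 ...", "Art. 2 ..."],
        "embeddings": encode(["Art. 1 ...", "Art. 2 ..."]),
    }
}


def predict(query: str, k: int = 5) -> str:
    query_embedding = encode(query)
    all_chunks = []
    for filename, doc in docs.items():
        # cosine similarity between the query and every chunk of this document
        similarities = np.dot(doc["embeddings"], query_embedding) / (
            np.linalg.norm(doc["embeddings"], axis=1) * np.linalg.norm(query_embedding)
        )
        all_chunks.extend((filename, chunk, sim) for chunk, sim in zip(doc["chunks"], similarities))
    # highest similarity first, keep the top k across all documents
    all_chunks.sort(key=lambda x: x[2], reverse=True)
    return "CONTEXT:\n\n" + "\n\n".join(f"{filename}: {chunk}" for filename, chunk, _ in all_chunks[:k])


print(predict("horas extra", k=1))
```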