zaldivards committed
Commit f66e4f3 · 1 Parent(s): 0918d3a

feat: enhance predictions quality


- Introduce a new model that performs better on Spanish texts.
- Normalize the chunks' text.
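
The new model is jinaai/jina-embeddings-v2-base-es, loaded through transformers instead of sentence-transformers (see the app.py diff below). A minimal sketch of how it is loaded and used for embedding, assuming the custom encode() method that trust_remote_code pulls in from the model repository; the sample inputs are illustrative only:

```python
from transformers import AutoModel

# Spanish-focused embedding model introduced in this commit. trust_remote_code=True
# is needed because encode() lives in the model repo's custom modeling code.
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)

# Embed a query and a couple of chunks; app.py feeds the results to numpy directly.
query_embedding = model.encode("¿Qué dice la ley sobre las horas extra?")
chunk_embeddings = model.encode(["primer fragmento de un documento", "segundo fragmento"])
```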

Files changed (2):
  1. README.md +16 -2
  2. app.py +17 -13
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Rag Community Tool Template
+title: ContextqaSV Tool
 emoji: 📊
 colorFrom: indigo
 colorTo: blue
@@ -8,5 +8,19 @@ sdk_version: 4.42.0
 app_file: app.py
 pinned: false
 ---
+# ContextqaSV tool
 
-Clone this space, add your documents to the `sources` folder and use your space directly from HuggingChat!
+HuggingChat tool for querying law documents from El Salvador. This is a forked and customized version of the [RAG community template](https://huggingface.co/spaces/nsarrazin/rag-tool-template/tree/main).
+
+## Updated or added features
+
+* Better model for Spanish texts (this tool targets Spanish-speaking users, and the data sources are in Spanish as well)
+* Chunk normalization
+* Enhanced text-splitting logic based on a target chunk size and the split character (`.`)
+* Cosine similarity instead of the dot product for finding similar texts
+* Cached embeddings (local development)
+* Typed functions
+* Simplified existing functions
+
+### Contact
+For inquiries, contributions, or feedback, please contact [me](mailto:[email protected]). Visit my [github account](https://github.com/zaldivards) and check out my other projects :)
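
The "chunk normalization" and "enhanced text-splitting" items above correspond to the generate_chunks changes in app.py below. A minimal sketch of the resulting splitting logic; the diff only shows the inner loop, so the surrounding lines (building segments by splitting on "." and flushing the final chunk) are assumptions:

```python
import re


def generate_chunks(text: str, max_length: int) -> list[str]:
    """Split text on '.' and pack the pieces into chunks of roughly max_length characters."""
    segments = text.split(".")  # assumed: the hunk below starts after segments is built
    chunks: list[str] = []
    chunk = ""
    for current_segment in segments:
        # normalize the segment: collapse whitespace runs and trim the edges
        current_segment = re.sub(r"\s+", " ", current_segment).strip()
        if len(chunk) < max_length:
            chunk += f". {current_segment}"
        else:
            chunks.append(chunk)
            chunk = current_segment
    if chunk:  # assumed: flush whatever is left after the loop
        chunks.append(chunk)
    return chunks
```

With the CHUNK_SIZE default of 1000 from app.py, this yields sentence-aligned chunks of roughly a thousand characters each.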
app.py CHANGED
@@ -1,20 +1,21 @@
-import os
 import glob
+import os
 import pickle
+import re
 from pathlib import Path
 
 import gradio as gr
 import spaces
 import numpy as np
 from pypdf import PdfReader
-from sentence_transformers import SentenceTransformer
+from transformers import AutoModel
 
 
-model_name = os.environ.get("MODEL", "Snowflake/snowflake-arctic-embed-m")
 chunk_size = int(os.environ.get("CHUNK_SIZE", 1000))
 default_k = int(os.environ.get("DEFAULT_K", 5))
 
-model = SentenceTransformer(model_name)
+model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
+
 docs = {}
 
 
@@ -99,8 +100,10 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
     chunk = ""
 
     for current_segment in segments:
+        # try to normalize the current chunk
+        current_segment = re.sub(r"\s+", " ", current_segment).strip()
         if len(chunk) < max_length:
-            chunk += current_segment
+            chunk += f". {current_segment}"
         else:
             chunks.append(chunk)
             chunk = current_segment
@@ -133,23 +136,23 @@ def predict(query: str, k: int = 5) -> str:
 
     """
     # Embed the query
-    query_embedding = model.encode(query, prompt_name="query")
+    query_embedding = model.encode(query)
 
     # Initialize a list to store all chunks and their similarities across all documents
     all_chunks = []
     # Iterate through all documents
-    for doc in docs.values():
-        # Calculate dot product between query and document embeddings
+    for filename, doc in docs.items():
+        # Calculate cosine similarity between the query and the document embeddings
         similarities = np.dot(doc["embeddings"], query_embedding) / (
             np.linalg.norm(doc["embeddings"]) * np.linalg.norm(query_embedding)
         )
         # Add chunks and similarities to the all_chunks list
-        all_chunks.extend(list(zip(doc["chunks"], similarities)))
+        all_chunks.extend([(filename, chunk, sim) for chunk, sim in zip(doc["chunks"], similarities)])
 
     # Sort all chunks by similarity
-    all_chunks.sort(key=lambda x: x[1], reverse=True)
+    all_chunks.sort(key=lambda x: x[2], reverse=True)
 
-    return "CONTEXT:\n\n" + "\n\n".join(chunk for chunk, _ in all_chunks[:k])
+    return "CONTEXT:\n\n" + "\n\n".join(f"{filename}: {chunk}" for filename, chunk, _ in all_chunks[:k])
 
 
 def init():
@@ -158,7 +161,7 @@ def init():
     It will load or calculate the embeddings
     """
     global docs  # pylint: disable=W0603
-    embeddings_file = Path("embeddings.pickle")
+    embeddings_file = Path("embeddings-es.pickle")
     if embeddings_file.exists():
         with open(embeddings_file, "rb") as embeddings_pickle:
             docs = pickle.load(embeddings_pickle)
@@ -167,7 +170,8 @@
             converted_doc = convert(filename)
             chunks = generate_chunks(converted_doc, chunk_size)
             embeddings = model.encode(chunks)
-            docs[filename] = {
+            # get the filename and slugify it
+            docs[filename.rsplit("/", 1)[-1].lower().replace(" ", "-")] = {
                 "chunks": chunks,
                 "embeddings": embeddings,
            }
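
Taken together, these hunks implement the retrieval path the README describes: embed the query with the new model, score every cached chunk against it, and return the top-k chunks prefixed with their slugified source filename. Below is a self-contained sketch of that flow with a toy encoder standing in for the real model; note it normalizes per chunk (axis=1) to get a true cosine score, whereas the hunk above takes the norm of the whole embeddings matrix:

```python
import numpy as np


def encode(texts):
    """Toy stand-in for model.encode(); the Space uses jinaai/jina-embeddings-v2-base-es."""
    rng = np.random.default_rng(0)
    return rng.normal(size=8) if isinstance(texts, str) else rng.normal(size=(len(texts), 8))


# docs maps a slugified filename to its chunks and their precomputed embeddings,
# mirroring the structure that init() builds and caches in embeddings-es.pickle.
docs = {
    "codigo-de-trabajo.pdf": {
        "chunks": ["Art. 1 ...", "Art. 2 ..."],
        "embeddings": encode(["Art. 1 ...", "Art. 2 ..."]),
    }
}


def predict(query: str, k: int = 5) -> str:
    query_embedding = encode(query)
    all_chunks = []
    for filename, doc in docs.items():
        # cosine similarity between the query and every chunk of this document
        similarities = np.dot(doc["embeddings"], query_embedding) / (
            np.linalg.norm(doc["embeddings"], axis=1) * np.linalg.norm(query_embedding)
        )
        all_chunks.extend((filename, chunk, sim) for chunk, sim in zip(doc["chunks"], similarities))
    # highest similarity first, keep the top k across all documents
    all_chunks.sort(key=lambda x: x[2], reverse=True)
    return "CONTEXT:\n\n" + "\n\n".join(f"{filename}: {chunk}" for filename, chunk, _ in all_chunks[:k])


print(predict("horas extra", k=1))
```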