Codegeass321 committed 16d6410 (Parent: 187abb4)

Indent fixes

Files changed (3)
  1. __pycache__/utils.cpython-312.pyc +0 -0
  2. requirements.txt +1 -1
  3. utils.py +54 -54
__pycache__/utils.cpython-312.pyc CHANGED
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-gradio==5.25.1
+gradio
 google-genai
 langchain
 langchain-community
utils.py CHANGED
@@ -62,80 +62,80 @@ def load_documents_gradio(uploaded_files):
 
 
 def split_documents(docs, chunk_size=500, chunk_overlap=100):
-    """Splits documents into smaller chunks using RecursiveCharacterTextSplitter."""
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=chunk_size, chunk_overlap=chunk_overlap
-    )
-    return splitter.split_documents(docs)
 
 
 def build_vectorstore(docs, embedding_model_name="all-MiniLM-L6-v2"):
-    """Builds a FAISS vector store from the document chunks."""
-    texts = [doc.page_content.strip() for doc in docs if doc.page_content.strip()]
-    if not texts:
-        raise ValueError("No valid text found in the documents.")
 
-    print(f"No. of Chunks: {len(texts)}")
 
-    model = SentenceTransformer(embedding_model_name)
-    embeddings = model.encode(texts)
-    print(embeddings.shape)
 
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(np.array(embeddings).astype("float32"))
 
-    return {
-        "index": index,
-        "texts": texts,
-        "embedding_model": model,
-        "embeddings": embeddings,
-        "chunks": len(texts),
-    }
 
 
 def retrieve_context(query, store, k=6):
-    """Retrieves the top-k context chunks most similar to the query."""
-    query_vec = store["embedding_model"].encode([query])
-    k = min(k, len(store["texts"]))
-    distances, indices = store["index"].search(query_vec, k)
-    return [store["texts"][i] for i in indices[0]]
 
 
 def retrieve_context_approx(query, store, k=6):
-    """Retrieves context chunks using approximate nearest neighbor search."""
-    ncells = 50
-    D = store["index"].d
-    index = faiss.IndexFlatL2(D)
-    nindex = faiss.IndexIVFFlat(index, D, ncells)
-    nindex.nprobe = 10
 
-    if not nindex.is_trained:
-        nindex.train(np.array(store["embeddings"]).astype("float32"))
 
-    nindex.add(np.array(store["embeddings"]).astype("float32"))
-    query_vec = store["embedding_model"].encode([query])
-    k = min(k, len(store["texts"]))
-    _, indices = nindex.search(np.array(query_vec).astype("float32"), k)
-    return [store["texts"][i] for i in indices[0]]
 
 
 def build_prompt(context_chunks, query):
-    """Builds the prompt for the Gemini API using context and query."""
-    context = "\n".join(context_chunks)
-    return f"""You are a highly knowledgeable and helpful assistant. Use the following context to generate a **detailed and step-by-step** answer to the user's question. Include explanations, examples, and reasoning wherever helpful.
 
-Context:
-{context}
 
-Question: {query}
-Answer:"""
 
 
 def ask_gemini(prompt, client):
-    """Calls the Gemini API with the given prompt and returns the response."""
-    response = client.models.generate_content(
-        model="gemini-2.0-flash",  # Or your preferred model
-        contents=[prompt],
-        config=types.GenerateContentConfig(max_output_tokens=2048, temperature=0.5, seed=42),
-    )
-    return response.text
 
 
 
 def split_documents(docs, chunk_size=500, chunk_overlap=100):
+    """Splits documents into smaller chunks using RecursiveCharacterTextSplitter."""
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return splitter.split_documents(docs)
 
 
 def build_vectorstore(docs, embedding_model_name="all-MiniLM-L6-v2"):
+    """Builds a FAISS vector store from the document chunks."""
+    texts = [doc.page_content.strip() for doc in docs if doc.page_content.strip()]
+    if not texts:
+        raise ValueError("No valid text found in the documents.")
 
+    print(f"No. of Chunks: {len(texts)}")
 
+    model = SentenceTransformer(embedding_model_name)
+    embeddings = model.encode(texts)
+    print(embeddings.shape)
 
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(np.array(embeddings).astype("float32"))
 
+    return {
+        "index": index,
+        "texts": texts,
+        "embedding_model": model,
+        "embeddings": embeddings,
+        "chunks": len(texts)
+    }
 
 
 def retrieve_context(query, store, k=6):
+    """Retrieves the top-k context chunks most similar to the query."""
+    query_vec = store["embedding_model"].encode([query])
+    k = min(k, len(store["texts"]))
+    distances, indices = store["index"].search(query_vec, k)
+    return [store["texts"][i] for i in indices[0]]
 
 
 def retrieve_context_approx(query, store, k=6):
+    """Retrieves context chunks using approximate nearest neighbor search."""
+    ncells = 50
+    D = store["index"].d
+    index = faiss.IndexFlatL2(D)
+    nindex = faiss.IndexIVFFlat(index, D, ncells)
+    nindex.nprobe = 10
 
+    if not nindex.is_trained:
+        nindex.train(np.array(store["embeddings"]).astype("float32"))
 
+    nindex.add(np.array(store["embeddings"]).astype("float32"))
+    query_vec = store["embedding_model"].encode([query])
+    k = min(k, len(store["texts"]))
+    _, indices = nindex.search(np.array(query_vec).astype("float32"), k)
+    return [store["texts"][i] for i in indices[0]]
 
 
 def build_prompt(context_chunks, query):
+    """Builds the prompt for the Gemini API using context and query."""
+    context = "\n".join(context_chunks)
+    return f"""You are a highly knowledgeable and helpful assistant. Use the following context to generate a **detailed and step-by-step** answer to the user's question. Include explanations, examples, and reasoning wherever helpful.
 
+Context:
+{context}
 
+Question: {query}
+Answer:"""
 
 
 def ask_gemini(prompt, client):
+    """Calls the Gemini API with the given prompt and returns the response."""
+    response = client.models.generate_content(
+        model="gemini-2.0-flash",  # Or your preferred model
+        contents=[prompt],
+        config=types.GenerateContentConfig(max_output_tokens=2048, temperature=0.5, seed=42),
+    )
+    return response.text
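
For reference, the sketch below shows one way the utils.py helpers touched by this commit could be chained into a single retrieval-augmented query. It is an assumed wiring, not part of the commit: the direct Document construction and "notes.txt" are placeholders standing in for the app's load_documents_gradio upload path, the query string is illustrative, and genai.Client() is assumed to read the API key from the environment.

# Assumed end-to-end wiring of the utils.py helpers (illustrative sketch only).
from google import genai
from langchain_core.documents import Document

from utils import (
    split_documents,
    build_vectorstore,
    retrieve_context,
    build_prompt,
    ask_gemini,
)

# Assumes GEMINI_API_KEY (or GOOGLE_API_KEY) is set in the environment.
client = genai.Client()

# Placeholder input; the app itself builds docs via load_documents_gradio().
docs = [Document(page_content=open("notes.txt", encoding="utf-8").read())]

chunks = split_documents(docs, chunk_size=500, chunk_overlap=100)
store = build_vectorstore(chunks)  # FAISS IndexFlatL2 over SentenceTransformer embeddings

query = "Summarize the main points of the document."
context = retrieve_context(query, store, k=6)  # exact L2 search over the stored chunks
prompt = build_prompt(context, query)
print(ask_gemini(prompt, client))  # gemini-2.0-flash completion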