Commit 16d6410
Parent(s): 187abb4

Indent fixes
Files changed:
- __pycache__/utils.cpython-312.pyc +0 -0
- requirements.txt +1 -1
- utils.py +54 -54
__pycache__/utils.cpython-312.pyc
CHANGED
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-gradio
+gradio
 google-genai
 langchain
 langchain-community
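As a point of reference, a hedged sketch of what each of these four pins is presumably imported for in this Space; the module paths and the PyPDFLoader choice are assumptions, since the commit does not show the app's import block. Note that utils.py below also relies on SentenceTransformer, faiss and numpy, which are not listed in this four-line file.

import gradio as gr                                                   # gradio: the Space UI
from google import genai                                              # google-genai: Gemini client
from google.genai import types                                        # config objects used in ask_gemini
from langchain.text_splitter import RecursiveCharacterTextSplitter    # langchain: chunking in split_documents
from langchain_community.document_loaders import PyPDFLoader          # langchain-community: assumed document loader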
utils.py
CHANGED
@@ -62,80 +62,80 @@ def load_documents_gradio(uploaded_files):
 
 
 def split_documents(docs, chunk_size=500, chunk_overlap=100):
+    """Splits documents into smaller chunks using RecursiveCharacterTextSplitter."""
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return splitter.split_documents(docs)
 
 
 def build_vectorstore(docs, embedding_model_name="all-MiniLM-L6-v2"):
+    """Builds a FAISS vector store from the document chunks."""
+    texts = [doc.page_content.strip() for doc in docs if doc.page_content.strip()]
+    if not texts:
+        raise ValueError("No valid text found in the documents.")
 
+    print(f"No. of Chunks: {len(texts)}")
 
+    model = SentenceTransformer(embedding_model_name)
+    embeddings = model.encode(texts)
+    print(embeddings.shape)
 
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(np.array(embeddings).astype("float32"))
 
+    return {
+        "index": index,
+        "texts": texts,
+        "embedding_model": model,
+        "embeddings": embeddings,
+        "chunks": len(texts)
+    }
 
 
 def retrieve_context(query, store, k=6):
+    """Retrieves the top-k context chunks most similar to the query."""
+    query_vec = store["embedding_model"].encode([query])
+    k = min(k, len(store["texts"]))
+    distances, indices = store["index"].search(query_vec, k)
+    return [store["texts"][i] for i in indices[0]]
 
 
 def retrieve_context_approx(query, store, k=6):
+    """Retrieves context chunks using approximate nearest neighbor search."""
+    ncells = 50
+    D = store["index"].d
+    index = faiss.IndexFlatL2(D)
+    nindex = faiss.IndexIVFFlat(index, D, ncells)
+    nindex.nprobe = 10
 
+    if not nindex.is_trained:
+        nindex.train(np.array(store["embeddings"]).astype("float32"))
 
+    nindex.add(np.array(store["embeddings"]).astype("float32"))
+    query_vec = store["embedding_model"].encode([query])
+    k = min(k, len(store["texts"]))
+    _, indices = nindex.search(np.array(query_vec).astype("float32"), k)
+    return [store["texts"][i] for i in indices[0]]
 
 
 def build_prompt(context_chunks, query):
+    """Builds the prompt for the Gemini API using context and query."""
+    context = "\n".join(context_chunks)
+    return f"""You are a highly knowledgeable and helpful assistant. Use the following context to generate a **detailed and step-by-step** answer to the user's question. Include explanations, examples, and reasoning wherever helpful.
 
+Context:
+{context}
 
+Question: {query}
+Answer:"""
 
 
 def ask_gemini(prompt, client):
+    """Calls the Gemini API with the given prompt and returns the response."""
+    response = client.models.generate_content(
+        model="gemini-2.0-flash",  # Or your preferred model
+        contents=[prompt],
+        config=types.GenerateContentConfig(max_output_tokens=2048, temperature=0.5, seed=42),
+    )
+    return response.text
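For reference, a minimal usage sketch of how the helpers touched by this commit chain together. This is an assumption about the surrounding app, not part of the diff: the sample Document stands in for the output of load_documents_gradio(uploaded_files), and GEMINI_API_KEY is a hypothetical environment variable holding the Gemini API key.

import os

from google import genai
from langchain.schema import Document

from utils import (
    split_documents,
    build_vectorstore,
    retrieve_context,
    build_prompt,
    ask_gemini,
)

# Stand-in for the documents normally returned by load_documents_gradio(uploaded_files).
docs = [Document(page_content="FAISS is a library for efficient similarity search over dense vectors.")]

chunks = split_documents(docs, chunk_size=500, chunk_overlap=100)
store = build_vectorstore(chunks)                  # exact IndexFlatL2 store
question = "What is FAISS used for?"
context = retrieve_context(question, store, k=3)   # k is capped at the number of stored chunks
prompt = build_prompt(context, question)

client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])  # assumed env var
print(ask_gemini(prompt, client))

The approximate variant, retrieve_context_approx, trains a 50-cell IVF index over the stored embeddings, so it only becomes usable once the store holds at least 50 chunks (FAISS needs at least as many training vectors as cells); the exact retrieve_context path above works for any store size.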