Codegeass321 committed 16d6410 (Parent: 187abb4)

Indent fixes

Files changed (3)
  1. __pycache__/utils.cpython-312.pyc +0 -0
  2. requirements.txt +1 -1
  3. utils.py +54 -54
__pycache__/utils.cpython-312.pyc CHANGED
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-gradio==5.25.1
+gradio
 google-genai
 langchain
 langchain-community
utils.py CHANGED
@@ -62,80 +62,80 @@ def load_documents_gradio(uploaded_files):
 
 
 def split_documents(docs, chunk_size=500, chunk_overlap=100):
-    """Splits documents into smaller chunks using RecursiveCharacterTextSplitter."""
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=chunk_size, chunk_overlap=chunk_overlap
-    )
-    return splitter.split_documents(docs)
 
 
 def build_vectorstore(docs, embedding_model_name="all-MiniLM-L6-v2"):
-    """Builds a FAISS vector store from the document chunks."""
-    texts = [doc.page_content.strip() for doc in docs if doc.page_content.strip()]
-    if not texts:
-        raise ValueError("No valid text found in the documents.")
 
-    print(f"No. of Chunks: {len(texts)}")
 
-    model = SentenceTransformer(embedding_model_name)
-    embeddings = model.encode(texts)
-    print(embeddings.shape)
 
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(np.array(embeddings).astype("float32"))
 
-    return {
-        "index": index,
-        "texts": texts,
-        "embedding_model": model,
-        "embeddings": embeddings,
-        "chunks": len(texts),
-    }
 
 
 def retrieve_context(query, store, k=6):
-    """Retrieves the top-k context chunks most similar to the query."""
-    query_vec = store["embedding_model"].encode([query])
-    k = min(k, len(store["texts"]))
-    distances, indices = store["index"].search(query_vec, k)
-    return [store["texts"][i] for i in indices[0]]
 
 
 def retrieve_context_approx(query, store, k=6):
-    """Retrieves context chunks using approximate nearest neighbor search."""
-    ncells = 50
-    D = store["index"].d
-    index = faiss.IndexFlatL2(D)
-    nindex = faiss.IndexIVFFlat(index, D, ncells)
-    nindex.nprobe = 10
 
-    if not nindex.is_trained:
-        nindex.train(np.array(store["embeddings"]).astype("float32"))
 
-    nindex.add(np.array(store["embeddings"]).astype("float32"))
-    query_vec = store["embedding_model"].encode([query])
-    k = min(k, len(store["texts"]))
-    _, indices = nindex.search(np.array(query_vec).astype("float32"), k)
-    return [store["texts"][i] for i in indices[0]]
 
 
 def build_prompt(context_chunks, query):
-    """Builds the prompt for the Gemini API using context and query."""
-    context = "\n".join(context_chunks)
-    return f"""You are a highly knowledgeable and helpful assistant. Use the following context to generate a **detailed and step-by-step** answer to the user's question. Include explanations, examples, and reasoning wherever helpful.
 
-Context:
-{context}
 
-Question: {query}
-Answer:"""
 
 
 def ask_gemini(prompt, client):
-    """Calls the Gemini API with the given prompt and returns the response."""
-    response = client.models.generate_content(
-        model="gemini-2.0-flash",  # Or your preferred model
-        contents=[prompt],
-        config=types.GenerateContentConfig(max_output_tokens=2048, temperature=0.5, seed=42),
-    )
-    return response.text
 
 
 
 def split_documents(docs, chunk_size=500, chunk_overlap=100):
+    """Splits documents into smaller chunks using RecursiveCharacterTextSplitter."""
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return splitter.split_documents(docs)
 
 
 def build_vectorstore(docs, embedding_model_name="all-MiniLM-L6-v2"):
+    """Builds a FAISS vector store from the document chunks."""
+    texts = [doc.page_content.strip() for doc in docs if doc.page_content.strip()]
+    if not texts:
+        raise ValueError("No valid text found in the documents.")
 
+    print(f"No. of Chunks: {len(texts)}")
 
+    model = SentenceTransformer(embedding_model_name)
+    embeddings = model.encode(texts)
+    print(embeddings.shape)
 
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(np.array(embeddings).astype("float32"))
 
+    return {
+        "index": index,
+        "texts": texts,
+        "embedding_model": model,
+        "embeddings": embeddings,
+        "chunks": len(texts)
+    }
 
 
 def retrieve_context(query, store, k=6):
+    """Retrieves the top-k context chunks most similar to the query."""
+    query_vec = store["embedding_model"].encode([query])
+    k = min(k, len(store["texts"]))
+    distances, indices = store["index"].search(query_vec, k)
+    return [store["texts"][i] for i in indices[0]]
 
 
 def retrieve_context_approx(query, store, k=6):
+    """Retrieves context chunks using approximate nearest neighbor search."""
+    ncells = 50
+    D = store["index"].d
+    index = faiss.IndexFlatL2(D)
+    nindex = faiss.IndexIVFFlat(index, D, ncells)
+    nindex.nprobe = 10
 
+    if not nindex.is_trained:
+        nindex.train(np.array(store["embeddings"]).astype("float32"))
 
+    nindex.add(np.array(store["embeddings"]).astype("float32"))
+    query_vec = store["embedding_model"].encode([query])
+    k = min(k, len(store["texts"]))
+    _, indices = nindex.search(np.array(query_vec).astype("float32"), k)
+    return [store["texts"][i] for i in indices[0]]
 
 
 def build_prompt(context_chunks, query):
+    """Builds the prompt for the Gemini API using context and query."""
+    context = "\n".join(context_chunks)
+    return f"""You are a highly knowledgeable and helpful assistant. Use the following context to generate a **detailed and step-by-step** answer to the user's question. Include explanations, examples, and reasoning wherever helpful.
 
+Context:
+{context}
 
+Question: {query}
+Answer:"""
 
 
 def ask_gemini(prompt, client):
+    """Calls the Gemini API with the given prompt and returns the response."""
+    response = client.models.generate_content(
+        model="gemini-2.0-flash",  # Or your preferred model
+        contents=[prompt],
+        config=types.GenerateContentConfig(max_output_tokens=2048, temperature=0.5, seed=42),
+    )
+    return response.text
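
For reference, the sketch below shows one way the utils.py helpers touched by this commit could be chained into a single retrieval-augmented query. It is an assumed wiring, not part of the commit: the direct Document construction and "notes.txt" are placeholders standing in for the app's load_documents_gradio upload path, the query string is illustrative, and genai.Client() is assumed to read the API key from the environment.

# Assumed end-to-end wiring of the utils.py helpers (illustrative sketch only).
from google import genai
from langchain_core.documents import Document

from utils import (
    split_documents,
    build_vectorstore,
    retrieve_context,
    build_prompt,
    ask_gemini,
)

# Assumes GEMINI_API_KEY (or GOOGLE_API_KEY) is set in the environment.
client = genai.Client()

# Placeholder input; the app itself builds docs via load_documents_gradio().
docs = [Document(page_content=open("notes.txt", encoding="utf-8").read())]

chunks = split_documents(docs, chunk_size=500, chunk_overlap=100)
store = build_vectorstore(chunks)  # FAISS IndexFlatL2 over SentenceTransformer embeddings

query = "Summarize the main points of the document."
context = retrieve_context(query, store, k=6)  # exact L2 search over the stored chunks
prompt = build_prompt(context, query)
print(ask_gemini(prompt, client))  # gemini-2.0-flash completion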