import json

import faiss
import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, T5ForConditionalGeneration


class RAGPipeline:
    """Retrieval-augmented generation pipeline.

    Retrieval is hybrid: dense search over a FAISS L2 index plus sparse
    TF-IDF search; the union of candidates is reranked with a cross-encoder
    before being fed as context to a FLAN-T5 generator.
    """

    def __init__(
        self,
        json_path: str = "calebdata.json",
        embedder_model: str = "infly/inf-retriever-v1-1.5b",
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
        generator_model: str = "google/flan-t5-base",
    ):
        """Load the corpus and all models, then build both search indexes.

        Args:
            json_path: path to a JSON file containing a list of chunk dicts,
                each expected to carry a "text" key.
            embedder_model: SentenceTransformer model for dense embeddings.
            reranker_model: CrossEncoder model for candidate reranking.
            generator_model: seq2seq model used to generate the final answer.
        """
        self.chunks = self._load_chunks(json_path)
        # Deduplicate texts while PRESERVING corpus order. The original
        # list(set(...)) shuffled texts nondeterministically, so FAISS ids
        # pointed at different texts on every run. Chunks without a "text"
        # key (or with a falsy one) collapse to "".
        self.texts = list(
            dict.fromkeys(chunk.get("text") or "" for chunk in self.chunks)
        )
        self.embedder = SentenceTransformer(embedder_model)
        self.reranker = CrossEncoder(reranker_model)
        self.tokenizer = AutoTokenizer.from_pretrained(generator_model)
        self.generator = T5ForConditionalGeneration.from_pretrained(generator_model)
        self.index = self._build_faiss_index()
        self.tfidf_vectorizer, self.tfidf_matrix = self._build_tfidf()

    def _load_chunks(self, path):
        """Return the list of chunk dicts stored as JSON at *path*."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _build_faiss_index(self):
        """Embed every text and index the vectors in a flat L2 FAISS index."""
        embeddings = self.embedder.encode(self.texts, convert_to_numpy=True)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        return index

    def _build_tfidf(self):
        """Fit a TF-IDF vectorizer on the corpus; return (vectorizer, matrix)."""
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(self.texts)
        return vectorizer, matrix

    def _rerank(self, query, docs):
        """Order *docs* by cross-encoder relevance to *query*, best first."""
        scores = self.reranker.predict([(query, doc) for doc in docs])
        # Sort on the score only; the original sorted (score, doc) tuples,
        # which fell back to lexicographic doc comparison on tied scores.
        ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in ranked]

    def hybrid_search(self, query, top_k=3):
        """Return up to *top_k* texts relevant to *query*.

        Candidates come from dense (FAISS) and sparse (TF-IDF) retrieval;
        the deduplicated union is reranked by the cross-encoder.
        """
        # Never ask FAISS for more neighbors than there are texts.
        k = min(top_k, len(self.texts))
        if k == 0:
            return []

        query_embedding = self.embedder.encode([query])[0]
        _, faiss_indices = self.index.search(np.array([query_embedding]), k)
        # FAISS pads with -1 when fewer than k neighbors exist; skip those
        # rather than letting them index self.texts[-1] by accident.
        faiss_results = [self.texts[i] for i in faiss_indices[0] if i != -1]

        query_tfidf = self.tfidf_vectorizer.transform([query])
        tfidf_scores = np.asarray(
            query_tfidf.dot(self.tfidf_matrix.T).toarray()
        ).flatten()
        tfidf_indices = tfidf_scores.argsort()[-k:][::-1]
        tfidf_results = [self.texts[i] for i in tfidf_indices]

        # Order-preserving union (dict.fromkeys) keeps candidate order
        # deterministic before reranking.
        combined = list(dict.fromkeys(faiss_results + tfidf_results))
        return self._rerank(query, combined)[:top_k]

    def generate_answer(self, query, top_k=3):
        """Generate an answer to *query* grounded in the retrieved context."""
        context = "\n".join(self.hybrid_search(query, top_k))
        prompt = f"Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        output = self.generator.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=300,
            do_sample=True,
            top_p=0.95,
            top_k=50,
            # T5 defines a real pad token; the original passed eos_token_id,
            # a GPT-2-style workaround that is wrong for T5.
            pad_token_id=self.tokenizer.pad_token_id,
        )
        return self.tokenizer.decode(output[0], skip_special_tokens=True)


if __name__ == "__main__":
    # Example usage:
    rag = RAGPipeline()
    print(rag.generate_answer("Who is the President of NACOS?"))