import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, T5ForConditionalGeneration
class RAGPipeline:
    def __init__(
        self,
        json_path: str = "calebdata.json",
        embedder_model: str = "infly/inf-retriever-v1-1.5b",
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
        generator_model: str = "google/flan-t5-base"
    ):
        self.chunks = self._load_chunks(json_path)
        # Deduplicate passages while preserving order (dict.fromkeys keeps
        # insertion order) and drop empty texts; a plain set() would make the
        # index ordering non-deterministic across runs.
        texts = (chunk.get("text") or "" for chunk in self.chunks)
        self.texts = [t for t in dict.fromkeys(texts) if t]
        self.embedder = SentenceTransformer(embedder_model)
        self.reranker = CrossEncoder(reranker_model)
        self.tokenizer = AutoTokenizer.from_pretrained(generator_model)
        self.generator = T5ForConditionalGeneration.from_pretrained(generator_model)
        self.index = self._build_faiss_index()
        self.tfidf_vectorizer, self.tfidf_matrix = self._build_tfidf()
    def _load_chunks(self, path):
        with open(path, "r") as f:
            return json.load(f)
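
    # A minimal sketch of the JSON layout _load_chunks assumes, inferred from
    # the "text" lookups in __init__; the real calebdata.json is not shown
    # here, so the field values below are hypothetical:
    #
    # [
    #     {"text": "First passage about NACOS ..."},
    #     {"text": "Second passage ..."}
    # ]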
    def _build_faiss_index(self):
        # IndexFlatL2 does exact (brute-force) L2 search, which is fine for a
        # small corpus and needs no training step before add().
        embeddings = self.embedder.encode(self.texts, convert_to_numpy=True)
        embeddings = embeddings.astype(np.float32)  # FAISS requires float32
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        return index
    def _build_tfidf(self):
        # TfidfVectorizer L2-normalizes rows by default, so dot products
        # against this matrix in hybrid_search are cosine similarities.
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(self.texts)
        return vectorizer, matrix
    def _rerank(self, query, docs):
        # Score each (query, doc) pair with the cross-encoder, then sort by
        # score alone so tied scores never fall back to comparing the docs.
        pairs = [(query, doc) for doc in docs]
        scores = self.reranker.predict(pairs)
        ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in ranked]
    def hybrid_search(self, query, top_k=3):
        # Dense retrieval: nearest neighbours in embedding space via FAISS.
        query_embedding = self.embedder.encode([query]).astype(np.float32)
        _, faiss_indices = self.index.search(query_embedding, top_k)
        faiss_results = [self.texts[i] for i in faiss_indices[0]]
        # Sparse retrieval: cosine similarity between TF-IDF vectors.
        query_tfidf = self.tfidf_vectorizer.transform([query])
        tfidf_scores = query_tfidf.dot(self.tfidf_matrix.T).toarray().ravel()
        tfidf_indices = tfidf_scores.argsort()[-top_k:][::-1]
        tfidf_results = [self.texts[i] for i in tfidf_indices]
        # Merge both candidate pools (order-preserving dedup), then let the
        # cross-encoder pick the best top_k.
        combined = list(dict.fromkeys(faiss_results + tfidf_results))
        return self._rerank(query, combined)[:top_k]
    def generate_answer(self, query, top_k=3):
        context = "\n".join(self.hybrid_search(query, top_k))
        prompt = (
            "Answer the question based on the context.\n\n"
            f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
        )
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        output = self.generator.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=300,
            do_sample=True,
            top_p=0.95,
            top_k=50,
            # T5 has a dedicated <pad> token; using eos_token_id here (a
            # GPT-style idiom) is incorrect for this model family.
            pad_token_id=self.tokenizer.pad_token_id
        )
        return self.tokenizer.decode(output[0], skip_special_tokens=True)
if __name__ == "__main__":
    # Example usage:
    rag = RAGPipeline()
    print(rag.generate_answer("Who is the President of NACOS?"))
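
    # Inspecting the retrieved passages directly is a quick sanity check;
    # hybrid_search is the same retrieval step generate_answer uses internally.
    for passage in rag.hybrid_search("Who is the President of NACOS?"):
        print("-", passage[:100])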