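"""Retrieval-augmented generation (RAG) pipeline.

Loads text chunks from a JSON file, indexes them both densely (FAISS over
sentence-transformer embeddings) and sparsely (TF-IDF), merges and reranks
the candidates with a cross-encoder, and answers questions with FLAN-T5.
"""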
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, T5ForConditionalGeneration


class RAGPipeline:
    def __init__(
        self,
        json_path: str = "calebdata.json",
        embedder_model: str = "infly/inf-retriever-v1-1.5b",
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
        generator_model: str = "google/flan-t5-base",
    ):
        self.chunks = self._load_chunks(json_path)
        # Deduplicate chunk texts while preserving order; set() would
        # shuffle them nondeterministically between runs.
        self.texts = list(dict.fromkeys(chunk["text"] or "" for chunk in self.chunks))
        self.embedder = SentenceTransformer(embedder_model)
        self.reranker = CrossEncoder(reranker_model)
        self.tokenizer = AutoTokenizer.from_pretrained(generator_model)
        self.generator = T5ForConditionalGeneration.from_pretrained(generator_model)
        self.index = self._build_faiss_index()
        self.tfidf_vectorizer, self.tfidf_matrix = self._build_tfidf()

    def _load_chunks(self, path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _build_faiss_index(self):
        embeddings = self.embedder.encode(self.texts, convert_to_numpy=True)
        # FAISS expects float32; SentenceTransformer usually returns it
        # already, but cast defensively.
        embeddings = embeddings.astype(np.float32)
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
        index.add(embeddings)
        return index

    def _build_tfidf(self):
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(self.texts)
        return vectorizer, matrix

    def _rerank(self, query, docs):
        pairs = [(query, doc) for doc in docs]
        scores = self.reranker.predict(pairs)
        # Sort by cross-encoder score only; sorting raw (score, doc) tuples
        # would fall back to comparing the doc strings on tied scores.
        ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in ranked]

    def hybrid_search(self, query, top_k=3):
        # Dense retrieval: nearest neighbours in the FAISS index.
        query_embedding = self.embedder.encode([query], convert_to_numpy=True).astype(np.float32)
        _, faiss_indices = self.index.search(query_embedding, top_k)
        faiss_results = [self.texts[i] for i in faiss_indices[0]]
        # Sparse retrieval: TF-IDF dot-product scores against all chunks.
        query_tfidf = self.tfidf_vectorizer.transform([query])
        tfidf_scores = query_tfidf.dot(self.tfidf_matrix.T).toarray().flatten()
        tfidf_indices = tfidf_scores.argsort()[-top_k:][::-1]
        tfidf_results = [self.texts[i] for i in tfidf_indices]
        # Merge both candidate lists (order-preserving dedup) and rerank.
        combined = list(dict.fromkeys(faiss_results + tfidf_results))
        return self._rerank(query, combined)[:top_k]

    def generate_answer(self, query, top_k=3):
        context = "\n".join(self.hybrid_search(query, top_k))
        prompt = f"Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        output = self.generator.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            # Cap the generated answer length; T5 defines its own pad token,
            # so overriding pad_token_id with eos_token_id is unnecessary.
            max_new_tokens=300,
            do_sample=True,
            top_p=0.95,
            top_k=50,
        )
        return self.tokenizer.decode(output[0], skip_special_tokens=True)


if __name__ == "__main__":
    # Example usage:
    rag = RAGPipeline()
    print(rag.generate_answer("Who is the President of NACOS?"))
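    # Optional sanity check (a sketch; assumes calebdata.json sits next to
    # this file): inspect the passages hybrid_search retrieves for a query
    # before they are handed to the generator.
    for passage in rag.hybrid_search("Who is the President of NACOS?"):
        print(passage)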