|
|
|
import re |
|
import faiss |
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
def clean_text(text):
    """Remove disallowed symbols from *text* and normalize its whitespace.

    Characters that are kept: Arabic letters in the range U+0621..U+064A,
    ASCII letters and digits, the punctuation ``. , ! ? ؛ : - ( )`` and
    whitespace. Everything else is deleted. Afterwards every run of
    whitespace (spaces, tabs, newlines) is collapsed into a single space
    and leading/trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    # Strip symbols first: deleting a symbol that sat between two spaces
    # would otherwise leave a double space behind ("a @ b" -> "a  b").
    # Whitespace is kept here (\s) so word boundaries survive until the
    # collapsing pass below.
    text = re.sub(r"[^ء-يa-zA-Z0-9.,!?؛:\-\(\)\s]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
|
|
|
|
|
def create_faiss_index(texts, model=None):
    """Create a FAISS index from the provided list of texts.

    Every text is passed through ``clean_text``, embedded, and the
    embeddings are added to a flat L2 index (``faiss.IndexFlatL2``).

    Args:
        texts: Iterable of strings to index.
        model: Optional encoder object exposing ``encode(list_of_str)``.
            When omitted, ``sentence-transformers/all-MiniLM-L6-v2`` is
            loaded on every call; pass a preloaded model to avoid that
            cost. Queries must later be embedded with the same model.

    Returns:
        A ``(index, cleaned_texts)`` tuple. Vectors are added in the order
        of ``cleaned_texts``, which is what ``search_faiss`` relies on to
        map result positions back to texts.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    cleaned = [clean_text(t) for t in texts]

    # Fail early with a clear message: without any text there is no
    # embedding matrix to read the vector dimension from.
    if not cleaned:
        raise ValueError("texts must contain at least one item to build an index")

    if model is None:
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = model.encode(cleaned)

    # The index dimension is taken from the embedding width.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return index, cleaned
|
|
|
|
|
def search_faiss(faiss_index, stored_texts, query, top_k=3, model=None):
    """Search FAISS for the most relevant texts.

    Args:
        faiss_index: Index object exposing ``search(vectors, k)`` that
            returns ``(distances, indices)``.
        stored_texts: Sequence of texts whose positions correspond to the
            ids stored in ``faiss_index``.
        query: The query string to embed and look up.
        top_k: Maximum number of results to return.
        model: Optional encoder object exposing ``encode(list_of_str)``.
            When omitted, ``sentence-transformers/all-MiniLM-L6-v2`` is
            loaded on every call; pass a preloaded model to avoid that
            cost. It must be the model the index was built with.

    Returns:
        A list of at most ``top_k`` entries from ``stored_texts``, in the
        order returned by the index.
    """
    if model is None:
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    query_embedding = model.encode([query])

    _distances, indices = faiss_index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with the label -1 when fewer than
    # top_k vectors are available. A plain "i < len" check lets -1 through,
    # and Python would then silently return the LAST stored text.
    limit = len(stored_texts)
    return [stored_texts[int(i)] for i in indices[0] if 0 <= i < limit]