# Import necessary modules
import re
import faiss
from sentence_transformers import SentenceTransformer
# Clean text function
def clean_text(text):
"""
Cleans text by removing unnecessary symbols and whitespace.
"""
text = re.sub(r"\s+", " ", text) # Replace multiple spaces with one
text = re.sub(r"[^ء-يa-zA-Z0-9.,!?؛:\-\(\)\n ]+", "", text) # Keep Arabic, English, and punctuation
return text.strip()
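
# Illustrative example (hypothetical input, not from the original file):
#   clean_text("  Hello,\n\n  world!  ")  ->  "Hello, world!"
# Runs of whitespace collapse to a single space and stray symbols are dropped.
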
# Create FAISS index
def create_faiss_index(texts):
"""
Create a FAISS index from the provided list of texts.
"""
# Clean the text before indexing
texts = [clean_text(t) for t in texts]
# Load pre-trained SentenceTransformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(texts)
# Create the FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
return index, texts
# Search the FAISS index
def search_faiss(faiss_index, stored_texts, query, top_k=3):
"""
Search FAISS for the most relevant texts.
"""
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
query_embedding = model.encode([query])
distances, indices = faiss_index.search(query_embedding, top_k)
results = [stored_texts[i] for i in indices[0] if i < len(stored_texts)]
return results |
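

# Example usage: a minimal sketch showing how the two functions fit together.
# The sample texts and query below are hypothetical and only for illustration.
if __name__ == "__main__":
    sample_texts = [
        "FAISS makes similarity search over dense vectors fast.",
        "SentenceTransformer models map sentences to fixed-size embeddings.",
        "Cleaning text before indexing removes stray symbols and extra spaces.",
    ]
    index, stored = create_faiss_index(sample_texts)
    for match in search_faiss(index, stored, "How can I search embeddings quickly?", top_k=2):
        print(match)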