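# Gradio app for semantic search over a SIKT FAQ dataset, using a
# multilingual E5 embedding model (OpenVINO backend) and a FAISS index.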
import faiss
import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import zipfile
import os
import logging
logging.basicConfig(level=logging.ERROR)
# if not os.path.exists("faiss.index"):
with zipfile.ZipFile("files.zip", "r") as z:
    z.extractall()
pr_number = 14
logging.info("Loading embedding model")
model = SentenceTransformer(
    "intfloat/multilingual-e5-small",
    revision=f"refs/pr/{pr_number}",
    backend="openvino",
)
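# Note: the multilingual-e5 models expect a "query: " prefix on search queries
# (added in FaissIndex.search below) and a "passage: " prefix on indexed
# documents. The pinned revision above is assumed to provide the OpenVINO
# export required by backend="openvino".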
class FaissIndex:
    def __init__(
        self,
        model: SentenceTransformer,
        data_path: str = "faiss.lookup.csv",
        index_path: str = "faiss.index",
    ):
        self.model = model
        self.df = pd.read_csv(data_path)
        self.index = faiss.read_index(index_path)
    def search_embedding(self, query_embedding, k=5):
        # Low-level search: query the FAISS index with a pre-computed embedding.
        query_embedding = np.array(query_embedding).astype("float32")
        distances, indices = self.index.search(query_embedding, k)
        return distances, indices
    def extract_docs(self, indices, k):
        indices = list(indices[0])
        lookup = self.df.iloc[indices]
        questions = lookup["query"].values
        answers = lookup["answer"].values
        pairs = list(zip(questions, answers))
        # Keep only unique answers; the questions may be duplicates.
        filtered_pairs = []
        seen = set()
        for pair in pairs:
            if pair[1] not in seen:
                seen.add(pair[1])
                filtered_pairs.append(pair)
        # Return only the answer text; the source question is not shown.
        formatted_pairs = []
        for pair in filtered_pairs:
            formatted_pairs.append(f"{pair[1]}")
        return formatted_pairs
    def search(self, query: str, k: int = 5):
        # E5 models expect search queries to be prefixed with "query: ".
        query = "query: " + query
        enc = self.model.encode([query])
        emb = np.array(enc).astype("float32").reshape(1, -1)
        _, indices = self.search_embedding(emb, k)
        return self.extract_docs(indices, k)
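# A minimal sketch (not used by the app) of how an index like "faiss.index"
# could be rebuilt from the lookup CSV with the same embedding model. The
# "passage: " prefix and the flat L2 index type are assumptions; only the
# file names and column names are taken from the code above.
def build_index(
    model: SentenceTransformer,
    data_path: str = "faiss.lookup.csv",
    index_path: str = "faiss.index",
):
    df = pd.read_csv(data_path)
    # E5 models expect indexed documents to carry a "passage: " prefix.
    passages = ["passage: " + str(answer) for answer in df["answer"]]
    embeddings = model.encode(passages, convert_to_numpy=True).astype("float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)
    return index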
logging.info("Loading FAISS index")
index = FaissIndex(model)
def query_faiss_index(søketekst):
    """
    Queries the FAISS index with the provided search text and returns the top results.

    Args:
        søketekst (str): The search text to query the FAISS index.

    Returns:
        str: A string containing the top search results, separated by double newlines.
    """
    # Skip very short queries to avoid noisy matches while the user is typing.
    if len(søketekst) < 3:
        return ""
    results = index.search(søketekst, k=2)
    return "\n\n".join(results)
# Create the Gradio interface
# iface = gr.Interface(
# fn=query_faiss_index,
# inputs=gr.Textbox(lines=2, placeholder="Søk etter info i SIKT", interactive=True, min_width="30vw"),
# outputs=gr.Textbox(label="Søkeresultater", type="text", lines=20, min_width="70vw"),
# title="SIKT-FAQ",
# description="Semantisk søk i SIKT med Openvino.",
# live=True
# )
with gr.Blocks() as blocks:
    gr.Markdown("## SIKT-FAQ")
    with gr.Row():
        box_search = gr.Textbox(
            label="Søk etter informasjon i SIKT",
            lines=1,
            placeholder="Innlogging i FEIDE...",
            interactive=True,
        )
    with gr.Row():
        box_output = gr.Textbox(label="Søkeresultater", type="text", lines=20)
    box_search.change(fn=query_faiss_index, inputs=box_search, outputs=box_output, max_batch_size=1)

blocks.launch()
# Launch the Gradio app
# if __name__ == "__main__":
# iface.launch()