|
|
|
import faiss |
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
from sentence_transformers import SentenceTransformer |
|
import zipfile |
|
import os |
|
import logging |
|
|
|
# NOTE(review): with the root logger at ERROR, the logging.info() calls in this
# file are filtered out — confirm that silence is intended.
logging.basicConfig(level=logging.ERROR)

# Unpack the bundled artifacts into the current working directory.
# Presumably the archive holds the FAISS index and its lookup CSV — verify.
with zipfile.ZipFile("files.zip") as z:  # mode defaults to "r"
    z.extractall()

# Hub pull-request number; the model is loaded from that PR's revision ref.
pr_number = 14

logging.info("Loading embedding model")
model = SentenceTransformer(
    "intfloat/multilingual-e5-small",
    backend="openvino",
    revision=f"refs/pr/{pr_number}",
)
|
|
|
class FaissIndex:
    """Semantic lookup over a prebuilt FAISS index and its CSV lookup table.

    ``extract_docs`` maps the positions returned by the index straight onto
    rows of the lookup table, so row ``i`` of the CSV must describe vector
    ``i`` of the index.
    """

    def __init__(
        self,
        model: "SentenceTransformer",
        data_path: str = "faiss.lookup.csv",
        index_path: str = "faiss.index",
    ):
        """Load the lookup table and the FAISS index from disk.

        Args:
            model: Embedding model used to encode search queries.
            data_path: CSV file that provides an ``answer`` column.
            index_path: Serialized FAISS index file.
        """
        self.model = model
        self.df = pd.read_csv(data_path)
        self.index = faiss.read_index(index_path)

    def search_vectors(self, vectors, k: int = 5):
        """Run a raw nearest-neighbour search for already-embedded queries.

        Args:
            vectors: A single embedding or a batch of embeddings; a 1-D input
                is treated as a batch of one.
            k: Number of neighbours requested per query vector.

        Returns:
            The ``(distances, indices)`` pair produced by the index.
        """
        vectors = np.asarray(vectors, dtype="float32")
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)
        return self.index.search(vectors, k)

    def extract_docs(self, indices, k=None) -> list:
        """Turn the hits of the first query into a list of distinct answers.

        Args:
            indices: 2-D result of an index search; only row 0 is used.
            k: Unused; accepted so existing callers keep working.

        Returns:
            Answer texts as strings, in hit order, with duplicates removed.
        """
        # FAISS pads missing results with -1; ``iloc[-1]`` would silently
        # return the last row of the table, so drop those slots first.
        row_ids = [int(i) for i in indices[0] if i >= 0]
        answers = self.df.iloc[row_ids]["answer"].values
        # dict.fromkeys removes duplicates while keeping the ranking order.
        return [f"{answer}" for answer in dict.fromkeys(answers)]

    def search(self, query: str, k: int = 5) -> list:
        """Embed ``query`` and return the answers of its nearest neighbours.

        Args:
            query: Free-text search string.
            k: Number of neighbours to request from the index.

        Returns:
            Up to ``k`` distinct answer strings, best match first.
        """
        # E5 models are trained with a "query: " prefix on search text.
        embedding = self.model.encode(["query: " + query])
        _, indices = self.search_vectors(embedding, k)
        return self.extract_docs(indices, k)
|
|
|
|
|
# Build the shared index once at module load; query_faiss_index reuses it.
logging.info("Loading FAISS index")
index = FaissIndex(model=model)
|
|
|
|
|
def query_faiss_index(søketekst):
    """Search the FAQ index and format the best-matching answers for display.

    Args:
        søketekst (str): The search text to query the FAISS index with.

    Returns:
        str: Up to 2 distinct answers separated by double newlines, or an
        empty string when the input is missing or shorter than 3 characters
        once surrounding whitespace is ignored.
    """
    # Too little text to search on: return an empty result instead of
    # querying the index (also covers None, which has no len()).
    if not søketekst or len(søketekst.strip()) < 3:
        return ""

    results = index.search(søketekst, k=2)
    return "\n\n".join(results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Search page: a heading, a one-line query box and a large results box.
with gr.Blocks() as blocks:
    gr.Markdown("## SIKT-FAQ")

    with gr.Row():
        box_search = gr.Textbox(
            label="Søk etter informasjon i SIKT",
            lines=1,
            placeholder="Innlogging i FEIDE...",
            interactive=True,
        )

    with gr.Row():
        box_output = gr.Textbox(
            label="Søkeresultater",
            type="text",
            lines=20,
        )

    # Re-run the lookup each time the query text changes.
    box_search.change(
        fn=query_faiss_index,
        inputs=box_search,
        outputs=box_output,
        max_batch_size=1,
    )

# Start the web server for the page defined above.
blocks.launch()
|
|
|
|
|
|
|
|