File size: 3,366 Bytes
acc49a5
8ecdadf
 
 
 
 
 
acc49a5
 
 
 
8ecdadf
 
acc49a5
 
8ecdadf
 
 
acc49a5
8ecdadf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
acc49a5
8ecdadf
 
 
 
 
 
 
 
 
 
 
 
 
 
acc49a5
8ecdadf
 
 
 
 
 
 
 
 
 
acc49a5
8ecdadf
 
 
 
acc49a5
 
8ecdadf
 
 
 
 
 
 
 
84fe5bb
8ecdadf
 
 
 
acc49a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ecdadf
acc49a5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import zipfile
import os
import logging

# Configure the root logger at ERROR level.
# NOTE(review): with this level the logging.info() calls in this file are
# presumably filtered out -- confirm that this is intended.
logging.basicConfig(level=logging.ERROR)


# Unpack the bundled archive into the current working directory on every
# start. The existence check below was left disabled by the author:
# if not os.path.exists("faiss.index"):
with zipfile.ZipFile("files.zip") as z:
    z.extractall()

# The embedding model is pinned to a pull-request revision of the model repo.
pr_number = 14
logging.info("Loading embedding model")
model = SentenceTransformer(
    "intfloat/multilingual-e5-small",
    revision="refs/pr/" + str(pr_number),
    backend="openvino",
)

class FaissIndex:
    """Semantic search over a FAISS index paired with a CSV lookup table.

    The FAISS index yields row positions; those positions are resolved
    against the lookup table, which must provide an ``answer`` column.
    """

    def __init__(
        self,
        model: "SentenceTransformer",
        data_path: str = "faiss.lookup.csv",
        index_path="faiss.index",
    ):
        """Load the lookup table and the FAISS index from disk.

        Args:
            model: Embedding model whose ``encode`` method turns a list of
                strings into vectors.
            data_path: CSV file loaded with ``pandas.read_csv``.
            index_path: Index file loaded with ``faiss.read_index``.
        """
        self.model = model
        self.df = pd.read_csv(data_path)
        self.index = faiss.read_index(index_path)

    def extract_docs(self, indices, k):
        """Return the distinct answers for the rows selected by *indices*.

        Args:
            indices: 2-D array-like of row positions as returned by the
                index search; only the first row is used.
            k: Number of neighbours that was requested. Unused here; kept
                so existing callers keep working.

        Returns:
            list[str]: Answers in ranking order, each answer at most once.
        """
        # FAISS pads the label array with -1 when it finds fewer than the
        # requested number of neighbours. Passing -1 to ``iloc`` would select
        # the LAST row of the table and surface it as a bogus hit.
        rows = [int(i) for i in indices[0] if i >= 0]
        answers = self.df.iloc[rows]["answer"].tolist()

        # dict.fromkeys keeps the first occurrence of each answer and
        # preserves ranking order; different questions may share an answer.
        return [str(answer) for answer in dict.fromkeys(answers)]

    def search(self, query: str, k: int = 5):
        """Encode *query* and return the answers of its nearest neighbours.

        Args:
            query: Free-text search string.
            k: Number of neighbours to request from the index.

        Returns:
            list[str]: Up to *k* distinct answers, best match first.
        """
        # The "query: " prefix is prepended before encoding (presumably the
        # input format the embedding model expects -- confirm against the
        # model card).
        enc = self.model.encode(["query: " + query])
        emb = np.ascontiguousarray(enc, dtype="float32").reshape(1, -1)
        _, indices = self.index.search(emb, k)
        return self.extract_docs(indices, k)


# Build the module-level search index; the lookup CSV and index file are
# read from FaissIndex's default paths.
logging.info("Loading FAISS index")
index = FaissIndex(model=model)


def query_faiss_index(søketekst):
    """Query the FAISS index with the given search text.

    Args:
        søketekst (str): The search text. ``None`` and texts shorter than
            three characters are not searched.

    Returns:
        str | None: The results of ``index.search`` with ``k=2``, joined by
        double newlines, or ``None`` when the input is too short to search.
    """
    # Skip the search while the user has typed fewer than three characters.
    if søketekst is None or len(søketekst) < 3:
        return None

    results = index.search(søketekst, k=2)
    return "\n\n".join(results)


# Create the Gradio interface
# iface = gr.Interface(
#     fn=query_faiss_index,
#     inputs=gr.Textbox(lines=2, placeholder="Søk etter info i SIKT", interactive=True, min_width="30vw"),
#     outputs=gr.Textbox(label="Søkeresultater", type="text", lines=20, min_width="70vw"),
#     title="SIKT-FAQ",
#     description="Semantisk søk i SIKT med Openvino.",
#     live=True
# )

# Search page: a heading, a one-line query box, and a 20-line results box.
with gr.Blocks() as blocks:
    gr.Markdown("## SIKT-FAQ")

    with gr.Row():
        box_search = gr.Textbox(
            label="Søk etter informasjon i SIKT",
            lines=1,
            placeholder="Innlogging i FEIDE...",
            interactive=True,
        )

    with gr.Row():
        box_output = gr.Textbox(
            label="Søkeresultater",
            type="text",
            lines=20,
        )

    # Run the search whenever the content of the query box changes and
    # write the returned text into the results box.
    box_search.change(
        fn=query_faiss_index,
        inputs=box_search,
        outputs=box_output,
        max_batch_size=1,
    )


blocks.launch()
# Launch the Gradio app
# if __name__ == "__main__":
#     iface.launch()